# Import Libraries


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import tkinter as tk
from tkinter.scrolledtext import ScrolledText
%pip install datasets
from datasets import load_dataset

Note: you may need to restart the kernel to use updated packages.


In [10]:
# data = {
#     'Title': [
#         'Uefa Opens Proceedings against Barcelona, Juventus in Super League Row',
#         'Amazon Blames Inflation as It Increases Cost of Prime Subscription',
#         'Nigeria’s Parliament Passes Amended Electoral Bill amid Controversy',
#         'Nigeria: Lagos Governor Tests Positive for Covid-19',
#         'South Africa Calls For Calm as Electoral Reform Talks Stall'
#     ],
#     'Excerpt': [
#         'Uefa has opened disciplinary proceedings against Barcelona, Juventus and Real Madrid over their involvement in the proposed breakaway Super League.',
#         'The increases are steeper than the 17 percent jump it implemented last year.',
#         "Nigeria's Senate on Tuesday passed the harmonised Electoral Bill amid controversy.",
#         'The Lagos State Governor, Mr. Babajide Sanwo-Olu, has tested positive for COVID-19.',
#         'South Africa has raised concerns about the deterioration of the political situation in Lesotho and called for calm.'
#     ],
#     'Category': ['sports', 'business', 'politics', 'health', 'politics']
# }

# Loading Dataset


In [11]:
# Fetch the "okite97/news-data" dataset with `load_dataset` and turn its
# 'train' split into a pandas DataFrame. Later cells read the 'Title',
# 'Excerpt' and 'Category' columns from `news_df`.
dataset = load_dataset("okite97/news-data")
news_df = pd.DataFrame(dataset['train'])
# Optional filter (currently disabled): keep only the 'health', 'sports' and
# 'business' categories.
# news_df = news_df[news_df['Category'].isin(['health', 'sports', 'business'])]

# Data Cleaning


In [12]:
# Keep only the articles that have both a 'Title' and an 'Excerpt'; a row
# missing either one is discarded because the two fields are concatenated
# into a single text later on.
news_df = news_df[
    news_df[['Title', 'Excerpt']].notna().all(axis=1)
]
print(news_df.head())

                                               Title  \
0  Uefa Opens Proceedings against Barcelona, Juve...   
1  Amazon Blames Inflation as It Increases Cost o...   
2  Nigeria’s Parliament Passes Amended Electoral ...   
3  Nigeria: Lagos Governor Tests Positive for Cov...   
4  South Africa Calls For Calm as Electoral Refor...   

                                             Excerpt  Category  
0  Uefa has opened disciplinary proceedings again...    sports  
1  The increases are steeper than the 17 percent ...  business  
2  Nigeria's Senate on Tuesday passed the harmoni...  politics  
3  The Lagos State Governor, Mr. Babajide Sanwo-O...    health  
4  South Africa has raised concerns about the det...  politics  


# Vectorize the Text


In [13]:
# Turn every article into a TF-IDF vector. English stop words are removed,
# and so are terms that occur in more than half of the documents (max_df=0.5)
# or in fewer than two documents (min_df=2).
text_vectorizer = TfidfVectorizer(
    stop_words='english',
    use_idf=True,
    min_df=2,
    max_df=0.5,
)
# The document text is the title and the excerpt joined by a single space.
tfidf_news = text_vectorizer.fit_transform(
    news_df['Title'] + ' ' + news_df['Excerpt']
)


# Cluster the Data:


In [14]:
# Group the TF-IDF vectors into a fixed number of K-Means clusters.
# NOTE(review): the category filter in the loading cell is commented out and
# the sample output of `news_df.head()` already shows four categories
# (sports, business, politics, health), so three clusters cannot map
# one-to-one onto categories - confirm that 3 is still the intended value.
n_clusters = 3
news_clusters = KMeans(
    random_state=42,  # fixed seed so centroid initialisation is repeatable
    n_clusters=n_clusters,
)
news_clusters.fit(tfidf_news)



# Predict Cluster:


In [15]:
# Predict cluster
def predict_news_cluster(news_text):
    """Assign a piece of news text to a K-Means cluster and name that cluster.

    The text is vectorized with the already fitted ``text_vectorizer`` and
    handed to ``news_clusters.predict``. The cluster is then described by the
    most frequent ``Category`` among the training articles that K-Means placed
    in that same cluster.

    Args:
        news_text: Raw text of the article to classify.

    Returns:
        A tuple ``(predicted_cluster, predicted_category)``.
        ``predicted_category`` is ``None`` when the cluster contains no
        training article with a non-missing category.
    """
    input_vector = text_vectorizer.transform([news_text])
    predicted_cluster = news_clusters.predict(input_vector)[0]

    # `labels_` holds one cluster id per row of the matrix the model was
    # fitted on, in row order. That matrix was built from `news_df`, so a
    # boolean mask over `labels_` lines up positionally with its rows.
    # (Indexing `news_df.iloc[predicted_cluster]` would instead return the
    # category of the article that happens to sit at row <cluster id>.)
    in_cluster = news_clusters.labels_ == predicted_cluster
    cluster_categories = news_df.loc[in_cluster, 'Category']

    # `mode()` ignores missing values and returns the most common value(s)
    # in sorted order; take the first so ties resolve deterministically.
    most_common = cluster_categories.mode()
    predicted_category = most_common.iloc[0] if not most_common.empty else None
    return predicted_cluster, predicted_category


# Predict the Cluster of a News Article (GUI)


In [16]:
# Main application window.
window = tk.Tk()
window.title("News Clustering")
window.minsize(600, 400)

# Give the grid cell that holds the text box all of the spare space. Without
# a non-zero weight the row/column never grows, so the "nsew" sticky below
# has nothing to stretch into when the window is resized.
window.rowconfigure(0, weight=1)
window.columnconfigure(0, weight=1)

# Scrollable text area where the user pastes the article to classify.
news_textbox = ScrolledText(window)
news_textbox.grid(row=0, column=0, padx=5, pady=5, sticky="nsew")

# Result labels; their text is rewritten after every prediction.
predicted_cluster_label = tk.Label(window, text="Predicted Cluster:")
predicted_cluster_label.grid(row=1, column=0, padx=5, pady=5, sticky="w")

predicted_category_label = tk.Label(window, text="Predicted Category:")
predicted_category_label.grid(row=2, column=0, padx=5, pady=5, sticky="w")

def predict_and_display():
    """Classify the text currently in the text box and show the result.

    Reads the content of ``news_textbox``, passes it to
    ``predict_news_cluster`` and writes the returned cluster and category to
    the two result labels and to standard output. When the box is empty or
    holds only whitespace, no prediction is made and the labels are reset to
    their initial text.
    """
    # "end-1c" stops one character before the end, which leaves out the
    # newline Tk always keeps at the end of a Text widget's content.
    news_text = news_textbox.get("1.0", "end-1c").strip()

    if not news_text:
        # An empty document becomes an all-zero vector, and any cluster
        # reported for it would be meaningless, so skip the prediction.
        predicted_cluster_label.config(text="Predicted Cluster:")
        predicted_category_label.config(text="Predicted Category:")
        return

    predicted_cluster, predicted_category = predict_news_cluster(news_text)
    predicted_cluster_label.config(text=f"Predicted Cluster: {predicted_cluster}")
    predicted_category_label.config(text=f"Predicted Category: {predicted_category}")

    # Echo the result in the notebook output as well.
    print("Predicted Cluster:", predicted_cluster)
    print("Predicted Category:", predicted_category)
      
# Button that runs `predict_and_display` on the text currently in the box.
predict_button = tk.Button(window, text="Cluster News", command=predict_and_display)
predict_button.grid(row=3, column=0, padx=5, pady=5)

# Start the Tk event loop; this call blocks the cell until the window is closed.
window.mainloop()

Predicted Cluster: 2
Predicted Category: politics
