In [1]:
from sklearn.cluster import KMeans
import numpy as np 
import pandas as pd
from sklearn.metrics import silhouette_score, pairwise_distances
import matplotlib.pyplot as plt
import ast


In [2]:
# Locations of the three input CSVs (absolute Windows paths).
# NOTE(review): these paths only resolve on a machine with this exact
# directory layout - consider a configurable data directory.
cnn_path = r"D:\Personal\Data Science\datathon2022-challenge\data\cnn_samples.csv"
federated_path = r"D:\Personal\Data Science\datathon2022-challenge\data\federal_samples.csv"
challenge_path = r"D:\Personal\Data Science\datathon2022-challenge\data\challenge.csv"

cnn = pd.read_csv(cnn_path)
federated = pd.read_csv(federated_path)
challenge = pd.read_csv(challenge_path)

In [3]:
# The embedding columns are parsed with ast.literal_eval, i.e. each cell holds
# a string-encoded Python literal; the parsed values are collected into lists.
# Note the column names differ: 'embedding' (cnn) vs 'embeddings' (federated).
cnn_embeddings = [ast.literal_eval(raw) for raw in cnn['embedding']]
federated_embeddings = [ast.literal_eval(raw) for raw in federated['embeddings']]

In [None]:
# Combine both corpora into one list: CNN rows first, then federated rows.
# Later cells rely on this order when slicing labels back per source.
all_embeddings = cnn_embeddings + federated_embeddings

# Show only the size; echoing the full list would flood the cell output.
len(all_embeddings)

In [5]:
# Array form of the combined embeddings; this is what KMeans is fitted on.
data = np.array(all_embeddings)

# Candidate cluster counts (k = 2..20) and the per-k metric accumulators
# that the sweep fills in.
cluster_range = range(2, 21)
inertia, silhouette_scores = [], []

In [6]:
# Start from empty accumulators on every run. Without this, re-executing the
# cell appends a second sweep to the lists, so they no longer line up with
# cluster_range (the plots then fail on a length mismatch).
inertia = []
silhouette_scores = []

# Fit one KMeans model per candidate k and record two quality metrics.
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=300, random_state=42)
    kmeans.fit(data)
    # Sum of squared distances of samples to their nearest cluster centre.
    inertia.append(kmeans.inertia_)
    # Silhouette score of this clustering over all rows of `data`.
    silhouette_scores.append(silhouette_score(data, kmeans.labels_))

In [None]:
# Elbow plot: inertia for each candidate k, drawn in the left slot of a
# 1x2 grid on an 8x4 inch figure.
fig = plt.figure(figsize=(8, 4))
elbow_ax = fig.add_subplot(1, 2, 1)
# Use cluster_range rather than a second hard-coded range(2, 21), so the
# x values always match the k values that were actually swept.
elbow_ax.plot(cluster_range, inertia, marker='o')
elbow_ax.set_title('Elbow Method')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Inertia')

In [None]:
# Silhouette score for each candidate k, drawn in the right slot of a 1x2 grid.
# NOTE(review): this cell does not create its own figure. If the backend closes
# figures between cells, this panel will not share a figure with the elbow
# plot - confirm, and consider merging the two plotting cells.
silhouette_ax = plt.subplot(1, 2, 2)
silhouette_ax.plot(cluster_range, silhouette_scores, marker='o', color='r')
silhouette_ax.set_title('Silhouette Score')
silhouette_ax.set_xlabel('Number of clusters')
silhouette_ax.set_ylabel('Silhouette Score')

In [None]:
# Pick the k whose silhouette score is highest (the first such k on ties),
# then refit a KMeans model with that k using the same settings as the sweep.
best_position = silhouette_scores.index(max(silhouette_scores))
optimal_k = cluster_range[best_position]

kmeans_optimal = KMeans(
    n_clusters=optimal_k,
    n_init=10,
    max_iter=300,
    random_state=42,
)
kmeans_optimal.fit(data)

In [None]:
# Sanity check: dimensions of the array that was clustered.
data.shape

In [None]:
print(f"Optimal number of clusters: {optimal_k}")

# Cluster assignment of every row of `data`, taken from the final fit.
cluster_labels = kmeans_optimal.labels_
print("Cluster Labels:", cluster_labels)

In [12]:
# Euclidean distance from every row of `data` to the centroid of the cluster
# it was assigned to.
centroids = kmeans_optimal.cluster_centers_

# centroids[cluster_labels] selects each row's own centroid, so all distances
# come from a single vectorised norm instead of a per-row Python loop.
distance = np.linalg.norm(data - centroids[cluster_labels], axis=1)

In [13]:
# all_embeddings was built as CNN rows followed by federated rows, so the
# first len(cnn_embeddings) entries belong to `cnn` and the rest to `federated`.
n_cnn = len(cnn_embeddings)

cnn['cluster'], cnn['distance'] = cluster_labels[:n_cnn], distance[:n_cnn]
federated['cluster'], federated['distance'] = cluster_labels[n_cnn:], distance[n_cnn:]

In [14]:
# Copies of both frames without their embedding columns; these are the
# versions written to disk in the next cell.
temp_cnn = cnn.drop(columns=['embedding'])
temp_federated = federated.drop(columns=['embeddings'])

In [15]:
# Output locations for the labelled frames (absolute Windows paths).
cnn_result_path = r"D:\Personal\Data Science\datathon2022-challenge\result\cnn.csv"
federated_result_path = r"D:\Personal\Data Science\datathon2022-challenge\result\federal.csv"

# Write both frames without the DataFrame index.
temp_cnn.to_csv(cnn_result_path, index=False)
temp_federated.to_csv(federated_result_path, index=False)

In [17]:
# Parse the challenge embeddings the same way as the training ones, assign each
# row to a cluster of the fitted model, and save the labelled frame.
challenge_embeddings = [ast.literal_eval(raw) for raw in challenge['embeddings']]
challenge_cluster_label = kmeans_optimal.predict(challenge_embeddings)

challenge['cluster'] = challenge_cluster_label
challenge.to_csv(r"D:\Personal\Data Science\datathon2022-challenge\result\challenge.csv", index=False)

In [33]:
#Educated Guess
## 0 - Wealth, welfare, religion
## 1 - Crime
## 2 - Weather
## 3 - American Politics
## 4 - SEC and Law
## 5 - Safety and Employment
## 6 - Innovation, Social Media and Digital Economy
## 7 - Foreign Affairs and sports
## 8 - Disease

In [19]:
from dotenv import load_dotenv
import os
from openai import OpenAI as OpenAIClient
# Call load_dotenv() before reading the environment, then take the OpenAI API
# key from there rather than hard-coding it in the notebook.
load_dotenv()
openai_key = os.environ.get('OPENAI_API_KEY')

# Shared client used by the section-labelling helper below.
openai_client = OpenAIClient(api_key=openai_key)

In [28]:
def news_section(response, model="gpt-4o-mini", temperature=0.4):
    """Ask the chat model which newspaper section an article belongs to.

    Parameters
    ----------
    response : str
        Text of the news article, sent as the user message.
    model : str, optional
        Chat model to query. Defaults to "gpt-4o-mini".
    temperature : float, optional
        Sampling temperature passed to the API. Defaults to 0.4.

    Returns
    -------
    The content of the first choice's message, which the system prompt asks
    to be only the section name.
    """
    completion = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "you will be provided with a news article. You need to guess to which section in the newspaper it would go. Please reply with only the section name"},
            {"role": "user", "content": response},
        ],
        temperature=temperature,
    )
    return completion.choices[0].message.content

In [25]:
# Stack the text, cluster and distance columns of both corpora, ordered by
# cluster and then by distance to the centroid (closest first).
shared_columns = ['text', 'cluster', 'distance']
joined_data = pd.concat(
    [cnn[shared_columns], federated[shared_columns]]
).sort_values(by=['cluster', 'distance'])

# Keep the 10 rows nearest to the centroid in each cluster.
filtered_data = joined_data.groupby('cluster').head(10)

In [38]:
# Ask the model for a section name for every selected article and collect
# [text, cluster, section_name] records. The records are built directly
# instead of writing into the row yielded by iterrows(), which is a temporary
# copy, so that write had no effect on filtered_data.
result = [
    [text, cluster, news_section(text)]
    for text, cluster in zip(filtered_data['text'], filtered_data['cluster'])
]

result_df = pd.DataFrame(result, columns=['text', 'cluster', 'section_name'])

# Save the labelled sample without the DataFrame index.
result_df.to_csv(r"D:\Personal\Data Science\datathon2022-challenge\result\result.csv", index=False)

In [55]:
# Count how many sampled articles received each section name within each
# cluster, listed from the highest cluster id down and, within a cluster,
# from the most frequent section down.
(
    result_df
    .groupby(['cluster', 'section_name'])
    .count()
    .sort_values(['cluster', 'text'], ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,text
cluster,section_name,Unnamed: 2_level_1
8,Health,6
8,Entertainment,1
8,Human Interest,1
8,National News,1
8,Sports,1
7,Sports,8
7,Arts & Culture,1
7,Entertainment,1
6,Technology,7
6,Entertainment/Technology,1


In [None]:
# Fair enough, I won