In [None]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the BBC training articles; this cell reads the 'Text' and 'Category' columns.
df = pd.read_csv('/kaggle/input/learn-ai-bbc/BBC News Train.csv')

# TF-IDF over unigrams and bigrams: English stop words are removed, terms seen in
# fewer than 5 documents are dropped, term frequency is log-scaled (sublinear_tf)
# and each document vector is L2-normalised.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)
# Densify the sparse TF-IDF matrix before handing it to PCA.
features = tfidf.fit_transform(df['Text']).toarray()

# --- 2D projection ---
# random_state pins the projection when scikit-learn picks a randomized SVD
# solver, so the figure is the same on every Restart & Run All.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(features)

fig_2d = px.scatter(
    x=X_pca[:, 0], y=X_pca[:, 1],
    color=df['Category'],  # one colour per labelled news category
    labels={'x': 'Principal Component 1', 'y': 'Principal Component 2'},
    title="PCA Projection: High Dimensionality to 2D"
)
fig_2d.show()


# --- 3D projection ---
# `pca` and `X_pca` are deliberately rebound here; after this cell they hold the
# 3-component model and its projection.
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(features)

# 3D Plot
fig_3d = px.scatter_3d(
    x=X_pca[:, 0], y=X_pca[:, 1], z=X_pca[:, 2],
    color=df['Category'],
    labels={'x': 'Principal Component 1', 'y': 'Principal Component 2', 'z': 'Principal Component 3'},
    title="PCA Projection: High Dimensionality to 3D"
)
fig_3d.show()


In [None]:
import pandas as pd

# Re-read the training CSV into `df`.
df = pd.read_csv('/kaggle/input/learn-ai-bbc/BBC News Train.csv')

# Count of null entries in each column.
missing_values = df.isna().sum()
print(missing_values)

# The same per-column counts expressed as a percentage of all rows.
row_count = len(df)
missing_percentage = (missing_values / row_count) * 100
print(missing_percentage)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Read the BBC training articles; `train_df` is reused by the cells that follow.
train_df = pd.read_csv('/kaggle/input/learn-ai-bbc/BBC News Train.csv')

# Article bodies that are vectorised below.
texts = train_df['Text']

# TF-IDF with English stop words removed and the vocabulary capped at 1000 terms.
tfidf = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)

# Learn the vocabulary and build the document-term matrix in a single call.
X = tfidf.fit_transform(texts)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# TF-IDF matrix that the clustering and similarity cells below operate on.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Tune max_features for dimension reduction
X_tfidf = vectorizer.fit_transform(train_df['Text'])

# Elbow method: fit K-Means for k = 1..10 and record the inertia
# (within-cluster sum of squared distances) of each fit.
inertia = []
cluster_range = range(1, 11)

for num_clusters in cluster_range:
    # n_init is passed explicitly: its default changed from 10 to 'auto' across
    # scikit-learn releases, so relying on it gives version-dependent results
    # (and a FutureWarning on some versions).
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    kmeans.fit(X_tfidf)
    inertia.append(kmeans.inertia_)

plt.plot(cluster_range, inertia, marker='o')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Cluster count picked by hand after inspecting the elbow plot.
# NOTE(review): the write-up mentions 5 labelled categories -- confirm 3 is intended.
optimal_clusters = 3
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
train_df['Cluster'] = kmeans.fit_predict(X_tfidf)

print(train_df[['Text', 'Cluster']].head())



In [None]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

# Convert the sparse TF-IDF matrix to a dense array once and reuse it for both
# the clustering and the linkage computation below.
X_dense = X_tfidf.toarray()

# `metric` replaces the `affinity` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (passing it there raises TypeError).
agg_clust = AgglomerativeClustering(n_clusters=3, metric='cosine', linkage='average')
# This overwrites the 'Cluster' column written by the K-Means cell.
train_df['Cluster'] = agg_clust.fit_predict(X_dense)

# NOTE(review): the dendrogram is built with Ward linkage, whereas the labels
# above come from average linkage on cosine distance, so the tree does not
# depict the clustering stored in 'Cluster'. Confirm which one is intended.
sch.dendrogram(sch.linkage(X_dense, method='ward'))
plt.title('Dendrogram for Agglomerative Clustering')
plt.show()

print(train_df[['Text', 'Cluster']].head())

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Row position (in train_df / X_tfidf) of the article to find neighbours for.
document_idx = 150
# How many neighbouring articles to report.
n_similar = 5

input_document_tfidf = X_tfidf[document_idx]

# Cosine similarity of the query article against every article; shape (1, n_docs).
similarities = cosine_similarity(input_document_tfidf, X_tfidf)

# Rank all articles from most to least similar, then drop the query article by
# index. Slicing it off by position is unreliable: an exact duplicate article
# also scores 1.0 and may sort ahead of or behind the query itself.
ranked_idx = np.argsort(similarities[0])[::-1]
similar_docs_idx = ranked_idx[ranked_idx != document_idx][:n_similar]

print("Most similar articles to the selected document:")
for idx in similar_docs_idx:
    print(f"\nArticle (Cluster {train_df.iloc[idx]['Cluster']}): {train_df.iloc[idx]['Text']}\n")


This dataset was tricky to work with, as the multidimensional nature of full-text documents made it harder to gauge similarities between documents in clusters precisely. The 3D PCA graph at least gave me an initial guide, showing that the categories were indeed distinct, although they unfortunately all converged at the center of both the 2D and 3D graphs, which I believe is what made delineating the data so difficult. I tried to use the elbow method to fix the difficulties I was having with k-means clustering, but the graph showed absolutely no clear plateau at which to cut off. It was then that I realized a hierarchical clustering model would be much more efficient with the type of data that I was working with, and the dendrogram gave me 6 clear categories, which differed from the 5 initial categories but was certainly a very clear separation. This model doesn't do what the initial challenge set out to do, since this kind of unsupervised approach doesn't lend itself to a task that is better handled with supervised learning, so I instead applied it to find similar articles, which utilises the nature of the model well.