# HW03: Distance and Topic Model

Remember that these homework work as a completion grade. **You can skip one section without losing credit.**

## Load and Pre-process Text

In [31]:
!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv

--2024-03-14 20:49:33--  https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29470338 (28M) [text/plain]
Saving to: ‘train.csv.1’


2024-03-14 20:49:34 (277 MB/s) - ‘train.csv.1’ saved [29470338/29470338]



In [32]:
#Import the AG news dataset (same as hw01)
#Download them from here

import pandas as pd
import nltk
df = pd.read_csv('train.csv')

df.columns = ["label", "title", "lead"]
label_map = {1:"world", 2:"sport", 3:"business", 4:"sci/tech"}
def replace_label(x):
	return label_map[x]
df["label"] = df["label"].apply(replace_label)
df["text"] = df["title"] + " " + df["lead"]
df.head()

Unnamed: 0,label,title,lead,text
0,business,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
1,business,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
2,business,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
3,business,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."
4,business,"Stocks End Up, But Near Year Lows (Reuters)",Reuters - Stocks ended slightly higher on Frid...,"Stocks End Up, But Near Year Lows (Reuters) Re..."


In [33]:
import spacy
dfs = df.sample(200)
nlp = spacy.load('en_core_web_sm')
from sklearn.feature_extraction.text import TfidfVectorizer

##TODO pre-process text as you did in HW02

def clean(x):
    # lemmatize and lowercase without stopwords, punctuation and numbers
    res = []
    for sent in x.sents:
      res.append([w.lemma_.lower() for w in sent if not w.is_stop and not w.is_punct and not w.is_digit])
    return res

def flatten(x):
  return [z for y in x for z in y]

def join_tokens(x):
  return " ".join(x)

dfs['doc'] = dfs['text'].apply(nlp)
dfs['unjoined_tokens'] = dfs['doc'].apply(clean).apply(flatten)
dfs['tokens'] = dfs['unjoined_tokens'].apply(join_tokens)

print(dfs.iloc[0]['doc'])
print(dfs.iloc[0]['tokens'])

Sudanese opposition, government start peace talks Sudanese opposition groups and a government delegation on Friday held the first full day of peace talks in Cairo with both sides optimistic about a deal, sources close to the negotiations told Egypt #39;s MENA news agency.
sudanese opposition government start peace talk sudanese opposition group government delegation friday hold day peace talk cairo side optimistic deal source close negotiation tell egypt 39;s mena news agency


In [34]:
##TODO vectorize the pre-processed text using TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=0.01,
                        max_df=0.9,
                        max_features=1000,
                        stop_words='english',
                        use_idf=True,
                        ngram_range=(1,2))

tokens = dfs['tokens']

tfidf_matrix = tfidf.fit_transform(tokens)

## Cosine Similarity and PCA

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
##TODO compute the cosine similarity for the first 200 snippets and for the first snippet, show the three most similar snippets and their respective cosine similarity scores
cosine_sim = cosine_similarity(tfidf_matrix)

first_sim = cosine_sim[0]

cosine_similarity_of_others = first_sim[1:]
top_three_similar = cosine_similarity_of_others.argsort()[-3:]
idxs = list(reversed([idx + 1 for idx in top_three_similar]))
similarity_scores = cosine_sim[0, idxs]

print("-"*10)
print("original: "+dfs.iloc[0]['text'])
for (idx,score) in zip(idxs, similarity_scores):
  print("-"*10)
  print('snippet: ' + dfs.iloc[idx]['text'])
  print('tokens: ' + dfs.iloc[idx]['tokens'])
  print('cosine similarity: ' + str(score))
print("-"*10)


----------
original: Sudanese opposition, government start peace talks Sudanese opposition groups and a government delegation on Friday held the first full day of peace talks in Cairo with both sides optimistic about a deal, sources close to the negotiations told Egypt #39;s MENA news agency.
----------
snippet: Darfur Peace Talks Overshadowed by Fighting (Reuters) Reuters - Peace talks on Sudan's war-torn Darfur\region got under way Monday in Nigeria's capital, overshadowed\by fighting between rebels and government forces in the vast\desert area.
tokens: darfur peace talks overshadow fighting reuters reuters peace talk sudan war tear darfur\region get way monday nigeria capital overshadowed\by fighting rebel government force vast\desert area
cosine similarity: 0.3405044556564531
----------
snippet: Ukraine's Isolated PM Repeats Offer to Opposition (Reuters) Reuters - Prime Minister Viktor Yanukovich, looking\increasingly isolated in Ukraine's deepening political crisis,\offered again 

In [36]:
from sklearn.decomposition import KernelPCA
##TODO reduce the vectorized data using PCA

pca = KernelPCA(n_components=3) #pca would not work for me due to sparcity so i used kernelPCA
reduced_matrix = pca.fit_transform(tfidf_matrix)

##TODO compute again cosine similarity with the reduced version for the first 200 snippets

cosine_sim = cosine_similarity(reduced_matrix)

##TODO for the first snippet, show again its three most similar snippets

first_sim = cosine_sim[0]

cosine_similarity_of_others = first_sim[1:]
top_three_similar = cosine_similarity_of_others.argsort()[-3:]
idxs = list(reversed([idx + 1 for idx in top_three_similar]))
similarity_scores = cosine_sim[0, idxs]

print("-"*10)
print("original: "+dfs.iloc[0]['text'])
for (idx,score) in zip(idxs, similarity_scores):
  print("-"*10)
  print('snippet: ' + dfs.iloc[idx]['text'])
  print('tokens: ' + dfs.iloc[idx]['tokens'])
  print('cosine similarity: ' + str(score))
print("-"*10)

----------
original: Sudanese opposition, government start peace talks Sudanese opposition groups and a government delegation on Friday held the first full day of peace talks in Cairo with both sides optimistic about a deal, sources close to the negotiations told Egypt #39;s MENA news agency.
----------
snippet: Expanded African force, peace talks key to Darfur strategy UNITED NATIONS : An expanded African force must be urgently deployed in Sudan #39;s troubled Darfur region, UN Secretary General Kofi Annan said in a report discussed by the UN Security Council.
tokens: expand african force peace talk key darfur strategy united nations expand african force urgently deploy sudan 39;s troubled darfur region un secretary general kofi annan say report discuss un security council
cosine similarity: 0.9994113598840985
----------
snippet: Forstmann Little to buy Talent Agency IMG Buyout firm Forstmann Little  amp; Co. Thursday said it would buy International Management Group, a leading talent 

Compare the cosine similarity before and after PCA reduction. Did the results change?

When using a low number of components they completely change. When you increase the PCA to a higher number of components you re-obtain the original results.

## Clustering

In [None]:
!pip install scikit-learn-extra



In [None]:
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

##TODO create the clusters found with k-medoids clustering nd 5 clusters
##TODO create the clusters found with k-means clustering nd 5 clusters

seed = 42

# K-Medoids Clustering
kmedoids = KMedoids(n_clusters = 5, random_state = seed)
kmedoids_labels = kmedoids.fit_predict(tfidf_matrix)

# K-Means Clustering
kmeans = KMeans(n_clusters= 5, n_init = 1, random_state = seed )
kmeans_labels = kmeans.fit_predict(tfidf_matrix)

In [None]:
##TODO visualize the k-medoids clustering results through Isomap (first reduce the dimensionality to 2 using Isomap, then use scatter plot (plt.scatter()) to visualize)
from sklearn.manifold import Isomap
import matplotlib.pyplot as plt

isomap = Isomap(n_components=2)
reduced_data = isomap.fit_transform(tfidf_matrix)

plt.figure(figsize=(10, 7))
colors = ['red', 'green', 'blue', 'orange', 'purple']

for cluster_id in range(5):
    cluster_data = reduced_data[kmedoids_labels == cluster_id]
    plt.scatter(cluster_data[:, 0], cluster_data[:, 1], c=colors[cluster_id], label=f"Cluster {cluster_id}")

plt.title('K-Medoids Clustering Visualization (Isomap)')
plt.legend()
plt.show()

In [None]:
##TODO similarly, visualize the k-means results
isomap = Isomap(n_components=2)
reduced_data = isomap.fit_transform(tfidf_matrix)

# Scatter Plot Visualization (Slightly modified)
plt.figure(figsize=(10, 7))

for cluster_id in range(5):
    cluster_data = reduced_data[kmeans_labels == cluster_id]  # Update for K-Means labels
    plt.scatter(cluster_data[:, 0], cluster_data[:, 1], c=colors[cluster_id], label=f"Cluster {cluster_id}")

plt.title('K-Means Clustering Visualization (Isomap)')
plt.legend()
plt.show()


## Topic Modeling: LDA

For this part you will need to use LDA Mallet. If you cannot have Mallet run, you can use the simple LDA algorithm

In [None]:
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

##TODO create a dictionary with the pre-processed tokenized text and filter it according to frequencies and keeping 1000 vocabularies

dictionary = Dictionary(dfs['unjoined_tokens'])
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)

##TODO create the doc_term_matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in dfs['unjoined_tokens']]


In [None]:
##TODO train a LDA Mallet model with 5, 10 and 15 topics
##TODO compute the coherence score for each of these model and print the topics from the model with highest coherence score. Then find the opitimal number of topics using the coherence score.
#i couldn't get mallet to work in colab

In [None]:
#!pip install pyLDAvis
import pyLDAvis.gensim
from gensim.models import wrappers
##TODO using LDAvis visualize the topics using the optimal number of topics