# Google Drive Setup

In [2]:
# Mount Google Drive at /content/gdrive so the project files can be read from this Colab session.
from google.colab import drive
drive.mount('/content/gdrive')
# NOTE: The project folder is shared. To see it under the mount point, open Drive,
# choose "Add shortcut to Drive" in the shared folder's options, and then mount.

Mounted at /content/gdrive


In [3]:
# Per-user Google Drive paths to the project folder; the one to use is selected
# by assigning it to `path` in the following cell.

path_andrew = "/content/gdrive/MyDrive/CSE 6242"


In [4]:
# Select which user's Drive path to work from.
path = path_andrew

# Change into the project folder so that later relative file names resolve
# against it, then list its contents.
%cd {path}
!dir

/content/gdrive/.shortcut-targets-by-id/1M3TUa6bdBADkSFtLqzmA1rk0ROvVyvdl/CSE 6242
Bias\ Classification		      Keyword\ Extraction\ Clustering\ Summaries.gdoc
Bias\ Classifier\ Notes.gdoc	      Keyword\ Extraction.gdoc
clustered_articles_with_keywords.csv  keywords\ (3).csv
dataset_with_keywords.csv	      Midterm\ Report\ Notes.gdoc
Data\ Viz\ Project\ Ideas.gdoc	      Presentation\ Script.gdoc
final\ clustered\ articles	      Project\ Outline.gdoc
Final\ Poster.gslides		      Project\ Proposal.gdoc
Final\ Report.gdoc		      Proposal\ Presentation.gslides
Keyword\ Clustering		      User\ Survey.gform
Keyword\ Extraction		      Website\ Portion\ of\ Project.gdoc


# Imports

In [5]:
%pip install kneed

Collecting kneed
  Downloading kneed-0.8.5-py3-none-any.whl (10 kB)
Installing collected packages: kneed
Successfully installed kneed-0.8.5


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader
import html
import pprint

from tqdm import tqdm

# Create Keyword Mapping

Build a dictionary mapping each keyword to the indices of the articles that contain it, so that clusters can later be re-assigned to articles based on their keywords.

In [7]:
# Relative path: resolved against the project folder entered with %cd above.
keywords_df = pd.read_csv("keywords (3).csv")

In [8]:
# Clean the article text: strip HTML tags, decode HTML entities, then remove
# newline characters (matched literally, not as a regex).
keywords_df["Text"] = (
    keywords_df["Text"]
    .str.replace(r'<[^<>]*>', '', regex=True)
    .apply(html.unescape)
    .str.replace('\n', '', regex=False)
)
# Drop the redundant "Entry Index" column. errors="ignore" keeps this cell
# re-runnable: a plain `del` raises KeyError once the column is already gone.
keywords_df = keywords_df.drop(columns=["Entry Index"], errors="ignore")
display(keywords_df)

Unnamed: 0,Keywords,Title,Text,Bias
0,"sexual, ranco, also, assault, attorney",After DeVos Announced Plans To Reexamine Title...,When explaining her decision to reevaluate Tit...,0
1,"degree, martin, trayvon, university, science",University To Award Trayvon Martin With Posthu...,A Florida university will honor Trayvon Martin...,0
2,"greek, university, texas, said, fraternity",Texas State University suspends Greek life aft...,Nov. 15 (UPI) — Texas State University has sus...,1
3,"jewish, day, unity, acheinu, people",Jewish Organization's Huge Day Of Unity On Tue...,Against the backdrop of an increasingly polari...,0
4,"trump, carrier, jobs, indiana, company","BREAKING: Trump Reaches Agreement To Keep 1,00...",President-elect Donald Trump has reached an ag...,0
...,...,...,...,...
599879,"boyega, said, finn, hes, jedi","John Boyega talks 'Last Jedi,' personal parall...",LOS ANGELES (AP) - John Boyega may have a lot ...,2
599880,"climate, it8217s, pacific, nations, 8220we",Sinking Feeling: More Bad News for Pacific Isl...,Kiribati lidian/Shutterstock In climate negot...,4
599881,"burgers, burger, hamburger, order, hut","Like home: Big, juicy burgers made the old-fas...",.......... .......... .......... .......... .....,2
599882,"new, tax, bill, jersey, york",Decision Time for NY and NJ Republicans on Tax...,It's an important week for the Republican-led ...,3


In [9]:
# Turn each comma-separated keyword string into a list of keywords.
# Values that are already lists are left untouched so the cell can be re-run
# without raising AttributeError ('list' object has no attribute 'split').
keywords_df['Keywords'] = keywords_df['Keywords'].apply(
    lambda kw: kw if isinstance(kw, list) else kw.split(', ')
)
display(keywords_df)

Unnamed: 0,Keywords,Title,Text,Bias
0,"[sexual, ranco, also, assault, attorney]",After DeVos Announced Plans To Reexamine Title...,When explaining her decision to reevaluate Tit...,0
1,"[degree, martin, trayvon, university, science]",University To Award Trayvon Martin With Posthu...,A Florida university will honor Trayvon Martin...,0
2,"[greek, university, texas, said, fraternity]",Texas State University suspends Greek life aft...,Nov. 15 (UPI) — Texas State University has sus...,1
3,"[jewish, day, unity, acheinu, people]",Jewish Organization's Huge Day Of Unity On Tue...,Against the backdrop of an increasingly polari...,0
4,"[trump, carrier, jobs, indiana, company]","BREAKING: Trump Reaches Agreement To Keep 1,00...",President-elect Donald Trump has reached an ag...,0
...,...,...,...,...
599879,"[boyega, said, finn, hes, jedi]","John Boyega talks 'Last Jedi,' personal parall...",LOS ANGELES (AP) - John Boyega may have a lot ...,2
599880,"[climate, it8217s, pacific, nations, 8220we]",Sinking Feeling: More Bad News for Pacific Isl...,Kiribati lidian/Shutterstock In climate negot...,4
599881,"[burgers, burger, hamburger, order, hut]","Like home: Big, juicy burgers made the old-fas...",.......... .......... .......... .......... .....,2
599882,"[new, tax, bill, jersey, york]",Decision Time for NY and NJ Republicans on Tax...,It's an important week for the Republican-led ...,3


In [10]:
# Inverted index: keyword -> list of row labels of the articles that carry it.
keyword_dict = {}

for article_idx, article_keywords in keywords_df["Keywords"].items():
    for keyword in article_keywords:
        # setdefault creates the list on first sight of a keyword.
        keyword_dict.setdefault(keyword, []).append(article_idx)

# Keyword Clustering

In [None]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
w2v = gensim.downloader.load('word2vec-google-news-300')



In [None]:
# Sanity check: look up one word and print the shape of its embedding.
w2v_test= w2v.get_vector("test")
print(w2v_test.shape)

In [None]:
# Keep only keywords that occur in more than one article and that the
# word2vec model has a vector for.
kept_words = [
    word
    for word, article_ids in tqdm(keyword_dict.items(), total=len(keyword_dict))
    if len(article_ids) > 1 and word in w2v
]

# One embedding per kept keyword, in the same order as `keywords`.
word_vectors = [w2v.get_vector(word) for word in kept_words]

keywords = np.array(kept_words)

In [None]:
# Stack the per-keyword vectors into a single 2-D array, one row per keyword.
vector_array = np.vstack(word_vectors)

# The two lengths and the array's first dimension should all agree, since
# keywords and vectors were appended in lockstep.
print(len(keywords))
print(len(word_vectors))
print(vector_array.shape)

# Hyperparameter Search for DBSCAN

In [None]:
# Grid search over DBSCAN's (min_samples, eps). The best configuration is
# tracked separately for the silhouette score (higher is better) and the
# Davies-Bouldin index (lower is better).
max_silhouette = -float("inf")
min_dbindex = float("inf")
# Bound up front so the reporting cell prints None instead of raising
# NameError when every configuration below is skipped.
best_s_params = None
best_db_params = None

for ms in range(3, 6):
    for eps in np.arange(0.1, 5, 0.2):
        clusters = DBSCAN(eps=eps, min_samples=ms).fit_predict(vector_array)
        # Skip degenerate runs with fewer than three distinct labels.
        # NOTE(review): DBSCAN's noise label (-1) is counted as a label here
        # and is scored like a cluster below -- confirm this is intended.
        if len(set(clusters)) < 3:
            continue
        silhouette = silhouette_score(vector_array, clusters)
        dbindex = davies_bouldin_score(vector_array, clusters)

        if silhouette > max_silhouette:
            max_silhouette = silhouette
            best_s_params = (ms, eps)

        if dbindex < min_dbindex:
            min_dbindex = dbindex
            best_db_params = (ms, eps)



In [None]:
# Report the (min_samples, eps) pair and score selected by each metric.
print(best_s_params)
print(max_silhouette)
print(best_db_params)
print(min_dbindex)

In [None]:
# Fit on the stacked array, the same input used by the hyperparameter search
# and by KMeans, rather than on the raw Python list of vectors.
# NOTE(review): eps and min_samples are hard-coded here instead of being taken
# from best_s_params / best_db_params -- confirm that is deliberate.
cluster_labels = DBSCAN(eps=2, min_samples=4).fit_predict(vector_array)

In [None]:
# Number of distinct DBSCAN labels, followed by the labels themselves.
distinct_labels = set(cluster_labels)
print(len(distinct_labels))
print(distinct_labels)

In [None]:
# Print every keyword that DBSCAN assigned to the chosen cluster.
cluster_val = 0

keywords = np.array(keywords)

in_cluster = cluster_labels == cluster_val
idx = np.flatnonzero(in_cluster)
words = keywords[idx]
print(words)

# Hyperparameter Search for KMeans

In [None]:
# Fit KMeans for every candidate k and record the inertia of each fit
# (the input to the elbow search).
k_vals = []
inertia_vals = []

for k in tqdm(range(3, 30)):
    kmeans = KMeans(n_clusters=k, random_state=0, n_init="auto").fit(vector_array)
    k_vals.append(k)
    inertia_vals.append(kmeans.inertia_)

In [None]:
# Locate the elbow of the inertia-vs-k curve. The curve is described to kneed
# as convex and decreasing, i.e. inertia falls as k grows.
kn = KneeLocator(
    k_vals,
    inertia_vals,
    curve='convex',
    direction='decreasing',
    interp_method='interp1d',
)

In [None]:
# Elbow plot: inertia against number of clusters, with the detected knee
# marked by a dashed vertical line.
fig, ax = plt.subplots()
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
ax.plot(k_vals, inertia_vals, 'bx-')
# kn.knee is None when kneed finds no knee; only draw the marker when there
# is a position to draw it at.
if kn.knee is not None:
    y_low, y_high = ax.get_ylim()
    ax.vlines(kn.knee, y_low, y_high, linestyles='dashed')
print(kn.knee)

In [None]:
# Final KMeans fit. best_k is set by hand; the manual cluster names further
# down are keyed by the labels 0..13 that this fit produces.
best_k = 14

kmeans = KMeans(n_clusters=best_k, random_state=0, n_init="auto").fit(vector_array)
kmeans_labels = kmeans.labels_

In [None]:
# Clustering Metrics
# Silhouette score and Davies-Bouldin index of the final KMeans labelling,
# computed on the same vectors the model was fit on.
silhouette = silhouette_score(vector_array, kmeans_labels)
dbindex = davies_bouldin_score(vector_array, kmeans_labels)

print(f"Silhouette Score: {silhouette}")
print(f"DBIndex Score: {dbindex}")

In [None]:
# Print the number of keywords assigned to each KMeans cluster, in label order.
for cluster_id in range(best_k):
    member_mask = kmeans_labels == cluster_id
    print(int(np.count_nonzero(member_mask)))

In [None]:
# Raise NumPy's summarisation threshold to infinity so that arrays are printed
# in full rather than abbreviated with "...".
np.set_printoptions(threshold=np.inf)

In [None]:
# Manually View Keyword Clusters
# Change cluster_val to print the keywords of a different KMeans cluster.
cluster_val = 4
keywords = np.array(keywords)

in_cluster = kmeans_labels == cluster_val
idx = np.flatnonzero(in_cluster)
words = keywords[idx]
print(words)

In [None]:
# Hand-assigned names for the KMeans clusters. The position in this list is
# the cluster label, so the order must not change.
_cluster_names = [
    "Political and Social Issues",                      # 0
    "Community Involvement",                            # 1
    "Names, Organizations, and Various Terms [NOISE]",  # 2  Potentially Noise
    "Locations",                                        # 3
    "Administrative Functions (NOISE)",                 # 4  Potentially Noise
    "Legal and Law Enforcement",                        # 5
    "Science and Medicine",                             # 6
    "Noise/Nonsensical Phrases (NOISE)",                # 7  Potentially Noise
    "Media and Entertainment",                          # 8
    "Various Phrases/Verbs (NOISE)",                    # 9  Potentially Noise
    "Nature and Wildlife",                              # 10
    "Food",                                             # 11
    "Finance and Economics",                            # 12
    "Objects and Accessories (NOISE)",                  # 13 Potentially Noise
]

# Mapping of cluster label -> cluster name.
cluster_dict = dict(enumerate(_cluster_names))

In [None]:
# Cluster labels judged to be noise; their keywords contribute no tags.
noise_clusters = (2, 4, 7, 9, 13)

# One (initially empty) set of cluster names per article.
article_cluster_tags = [set() for _ in range(len(keywords_df))]

for position, word in enumerate(keywords):
    cluster = kmeans_labels[position]
    cluster_name = cluster_dict[cluster]

    if cluster not in noise_clusters:
        # Tag every article that lists this keyword with the cluster's name.
        for article_idx in keyword_dict[word]:
            article_cluster_tags[article_idx].add(cluster_name)

In [None]:
# Positions of the articles that received no cluster tag at all.
empty_idxs = [
    position
    for position, tags in enumerate(article_cluster_tags)
    if len(tags) == 0
]
empty_count = len(empty_idxs)

print(empty_count)
print(empty_idxs)

In [None]:
# Work on a copy so keywords_df itself is left without the tag column.
output_df = keywords_df.copy()
# One set of cluster names per article, in row order.
output_df["Cluster Tags"] = article_cluster_tags

In [None]:
# Inspect the articles that ended up with no cluster tag.
display(output_df.loc[empty_idxs])

In [None]:
# Keep only articles with at least one tag: an empty set is falsy, so
# astype(bool) yields False for untagged rows.
cleaned_output = output_df[output_df["Cluster Tags"].astype(bool)]

display(cleaned_output)

In [None]:
# Manually inspect number of occurrences of each cluster tag

# The non-noise cluster names, in the order they should be reported.
topic_names = [
    "Political and Social Issues",
    "Community Involvement",
    "Locations",
    "Legal and Law Enforcement",
    "Science and Medicine",
    "Media and Entertainment",
    "Nature and Wildlife",
    "Food",
    "Finance and Economics",
]
counts = dict.fromkeys(topic_names, 0)

# Each article adds one to every topic in its tag set.
for tag_set in cleaned_output["Cluster Tags"]:
    for topic in tag_set:
        counts[topic] += 1

pprint.pp(counts)

In [None]:
# Write the tagged articles to the current working directory. With to_csv's
# defaults the DataFrame index is written as the first column.
# NOTE(review): "Cluster Tags" holds Python sets, which will presumably be
# serialised as their string repr -- confirm downstream readers expect that.
cleaned_output.to_csv("full_clustered_articles.csv")