In [1]:
# Importing necessary libraries 

import os
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud



In [2]:
# Loading dataset that includes the 'body_cleaned' column.
# The CSV location can be overridden with the FINAL_COMBINED_CSV environment variable so the
# notebook is runnable on machines other than the author's; when the variable is unset the
# original local path is used unchanged.
df2 = pd.read_csv(
    os.environ.get(
        'FINAL_COMBINED_CSV',
        '/Users/rachelgupta/Desktop/NAACP - PIT NE/pitne-bias-in-media/summer24/notebooks/Rachel/final_combined3.csv',
    )
)


In [3]:
# Ensuring the 'body_cleaned' column is a string.
# Missing values are replaced with an empty string first: a plain astype(str) would turn NaN
# into the literal text 'nan', which the TF-IDF step below would then count as a real word.
df2['body_cleaned'] = df2['body_cleaned'].fillna('').astype(str)

In [4]:


# Vectorize the text data using TF-IDF, dropping English stop words
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(df2['body_cleaned'])

# Fit the KMeans model (5 clusters, fixed random_state)
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_train_tfidf)

# Assign clusters to the articles.
# X_train_tfidf was built from this same column, so it is reused here instead of
# running the whole corpus through vectorizer.transform a second time.
df2['cluster'] = kmeans.predict(X_train_tfidf)

In [5]:
import numpy as np

# Feature names (terms) exposed by the fitted vectorizer
terms = vectorizer.get_feature_names_out()

# For each cluster centre, feature indices ordered from largest weight to smallest
order_centroids = np.argsort(kmeans.cluster_centers_, axis=1)[:, ::-1]

# Print the 20 highest-weighted terms of every cluster
for cluster_id, ranked_features in enumerate(order_centroids):
    print(f"Cluster {cluster_id}:")
    for feature_idx in ranked_features[:20]:
        print(f" {terms[feature_idx]}")
    print()

Cluster 0:
 said
 boston
 people
 like
 new
 says
 com
 year
 time
 globe
 years
 family
 music
 body
 www
 day
 massachusetts
 life
 school
 work

Cluster 1:
 game
 celtics
 said
 season
 team
 points
 games
 play
 sox
 bruins
 coach
 win
 players
 going
 second
 good
 year
 got
 league
 scored

Cluster 2:
 baker
 said
 campaign
 state
 republican
 governor
 party
 democratic
 senate
 voters
 warren
 brown
 political
 election
 candidates
 patrick
 massachusetts
 democrats
 coakley
 candidate

Cluster 3:
 boston
 penalty
 11
 10
 bb
 2b
 12
 17
 13
 14
 totals
 18
 period
 bogaerts
 19
 3b
 shots
 ip
 16
 1b

Cluster 4:
 said
 state
 massachusetts
 percent
 health
 school
 company
 year
 students
 new
 boston
 people
 million
 city
 public
 care
 000
 police
 schools
 companies



In [6]:
# Display sample documents (up to 3) from each cluster
for cluster in range(kmeans.n_clusters):
    print(f"Cluster {cluster}:")
    cluster_docs = df2.loc[df2['cluster'] == cluster, 'body_cleaned']
    # Sampling without replacement raises ValueError when more rows are requested than
    # exist, so the request is capped at the cluster size; a cluster with fewer than
    # 3 documents is simply shown in full.
    n_samples = min(3, len(cluster_docs))  # Adjust the number of samples as needed
    sample_docs = cluster_docs.sample(n_samples, random_state=42).values
    for doc in sample_docs:
        print(f" - {doc}")
    print()

Cluster 0:
 - peck j. todd harris , also serves film ' producer , hoping brings increased exposure lacrosse , already one fastest growing sports country . harris said $ 8 million film funded lacrosse aficionados supporters . scheduled released next year . film , crooked arrows , focuses native american lacrosse team competes better- trained prep schools . independent film stars brandon routh ( ` superman ' ) gil birmingham ( ` twilight ' ) , tv film actors chelsea ricketts crystal allen . dozens massachusetts high school college lacrosse players serve extras film , hand saturday st. john ' prep danvers , film ' climactic scenes shot championship game featuring crooked arrows . film ' producers contacted area high school youth coaches leagues recruit families lacrosse players fill stands extras . peck said extras take part aug. 13 filming register crookedarrows . com website . filming begins 7 a. m. lunch provided . also said shirts , lacrosse sticks , memorabilia movie given away filmi

In [7]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Reduce the TF-IDF vectors to two dimensions with t-SNE.
# NOTE(review): .toarray() builds a dense copy of the TF-IDF matrix — confirm it fits in
# memory for the full corpus.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_train_tfidf.toarray())

# Draw one scatter series per cluster so each gets its own legend entry
fig, ax = plt.subplots(figsize=(10, 6))
cluster_labels = df2['cluster'].to_numpy()
for cluster_id in range(kmeans.n_clusters):
    in_cluster = cluster_labels == cluster_id
    ax.scatter(
        X_tsne[in_cluster, 0],
        X_tsne[in_cluster, 1],
        label=f'Cluster {cluster_id}',
    )
ax.legend()
ax.set_title("t-SNE Visualization of Document Clusters")
ax.set_xlabel("t-SNE Component 1")
ax.set_ylabel("t-SNE Component 2")
plt.show()