In [1]:
# TF-IDF VECTORIZER

#import the libraries
import numpy as np 
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer 
from tabulate import tabulate 
from collections import Counter

#create the document
dataset = ["I love playing football on the weekends", 
           "I enjoy hiking and camping in the mountains", 
           "I like to read books and watch movies", 
           "I prefer playing video games over sports", 
           "I love listening to music and going to concerts"]

#vectorize the dataset
vectorizer = TfidfVectorizer() 
X = vectorizer.fit_transform(dataset)

#perform clustering
k = 2  # Define the number of clusters 
km = KMeans(n_clusters=k) 
km.fit(X) 
 
# Predict the clusters for each document 
y_pred = km.predict(X) 
 
# Display the document and its predicted cluster in a table 
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)]) 
print(tabulate(table_data, headers="firstrow")) 

# Print top terms per cluster 
print("\nTop terms per cluster:") 
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
terms = vectorizer.get_feature_names_out() 
for i in range(k): 
    print("Cluster %d:" % i) 
    for ind in order_centroids[i, :10]: 
        print(' %s' % terms[ind]) 
    print()

# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity) 

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1

Top terms per cluster:
Cluster 0:
 playing
 the
 weekends
 on
 football
 video
 sports
 prefer
 over
 games

Cluster 1:
 to
 and
 read
 watch
 movies
 like
 books
 concerts
 going
 music

Purity: 0.6


In [3]:
# TF-IDF VECTORIZER (after preprocessing)

# import the libraries
import numpy as np 
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer 
from tabulate import tabulate 
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# text preprocessing function
def preprocess(text):
    text = text.lower()  # convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation and numbers
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # remove stopwords
    return ' '.join(tokens)

# create the document and apply preprocessing
raw_dataset = [
    "I love playing football on the weekends", 
    "I enjoy hiking and camping in the mountains", 
    "I like to read books and watch movies", 
    "I prefer playing video games over sports", 
    "I love listening to music and going to concerts"
]
dataset = [preprocess(doc) for doc in raw_dataset]

# vectorize the dataset
vectorizer = TfidfVectorizer() 
X = vectorizer.fit_transform(dataset)

# perform clustering
k = 2  # Define the number of clusters 
km = KMeans(n_clusters=k, random_state=42) 
km.fit(X) 

# Predict the clusters for each document 
y_pred = km.predict(X) 

# Display the document and its predicted cluster in a table 
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(raw_dataset, y_pred)])
print(tabulate(table_data, headers="firstrow")) 

# Print top terms per cluster 
print("\nTop terms per cluster:") 
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
terms = vectorizer.get_feature_names_out() 
for i in range(k): 
    print("Cluster %d:" % i) 
    for ind in order_centroids[i, :10]: 
        print(' %s' % terms[ind]) 
    print()

# Calculate purity 
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity)

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    1

Top terms per cluster:
Cluster 0:
 camping
 enjoy
 hiking
 mountains
 weekends
 listening
 concerts
 football
 games
 going

Cluster 1:
 love
 playing
 football
 weekends
 going
 sports
 music
 concerts
 video
 games

Purity: 0.8


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Emily\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
pip install --upgrade scipy




In [19]:
# import libraries
import numpy as np 
from sklearn.cluster import KMeans 
from gensim.models import Word2Vec 
from tabulate import tabulate 
from collections import Counter 
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# preprocessing function
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# original documents
raw_dataset = [
    "I love playing football on the weekends", 
    "I enjoy hiking and camping in the mountains", 
    "I like to read books and watch movies", 
    "I prefer playing video games over sports", 
    "I love listening to music and going to concerts"
]

# apply preprocessing
dataset = [preprocess(doc) for doc in raw_dataset]

# train Word2Vec model 
tokenized_dataset = [doc.split() for doc in dataset] 
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, window=5, min_count=1, workers=4)

# create document embeddings 
X = np.array([
    np.mean([word2vec_model.wv[word] for word in doc if word in word2vec_model.wv] or [np.zeros(100)], axis=0)
    for doc in tokenized_dataset
])

# perform clustering
k = 2  # number of clusters 
km = KMeans(n_clusters=k, random_state=42) 
km.fit(X) 

# predict clusters
y_pred = km.predict(X)

# tabulate original (non-preprocessed) document with its predicted cluster
table_data = [["Document", "Predicted Cluster"]] 
table_data.extend([[doc, cluster] for doc, cluster in zip(raw_dataset, y_pred)]) 
print(tabulate(table_data, headers="firstrow"))

# calculate purity
total_samples = len(y_pred) 
cluster_label_counts = [Counter(y_pred)] 
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples 
print("Purity:", purity)


ImportError: cannot import name 'triu' from 'scipy.linalg' (C:\Users\Emily\anaconda3\Lib\site-packages\scipy\linalg\__init__.py)

In [27]:
#Question 2

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from tabulate import tabulate
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Load and preprocess the dataset
df = pd.read_csv("customer_complaints_1.csv")
df = df[['text']].dropna()  # Use lowercase column name

# Preprocessing function
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing
df['Processed_Text'] = df['text'].apply(preprocess)

# Limit for performance (optional)
df = df.head(1000)

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Processed_Text'])

# Perform KMeans clustering
k = 5
km = KMeans(n_clusters=k, random_state=42)
km.fit(X)
y_pred = km.predict(X)
df['Cluster'] = y_pred

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f" {terms[ind]}")
    print()

# Silhouette score as a proxy for clustering quality
silhouette = silhouette_score(X, y_pred)
print(f"\nSilhouette Score: {silhouette:.4f}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Emily\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Top terms per cluster:
Cluster 0:
 service
 since
 adding
 boxes
 second
 customer
 rude
 joke
 never
 malfunction

Cluster 1:
 internet
 comcast
 rep
 tech
 security
 cable
 rude
 burial
 us
 mins

Cluster 2:
 internet
 im
 dealing
 interruptions
 lead
 stopping
 late
 verizon
 multiple
 poor

Cluster 3:
 mbps
 speed
 contract
 service
 day
 internet
 customer
 years
 blast
 call

Cluster 4:
 contract
 would
 xfinity
 months
 time
 get
 terrible
 hardware
 signed
 customer


Silhouette Score: 0.0004
