In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then keep the English entries in a set so
# that preprocess() can do quick membership tests against it.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# Step 1: Text Preprocessing Function
def preprocess(text):
    """Return ``text`` as a cleaned, space-joined string.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is deleted, and whitespace-separated tokens
    found in the module-level ``stop_words`` collection are dropped.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(token for token in cleaned.split() if token not in stop_words)

# Step 2: Toy corpus and its cleaned counterpart (one cleaned string per sentence)
raw_dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts"
]
dataset = list(map(preprocess, raw_dataset))

# Step 3: Turn the cleaned sentences into a TF-IDF matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

# Step 4: Fit KMeans (fixed random_state) and read back each document's cluster
k = 2
km = KMeans(n_clusters=k, random_state=42).fit(X)
y_pred = km.predict(X)

# Step 5: Show the original sentences next to their assigned cluster
table_data = [
    ["Document", "Predicted Cluster"],
    *([doc, cluster] for doc, cluster in zip(raw_dataset, y_pred)),
]
print(tabulate(table_data, headers="firstrow"))

# Step 6: For each centroid, list the ten terms with the largest weights
print("\nTop terms per cluster:")
# argsort is ascending, so reverse every row to get the heaviest terms first.
order_centroids = np.argsort(km.cluster_centers_)[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i, ranked_term_ids in enumerate(order_centroids):
    print(f"Cluster {i}:")
    for ind in ranked_term_ids[:10]:
        print(f" {terms[ind]}")
    print()

# Step 7: Report cluster balance
# NOTE: purity is (1/N) * sum over clusters of the size of the majority
# ground-truth class inside each cluster, so it can only be computed when
# reference labels exist. This dataset has none. The quantity below is just
# the fraction of documents that landed in the largest cluster, so it is now
# labelled as such instead of being reported as "Purity".
total_samples = len(y_pred)
cluster_label_counts = Counter(y_pred)
largest_cluster_share = max(cluster_label_counts.values()) / total_samples
purity = largest_cluster_share  # legacy name kept for compatibility; NOT a true purity score
print("Largest cluster share:", largest_cluster_share)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    1

Top terms per cluster:
Cluster 0:
 camping
 enjoy
 hiking
 mountains
 weekends
 listening
 concerts
 football
 games
 going

Cluster 1:
 love
 playing
 football
 weekends
 going
 sports
 music
 concerts
 video
 games

Purity: 0.8


In [3]:
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then keep the English entries in a set so
# that preprocess_tokens() can do quick membership tests against it.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# Step 1: Text Preprocessing Function
def preprocess_tokens(text):
    """Return the cleaned tokens of ``text`` as a list.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is deleted, the result is split on whitespace,
    and tokens found in the module-level ``stop_words`` collection are
    dropped.
    """
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    kept = []
    for token in stripped.split():
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Step 2: Create and preprocess the documents
raw_dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts"
]
tokenized_dataset = [preprocess_tokens(doc) for doc in raw_dataset]

# Step 3: Train Word2Vec model
# KMeans below is seeded, but the embeddings it clusters were not: training
# with several worker threads and no seed gives different vectors (and hence
# different clusters) on every run. A fixed seed with a single worker makes
# training repeatable. NOTE(review): gensim's documentation says PYTHONHASHSEED
# must also be fixed for repeatability across interpreter launches — confirm
# for your environment.
word2vec_model = Word2Vec(
    sentences=tokenized_dataset,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)


# Step 4: Create document embeddings by averaging word vectors
def _mean_word_vector(tokens, keyed_vectors):
    """Average the vectors of the tokens present in ``keyed_vectors``.

    Returns an all-zero vector of the model's dimensionality when none of the
    tokens is in the vocabulary (for example, an empty token list), so every
    document yields a row of the same length instead of NaN.
    """
    vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not vectors:
        return np.zeros(keyed_vectors.vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean(vectors, axis=0)


X = np.array([_mean_word_vector(doc, word2vec_model.wv) for doc in tokenized_dataset])

# Step 5: Fit KMeans (fixed random_state) and read back each document's cluster
k = 2
km = KMeans(n_clusters=k, random_state=42).fit(X)
y_pred = km.predict(X)

# Step 6: Show the original sentences next to their assigned cluster
table_data = [
    ["Document", "Predicted Cluster"],
    *([raw_doc, cluster] for raw_doc, cluster in zip(raw_dataset, y_pred)),
]
print(tabulate(table_data, headers="firstrow"))

# Step 7: Report cluster balance
# NOTE: purity is (1/N) * sum over clusters of the size of the majority
# ground-truth class inside each cluster, so it can only be computed when
# reference labels exist. This dataset has none. The quantity below is just
# the fraction of documents that landed in the largest cluster, so it is now
# labelled as such instead of being reported as "Purity".
total_samples = len(y_pred)
cluster_label_counts = Counter(y_pred)
largest_cluster_share = max(cluster_label_counts.values()) / total_samples
purity = largest_cluster_share  # legacy name kept for compatibility; NOT a true purity score
print("Largest cluster share:", largest_cluster_share)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            1
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              0
I prefer playing video games over sports                           1
I love listening to music and going to concerts                    0
Purity: 0.6


In [5]:
# Exercise 2

In [9]:
# Text Clustering using TF-IDF (with Preprocessing)
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tabulate import tabulate
from collections import Counter
import nltk

# Fetch the NLTK corpora used below: the stop-word list for filtering and
# WordNet for the lemmatizer (only needed once per machine).
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# 1. Load Dataset
# Read the complaints file and keep the non-missing entries of its 'text'
# column as plain Python strings.
df = pd.read_csv("customer_complaints_1.csv")
texts = df.loc[df['text'].notna(), 'text'].astype(str).to_list()

# 2. Text Preprocessing Function
# Built once: translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the default NLTK resources, so the stop-word list
# and the lemmatizer are created once rather than per token / per document.
_PREPROCESS_DEFAULTS = {}


def preprocess(text, stop_words=None, lemmatizer=None):
    """Normalise one document before TF-IDF vectorisation.

    Steps, in order: lower-case, delete digit runs, delete ASCII punctuation,
    split on whitespace, drop stop words, lemmatize each remaining token and
    join the tokens with single spaces.

    The original version evaluated ``stopwords.words('english')`` inside the
    filtering comprehension, i.e. once per token, and tested membership
    against the returned list. The defaults are now resolved once and the
    stop words are held in a set.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a shared ``WordNetLemmatizer`` instance.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    if stop_words is None:
        if "stop_words" not in _PREPROCESS_DEFAULTS:
            _PREPROCESS_DEFAULTS["stop_words"] = frozenset(stopwords.words('english'))
        stop_words = _PREPROCESS_DEFAULTS["stop_words"]
    if lemmatizer is None:
        if "lemmatizer" not in _PREPROCESS_DEFAULTS:
            _PREPROCESS_DEFAULTS["lemmatizer"] = WordNetLemmatizer()
        lemmatizer = _PREPROCESS_DEFAULTS["lemmatizer"]

    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return ' '.join(tokens)

# Clean every complaint with preprocess()
cleaned_texts = list(map(preprocess, texts))

# 3. Turn the cleaned complaints into a TF-IDF matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_texts)

# 4. Fit KMeans (fixed random_state) and read back each document's cluster
k = 2
km = KMeans(n_clusters=k, random_state=42).fit(X)
y_pred = km.predict(X)

# 5. Display Results
# Pair each label with `texts`, the NaN-free list that was actually clustered.
# Zipping against df['text'] shifts every label after the first missing value
# onto the wrong document, because y_pred has one entry per element of `texts`.
preview_rows = 10
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([doc, cluster] for doc, cluster in zip(texts, y_pred))
# +1 keeps the header row, so exactly `preview_rows` documents are shown.
print(tabulate(table_data[:preview_rows + 1], headers="firstrow"))

# 6. For each centroid, list the ten terms with the largest weights
print("\nTop terms per cluster:")
# argsort is ascending, so reverse every row to get the heaviest terms first.
order_centroids = np.argsort(km.cluster_centers_)[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i, ranked_term_ids in enumerate(order_centroids):
    print(f"Cluster {i}:")
    for ind in ranked_term_ids[:10]:
        print(f"  {terms[ind]}")
    print()

# 7. Report cluster balance
# NOTE: purity is (1/N) * sum over clusters of the size of the majority
# ground-truth class inside each cluster, so it can only be computed when
# reference labels exist. None are used here. The quantity below is just the
# fraction of documents that landed in the largest cluster, so it is now
# labelled as such instead of being reported as "Purity".
total_samples = len(y_pred)
cluster_label_counts = Counter(y_pred)
largest_cluster_share = max(cluster_label_counts.values()) / total_samples
purity = largest_cluster_share  # legacy name kept for compatibility; NOT a true purity score
print("Largest cluster share:", round(largest_cluster_share, 2))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                