In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stop-word corpus is present locally before reading it
nltk.download('stopwords')
# Held as a set because preprocess() tests every token for membership
stop_words = {*stopwords.words('english')}

# Read the tweet CSV and pull the text column out as a plain Python list
df = pd.read_csv('tweets_sentiment.csv')
tweets = df.loc[:, 'Tweet'].tolist()

# Preprocessing function
def preprocess(text, stopword_set=None):
    """Normalize a raw tweet into a space-separated string of content tokens.

    URLs and @mentions are dropped, every remaining non-alphanumeric
    character (including ``#`` and ``_``) becomes a space, the text is
    lower-cased, and stop words and one-character tokens are removed.
    Hashtag words themselves are kept; only the ``#`` symbol is stripped.

    Parameters
    ----------
    text : object
        The tweet. Non-string values are converted with ``str``; ``None``
        and float NaN (how pandas represents a missing cell) are treated
        as an empty tweet.
    stopword_set : collection of str, optional
        Lower-case tokens to discard. Defaults to the notebook-level
        ``stop_words`` set.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` if none remain.
    """
    # A missing cell arrives as None or float NaN; str() would turn it into
    # the literal token "none"/"nan", which then pollutes the keyword ranking.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # '#' is already covered by [\W_]; it is listed explicitly for readability.
    text = re.sub(r'http\S+|@\w+|#|[\W_]', ' ', str(text))
    tokens = text.lower().split()
    return ' '.join(t for t in tokens if t not in stopword_set and len(t) > 1)

# Clean every tweet, keeping the same order (and therefore the same positions) as `tweets`
processed_tweets = list(map(preprocess, tweets))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by 'all-MiniLM-L6-v2'
model = SentenceTransformer('all-MiniLM-L6-v2')
# Embed every preprocessed tweet; convert_to_tensor=True requests a tensor
# result (the clustering cell calls .cpu().numpy() on it before use)
embeddings = model.encode(processed_tweets, convert_to_tensor=True)

  from .autonotebook import tqdm as notebook_tqdm





To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [3]:
from sklearn.cluster import KMeans
import numpy as np

# scikit-learn expects array input, so bring the tensor back to the CPU as NumPy
embeddings_np = embeddings.cpu().numpy()

# Group the tweets into a fixed number of clusters; the seed keeps runs repeatable
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(embeddings_np)
# labels_[i] is the cluster id assigned to tweet i
clusters = kmeans.labels_

[WinError 2] The system cannot find the file specified
  File "c:\Users\prant\anaconda3\envs\aienv\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\prant\anaconda3\envs\aienv\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\prant\anaconda3\envs\aienv\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\prant\anaconda3\envs\aienv\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


In [12]:
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer

# Collect the strongest TF-IDF terms of every cluster into one flat list.
# A term is appended once for each cluster in which it ranks in the top 10.
top_keywords = []

for cluster_id in range(num_clusters):
    cluster_tweets = [processed_tweets[i] for i, c in enumerate(clusters) if c == cluster_id]

    # TfidfVectorizer raises ValueError ("empty vocabulary") when no document
    # contains a token, e.g. a cluster made up only of tweets that were emptied
    # by preprocessing. Such a cluster has no keywords to contribute, so skip it.
    if not any(tweet.strip() for tweet in cluster_tweets):
        continue

    # Fit TF-IDF on this cluster alone, limited to its 50 most frequent terms
    tfidf = TfidfVectorizer(max_features=50)
    tfidf_matrix = tfidf.fit_transform(cluster_tweets)
    feature_names = tfidf.get_feature_names_out()

    # Rank terms by their TF-IDF weight summed over the cluster's tweets
    scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    sorted_indices = np.argsort(scores)[::-1]
    keywords = [feature_names[i] for i in sorted_indices[:10]]
    top_keywords.extend(keywords)


# Rank keywords globally. Note that 'Frequency' is the number of clusters in
# which the keyword made the top 10 (so at most num_clusters), not a raw count
# of occurrences in the tweets.
keyword_counter = Counter(top_keywords)
most_meaningful_words = keyword_counter.most_common(10)

# Convert to DataFrame
keywords_df = pd.DataFrame(most_meaningful_words, columns=['Keyword', 'Frequency'])
keywords_df.index += 1  # Start index at 1 for readability

print("\nTop 10 Most Meaningful Words:")
print(keywords_df.to_string(index=True))


Top 10 Most Meaningful Words:
         Keyword  Frequency
1           war2          5
2         trisha          2
3   hardikpandya          2
4            ntr          1
5          actor          1
6           lead          1
7         pathan          1
8        percent          1
9          cameo          1
10           100          1


In [14]:
# Extract key phrases (unigrams + bigrams) and one representative tweet per cluster.
# Relies on earlier cells for: df, processed_tweets, embeddings, clusters, num_clusters.
from collections import Counter

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NumPy copy of the embeddings (.cpu() first in case the tensor lives on a GPU)
embeddings_np = embeddings.cpu().numpy()

# Result containers
cluster_keyphrases = []      # flat list: the top phrases of every cluster, concatenated
keyphrases_by_cluster = []   # one list of top phrases per cluster, indexed by cluster_id
cluster_sentences = []       # one representative tweet per cluster, indexed by cluster_id

for cluster_id in range(num_clusters):
    cluster_indices = [i for i, c in enumerate(clusters) if c == cluster_id]
    cluster_processed = [processed_tweets[i] for i in cluster_indices]
    cluster_original = [df['Tweet'].iloc[i] for i in cluster_indices]

    # ======================================
    # 1. Extract Keyphrases (n-grams)
    # ======================================
    # TfidfVectorizer raises ValueError ("empty vocabulary") when no document
    # contains a token, so a cluster of blank tweets simply gets no phrases.
    if any(text.strip() for text in cluster_processed):
        tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=50)
        tfidf_matrix = tfidf.fit_transform(cluster_processed)
        feature_names = tfidf.get_feature_names_out()

        # Rank phrases by their TF-IDF weight summed over the cluster's tweets
        scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
        sorted_indices = np.argsort(scores)[::-1]
        keyphrases = [feature_names[i] for i in sorted_indices[:10]]
    else:
        keyphrases = []
    cluster_keyphrases.extend(keyphrases)
    keyphrases_by_cluster.append(keyphrases)

    # ======================================
    # 2. Find Representative Sentence
    # ======================================
    # Always append exactly one entry per cluster so that
    # cluster_sentences[cluster_id] stays aligned with cluster_id.
    if not cluster_indices:
        cluster_sentences.append(None)
        continue

    cluster_embeddings = embeddings_np[cluster_indices]

    # The centroid is the mean embedding of the cluster's tweets
    centroid = np.mean(cluster_embeddings, axis=0)

    # The representative tweet is the one whose embedding is most
    # cosine-similar to the centroid; similarities has shape (1, n_tweets)
    similarities = cosine_similarity([centroid], cluster_embeddings)
    most_representative_idx = int(np.argmax(similarities))
    representative_sentence = cluster_original[most_representative_idx]

    cluster_sentences.append(representative_sentence)

# ======================================
# Create Final DataFrame
# ======================================
# Top phrases across all clusters; the count is the number of clusters in
# which a phrase made that cluster's top 10
phrase_counter = Counter(cluster_keyphrases)
top_phrases = phrase_counter.most_common(10)

# Cluster-wise summary. 'Top Phrases' must come from the cluster's own ranking;
# taking it from the global phrase_counter would give every row identical text.
cluster_results = [
    {
        'Cluster': cluster_id + 1,
        'Top Phrases': ", ".join(keyphrases_by_cluster[cluster_id][:5]),
        'Representative Sentence': cluster_sentences[cluster_id],
    }
    for cluster_id in range(num_clusters)
]

# Create DataFrames (1-based index for readability)
phrases_df = pd.DataFrame(top_phrases, columns=['Phrase', 'Frequency'])
phrases_df.index += 1

clusters_df = pd.DataFrame(cluster_results)
clusters_df.index += 1

# Display results
print("\n" + "="*55)
print("Top 10 Meaningful Phrases (Words and Bigrams)")
print("="*55)
print(phrases_df.to_string(index=True, justify='center'))


Top 10 Meaningful Phrases (Words and Bigrams)
       Phrase     Frequency
1           war2      5    
2         trisha      2    
3      war2 lead      1    
4            ntr      1    
5      actor ntr      1    
6          actor      1    
7           lead      1    
8     lead actor      1    
9         pathan      1    
10  pathan cameo      1    
