In [3]:
import numpy as np
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from collections import Counter

In [1]:
# Environment sanity check: print the path of the Python interpreter the
# kernel is running on.
import sys
print(sys.executable)
# IPython shell escape: list installed packages and keep the scikit-learn row.
# NOTE(review): `!pip` runs whatever `pip` is first on the shell PATH, which may
# not be the interpreter printed above — confirm they match.
!pip list | grep scikit-learn


/Users/Mazarinetulp/Documents/nlpass2/NLP_group_9/NLP/.venv/bin/python
scikit-learn              1.5.2


In [4]:

# Read the preprocessed articles from disk.
df = pd.read_parquet("data/processed/preprocessed_df.parquet")

# Preview the first five token lists as a quick sanity check.
print(df["preprocessed_content"][:5])

24    [ecb, effect, move, euro, huge, falling, pip, ...
32    [stock, watch, march, trading, session, stock,...
57    [rout, apple, facebook, nasdaq, fb, monday, sw...
78    [immediate, releasechicago, il, january, annou...
82    [company, bringing, innovation, ce, jan, get, ...
Name: preprocessed_content, dtype: object


In [10]:
# Step 1: Load the spaCy pipeline used to embed tokens
# TODO: decide whether to switch to the medium model
nlp = spacy.load("en_core_web_sm")

# Step 2: Pull the per-article token lists out of the 'preprocessed_content' column
tokenized_articles = df.preprocessed_content.tolist()

# Step 3: Define a function to get document vectors by averaging word embeddings
def document_vector(doc):
    """Return one embedding for a tokenized document by averaging token vectors.

    Each token is passed through the module-level spaCy pipeline ``nlp``
    exactly once; tokens whose parsed result reports ``has_vector`` as false
    are skipped.

    Args:
        doc: Iterable of token strings.

    Returns:
        A 1-D numpy array: the element-wise mean of the kept token vectors, or
        an all-zero vector of matching width when no token has a vector.
    """
    token_vectors = []
    for word in doc:
        # Parse once and reuse the result; the pipeline call is the expensive
        # step and was previously executed twice for every kept token.
        parsed = nlp(word)
        if parsed.has_vector:
            token_vectors.append(parsed.vector)

    if token_vectors:
        return np.mean(token_vectors, axis=0)

    # Fallback for documents with no usable token. vocab.vectors_length is 0
    # when the loaded model ships no static vectors (believed to be the case
    # for en_core_web_sm — confirm), which would yield a zero-length array that
    # cannot be stacked with real document vectors. Probe the pipeline for the
    # actual width in that case.
    width = nlp.vocab.vectors_length or len(nlp("word").vector)
    return np.zeros(width)

# Step 4: Embed every article and collect the results into a single array
doc_vectors = np.array([document_vector(tokens) for tokens in tokenized_articles])

# Step 5: Cluster the document vectors with KMeans; each cluster is treated as a topic
num_topics = 5  # Adjust the number of topics as needed
kmeans = KMeans(random_state=42, n_clusters=num_topics)
kmeans.fit(doc_vectors)

# Step 6: Store each document's cluster label in the DataFrame
df["Topic"] = kmeans.labels_

# Step 7: Extract the top words for each topic
def get_top_words_per_topic(df, tokenized_docs, num_top_words=10):
    """Return the most frequent words of each topic.

    Documents are matched to their topic label by position: the n-th entry of
    ``tokenized_docs`` belongs to the n-th row of ``df``. Topics are the
    integers ``0 .. num_topics - 1``, where ``num_topics`` is read from module
    scope.

    Args:
        df: DataFrame with a 'Topic' column holding one label per document.
        tokenized_docs: Sequence of token lists, in the same row order as ``df``.
        num_top_words: Maximum number of words to report per topic.

    Returns:
        Dict mapping each topic id to a list of its most common words, most
        frequent first. A topic with no documents maps to an empty list.
    """
    # Take the labels positionally. df['Topic'][i] is a label-based lookup and
    # raises KeyError (or picks the wrong row) when the frame's index is not
    # 0..n-1, e.g. after rows were filtered out during preprocessing.
    labels = df['Topic'].tolist()

    # One counter per topic, filled in a single pass over the documents.
    counts = {topic: Counter() for topic in range(num_topics)}
    for label, tokens in zip(labels, tokenized_docs):
        bucket = counts.get(label)
        if bucket is not None:
            bucket.update(tokens)

    return {
        topic: [word for word, _ in counter.most_common(num_top_words)]
        for topic, counter in counts.items()
    }

# Step 8: Compute the most frequent words per topic and print one line per topic
top_words_per_topic = get_top_words_per_topic(df, tokenized_articles)

for topic in top_words_per_topic:
    words = top_words_per_topic[topic]
    print(f"Topic {topic}: {words}")

# Optional: persist the DataFrame, now including the 'Topic' column, to a new file
df.to_parquet("data/processed/topic_assigned_df.parquet")


KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Step 1: Read the preprocessed articles from disk
df = pd.read_parquet("data/processed/preprocessed_df.parquet")

# Step 2: Pull the per-article token lists out as a plain Python list
tokenized_articles = df.preprocessed_content.tolist()

# Step 3: Train a Word2Vec model on the tokenized articles (100-dimensional vectors)
w2v_model = Word2Vec(
    sentences=tokenized_articles,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)

# Step 4: Get word vectors for each article by averaging word vectors in each article
def article_vector(article, model):
    """Average the embeddings of an article's in-vocabulary words.

    Args:
        article: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by word)
            and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is known.
    """
    embeddings = model.wv
    known = []
    for token in article:
        if token in embeddings:
            known.append(embeddings[token])

    # Guard clause: nothing to average.
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Embed every article with the trained model and collect the results into one array
article_vectors = np.array(
    [article_vector(tokens, w2v_model) for tokens in tokenized_articles]
)

# Step 5: Apply KMeans clustering for topic discovery
# Determine the number of clusters (topics) using the elbow method
def plot_elbow_curve(data, max_k=15, random_state=42):
    """Plot KMeans inertia against the number of clusters (elbow method).

    Fits one KMeans model for every k from 1 up to and including ``max_k``
    and plots each model's ``inertia_``.

    Args:
        data: Sized collection of sample vectors to cluster.
        max_k: Largest number of clusters to evaluate (inclusive). Capped at
            the number of samples, since KMeans cannot fit more clusters than
            there are samples.
        random_state: Seed passed to every KMeans fit so that the curve is
            the same on every run.

    Returns:
        None. The figure is shown via matplotlib.
    """
    # Include max_k itself (range's stop is exclusive) and never request more
    # clusters than there are samples.
    upper_k = min(max_k, len(data))
    K = range(1, upper_k + 1)

    distortions = []
    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=random_state)
        kmeans.fit(data)
        distortions.append(kmeans.inertia_)

    plt.figure(figsize=(10,6))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('Elbow Method for Optimal k')
    plt.show()

# Elbow plot to determine number of topics
plot_elbow_curve(article_vectors)

# Select the number of topics (e.g., based on elbow plot)
num_topics = 5  # Example number, replace with the actual number from the elbow plot

# Step 6: Train the final KMeans model
# random_state is fixed (same seed as the spaCy-based clustering above) so that,
# for the same input vectors, the topic assignments do not change between runs.
kmeans_model = KMeans(n_clusters=num_topics, random_state=42)
kmeans_model.fit(article_vectors)

# Step 7: Get the cluster labels (i.e., topic assignments for each article)
topic_labels = kmeans_model.labels_

# Attach the topic assignments to the DataFrame and display them
df['Topic'] = topic_labels
print(df[['preprocessed_content', 'Topic']])
