In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import joblib

# Load the cleaned, labeled partitions from the CSV file
sampled_labeled_partitions = pd.read_csv('cleaned_labeled_partitions.csv')

# === Original teammate functions ===
def bag_of_words(texts):
    """Encode documents as raw term-count vectors.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        A ``(counts, vectorizer)`` tuple: the dense document-term count
        array and the fitted ``CountVectorizer`` that produced it.
    """
    count_model = CountVectorizer()
    # The fitted counts are densified before being handed back to the caller.
    dense_counts = count_model.fit_transform(texts).toarray()
    return dense_counts, count_model

def n_grams(texts, n=2):
    """Encode documents as counts of n-grams of exactly length ``n``.

    Args:
        texts: Iterable of raw document strings.
        n: Size of the n-grams to count; used as both the lower and upper
            bound of the vectorizer's ``ngram_range``.

    Returns:
        A ``(counts, vectorizer)`` tuple: the dense document/n-gram count
        array and the fitted ``CountVectorizer``.
    """
    exact_range = (n, n)
    gram_model = CountVectorizer(ngram_range=exact_range)
    dense_counts = gram_model.fit_transform(texts).toarray()
    return dense_counts, gram_model

def tfidf(texts):
    """Encode documents as TF-IDF weighted term vectors.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        A ``(weights, vectorizer)`` tuple: the dense TF-IDF array and the
        fitted ``TfidfVectorizer``.
    """
    weight_model = TfidfVectorizer()
    dense_weights = weight_model.fit_transform(texts).toarray()
    return dense_weights, weight_model

def lda_encoding(texts, n_topics=5):
    """Encode documents as LDA topic features.

    Term counts are computed with a ``CountVectorizer`` and then passed
    through a ``LatentDirichletAllocation`` model.

    Args:
        texts: Iterable of raw document strings.
        n_topics: Number of LDA components (topics) to fit.

    Returns:
        A ``(features, lda_model, vectorizer)`` tuple: the per-document
        topic features, the fitted LDA model, and the fitted count
        vectorizer that feeds it.
    """
    count_model = CountVectorizer()
    # Fixed random_state, as in every other seeded model in this file.
    topic_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)

    term_counts = count_model.fit_transform(texts)
    doc_topics = topic_model.fit_transform(term_counts)
    return doc_topics, topic_model, count_model

def word2vec_encoding(texts, vector_size=100, window=5, min_count=1, workers=4):
    """Encode documents as the mean of their Word2Vec word vectors.

    Each document is tokenized with ``simple_preprocess``, a Word2Vec model
    is trained on the tokenized corpus, and every document is represented by
    the average of the vectors of its in-vocabulary tokens.

    Args:
        texts: Iterable of raw document strings.
        vector_size: Dimensionality of the word (and document) vectors.
        window: Context window size passed to ``Word2Vec``.
        min_count: Minimum token frequency passed to ``Word2Vec``.
        workers: Number of training threads passed to ``Word2Vec``. The
            default of 4 matches the previous hard-coded value. Per the
            gensim documentation, a fixed seed only gives fully
            reproducible vectors with ``workers=1``.

    Returns:
        A ``(doc_vectors, model)`` tuple: an array with one row per
        document and ``vector_size`` columns, and the trained ``Word2Vec``
        model.
    """
    tokenized_texts = [simple_preprocess(doc) for doc in texts]
    model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window,
                     min_count=min_count, workers=workers, seed=42)
    word_vectors = model.wv

    # Documents with no in-vocabulary tokens fall back to a zero vector.
    # Give it the same dtype as the learned vectors so the returned array's
    # dtype does not depend on whether such a document happens to exist.
    empty_doc_vector = np.zeros(vector_size, dtype=word_vectors.vectors.dtype)

    doc_vectors = []
    for tokens in tokenized_texts:
        vectors = [word_vectors[word] for word in tokens if word in word_vectors]
        doc_vectors.append(np.mean(vectors, axis=0) if vectors else empty_doc_vector)
    return np.array(doc_vectors), model

# === Extract texts and labels ===
# sampled_labeled_partitions is the DataFrame loaded with pd.read_csv earlier
# in this cell.
# read_csv turns empty cells into NaN (a float), and the text encoders below
# expect every document to be a string, so missing texts are normalised to ''.
texts = sampled_labeled_partitions['text'].fillna('').astype(str).tolist()
labels = sampled_labeled_partitions['label']
# Integer code per distinct label value, in order of first appearance.
numeric_labels = pd.factorize(labels)[0]

# === Apply features ===
def _labeled_feature_frame(matrix, prefix, labels, numeric_labels):
    """Wrap a 2-D feature matrix in a DataFrame and append the label columns.

    Args:
        matrix: 2-D array with one row per document.
        prefix: Column-name prefix; columns are named ``f'{prefix}_{i}'``.
        labels: Original label values, one per document.
        numeric_labels: Integer-encoded labels, one per document.

    Returns:
        DataFrame holding the feature columns followed by ``'label'`` and
        ``'label_num'``.
    """
    columns = [f'{prefix}_{i}' for i in range(matrix.shape[1])]
    frame = pd.DataFrame(matrix, columns=columns)
    # Attach labels by position rather than by index alignment: the frame has
    # a fresh 0..n-1 index, so a label Series carrying any other index would
    # otherwise be misaligned and produce NaN labels.
    frame['label'] = pd.Series(labels).reset_index(drop=True)
    frame['label_num'] = numeric_labels
    return frame


# Bag of Words
bow_matrix, bow_vectorizer = bag_of_words(texts)
df_bow = _labeled_feature_frame(bow_matrix, 'bow', labels, numeric_labels)
df_bow.to_pickle('bow.pkl')
joblib.dump(bow_vectorizer, 'bow_vectorizer.pkl')

# TF-IDF
tfidf_matrix, tfidf_vectorizer = tfidf(texts)
df_tfidf = _labeled_feature_frame(tfidf_matrix, 'tfidf', labels, numeric_labels)
df_tfidf.to_pickle('tfidf.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# N-grams (bigrams)
ngram_matrix, ngram_vectorizer = n_grams(texts, n=2)
df_ngram = _labeled_feature_frame(ngram_matrix, 'ngram', labels, numeric_labels)
df_ngram.to_pickle('ngram.pkl')
joblib.dump(ngram_vectorizer, 'ngram_vectorizer.pkl')

# LDA (the count vectorizer is saved too, since the LDA model is fitted on its output)
lda_matrix, lda_model, lda_vectorizer = lda_encoding(texts, n_topics=5)
df_lda = _labeled_feature_frame(lda_matrix, 'lda_topic', labels, numeric_labels)
df_lda.to_pickle('lda.pkl')
joblib.dump(lda_model, 'lda_model.pkl')
joblib.dump(lda_vectorizer, 'lda_vectorizer.pkl')

# Word2Vec (saved with the model's own save method rather than joblib)
w2v_matrix, w2v_model = word2vec_encoding(texts, vector_size=100, window=5, min_count=1)
df_w2v = _labeled_feature_frame(w2v_matrix, 'w2v', labels, numeric_labels)
df_w2v.to_pickle('word2vec.pkl')
w2v_model.save('word2vec.model')

print("Feature engineering complete. All features saved.")


Feature engineering complete. All features saved.
