In [None]:
from google.colab import drive

# Mount Google Drive; the dataset and output folders used below live under this path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Restart runtime before running this cell for a fresh start

#!pip install gensim --force-reinstall
#!pip install scikit-learn joblib


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.3.0.post1-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Downloading gensim-4.3.



In [1]:
# Write the versions of all packages installed in this runtime to requirements.txt.
!pip freeze > requirements.txt

In [2]:
from google.colab import files

# Send requirements.txt from the runtime to the browser as a download.
requirements_file = "requirements.txt"
files.download(requirements_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#!pip install pyLDAvis


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import joblib
from sklearn.metrics.pairwise import cosine_similarity


# Load the Excel file
df = pd.read_excel('/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx')

# Extract the list of documents (texts) you want to use for LDA.
# fillna('') must come before astype(str): astype(str) alone converts missing
# cells into the literal text 'nan', which would then be tokenized as a word.
my_texts = df['Text'].fillna('').astype(str).tolist()  # make sure all texts are strings


In [None]:
import spacy
import en_core_web_sm

# Load the small English spaCy pipeline by its package name; `nlp` is used by
# extract_ner_features below.
nlp = spacy.load("en_core_web_sm")

In [None]:
# === Original functions ===
def bag_of_words(texts):
    """Fit a default ``CountVectorizer`` on ``texts``.

    Returns:
        ``(matrix, vectorizer)``: the document-term counts converted to a
        dense array, and the fitted vectorizer.
    """
    count_vectorizer = CountVectorizer()
    dense_counts = count_vectorizer.fit_transform(texts).toarray()
    return dense_counts, count_vectorizer

def n_grams(texts, n=2):
    """Fit a ``CountVectorizer`` restricted to word n-grams of exactly length ``n``.

    Returns:
        ``(matrix, vectorizer)``: the n-gram counts converted to a dense
        array, and the fitted vectorizer.
    """
    ngram_vectorizer = CountVectorizer(ngram_range=(n, n))
    dense_counts = ngram_vectorizer.fit_transform(texts).toarray()
    return dense_counts, ngram_vectorizer

def tfidf(texts):
    """Fit a default ``TfidfVectorizer`` on ``texts``.

    Returns:
        ``(matrix, vectorizer)``: the TF-IDF weights converted to a dense
        array, and the fitted vectorizer.
    """
    tfidf_vectorizer = TfidfVectorizer()
    dense_weights = tfidf_vectorizer.fit_transform(texts).toarray()
    return dense_weights, tfidf_vectorizer

def extract_ner_features(texts):
    """Count named-entity labels per document.

    Args:
        texts: Iterable of strings, processed with the module-level spaCy
            pipeline ``nlp``.

    Returns:
        A list with one dict per input text, in input order. Each dict maps
        an entity label found in that text to its number of occurrences;
        a text without entities yields an empty dict.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # nlp.pipe streams the texts through the pipeline in batches instead of
    # paying the per-call overhead of nlp(text) for every document, and
    # Counter tallies the labels in a single pass per document.
    return [
        dict(Counter(ent.label_ for ent in doc.ents))
        for doc in nlp.pipe(texts)
    ]

def word2vec_encoding(texts, vector_size=100, window=5, min_count=1):
    """Train Word2Vec on ``texts`` and embed each document as its mean word vector.

    Args:
        texts: Sequence of raw document strings; each is tokenized with
            ``simple_preprocess``.
        vector_size: Dimensionality of the word vectors.
        window: Context window size passed to ``Word2Vec``.
        min_count: Minimum token frequency passed to ``Word2Vec``.

    Returns:
        ``(doc_vectors, model)``: an array with one row per document (the
        mean of the vectors of its in-vocabulary tokens, or all zeros when the
        document has none), and the trained ``Word2Vec`` model.
    """
    tokenized_texts = [simple_preprocess(doc) for doc in texts]
    model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window,
                     min_count=min_count, workers=4, seed=42)
    word_vectors = model.wv
    # Use the model's own dtype for the fallback row. A default float64 zero
    # vector would silently upcast the whole result whenever at least one
    # document has no in-vocabulary tokens, making the output dtype data-dependent.
    empty_vector = np.zeros(vector_size, dtype=word_vectors.vectors.dtype)

    doc_vectors = []
    for tokens in tokenized_texts:
        vectors = [word_vectors[word] for word in tokens if word in word_vectors]
        doc_vectors.append(np.mean(vectors, axis=0) if vectors else empty_vector)
    return np.array(doc_vectors), model

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import numpy as np

def doc2vec_encoding(texts, vector_size=100, window=5, min_count=1, epochs=40):
    """Train a Doc2Vec model on ``texts`` and return the learned document vectors.

    Every document is tokenized with ``simple_preprocess`` and tagged with its
    position in ``texts`` (as a string).

    Returns:
        ``(doc_vectors, model)``: an array with one trained vector per
        document, in input order, and the trained ``Doc2Vec`` model.
    """
    # One string tag per document, reused for training and for the lookup below.
    tag_keys = [str(position) for position in range(len(texts))]
    training_corpus = []
    for tag, raw_text in zip(tag_keys, texts):
        training_corpus.append(TaggedDocument(words=simple_preprocess(raw_text), tags=[tag]))

    # Build the vocabulary, then train for the configured number of epochs.
    d2v = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count,
                  workers=4, seed=42, epochs=epochs)
    d2v.build_vocab(training_corpus)
    d2v.train(training_corpus, total_examples=d2v.corpus_count, epochs=d2v.epochs)

    # Collect the trained vector of each document via its tag.
    embeddings = np.array([d2v.dv[tag] for tag in tag_keys])
    return embeddings, d2v


In [None]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import CoherenceModel

def prepare_corpus(texts):
    """Whitespace-tokenize ``texts`` and build the gensim dictionary and BoW corpus.

    Returns:
        ``(tokenized_texts, dictionary, corpus)``: the token lists, the
        ``Dictionary`` built from them, and one ``doc2bow`` result per document.
    """
    token_lists = []
    for document in texts:
        # Plain whitespace split; swap in a richer tokenizer here if needed.
        token_lists.append(document.split())

    vocabulary = Dictionary(token_lists)
    bow_corpus = list(map(vocabulary.doc2bow, token_lists))
    return token_lists, vocabulary, bow_corpus

def evaluate_lda_coherence_gensim(texts, topic_range=range(4, 7)):
    """Score LDA models with different topic counts by ``c_v`` coherence.

    For each topic count in ``topic_range`` an ``LdaMulticore`` model is
    trained on the corpus produced by ``prepare_corpus`` and its ``c_v``
    coherence is computed.

    Returns:
        A list of ``(num_topics, coherence)`` tuples in ``topic_range`` order.
    """
    token_lists, vocabulary, bow_corpus = prepare_corpus(texts)

    def _coherence_for(num_topics):
        # Train one model with the fixed evaluation settings and score it.
        candidate = LdaMulticore(
            corpus=bow_corpus,
            id2word=vocabulary,
            num_topics=num_topics,
            random_state=42,
            passes=5,
            iterations=50,
            workers=4,  # Adjust this to your CPU cores
        )
        scorer = CoherenceModel(model=candidate, texts=token_lists,
                                dictionary=vocabulary, coherence='c_v')
        return scorer.get_coherence()

    return [(num_topics, _coherence_for(num_topics)) for num_topics in topic_range]

def lda_features_gensim(texts, n_topics, passes=20, iterations=100):
    """Train an LDA model and return per-document topic distributions.

    Args:
        texts: Iterable of document strings (tokenized by ``prepare_corpus``).
        n_topics: Number of topics to fit.
        passes: Number of passes over the corpus during training.
        iterations: Iteration limit passed to ``LdaMulticore``.

    Returns:
        ``(features, lda_model, dictionary)``: ``features`` is a list with one
        row per document, each row having exactly ``n_topics`` entries where
        entry ``k`` is the probability of topic ``k``; plus the trained model
        and the ``Dictionary`` used to build the corpus.
    """
    tokenized_texts, dictionary, corpus = prepare_corpus(texts)
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=n_topics,
        random_state=42,
        passes=passes,
        iterations=iterations,
        workers=4
    )
    features = []
    for bow in corpus:
        # get_document_topics returns a sparse list of (topic_id, prob) pairs.
        # Place each probability by its topic id into a fixed-length row so
        # that a topic left out of the sparse output (negligible probability)
        # can never produce a short or column-shifted feature row.
        row = [0.0] * n_topics
        for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
            row[topic_id] = prob
        features.append(row)
    return features, lda_model, dictionary


In [None]:
# Fit a 5-topic LDA model on the loaded texts, using the function's default
# passes/iterations, and keep the topic vectors, model and dictionary.
lda_vectors, lda_model, dictionary = lda_features_gensim(texts=my_texts, n_topics=5)

In [None]:
import pandas as pd
import numpy as np
import joblib
from gensim.models import Word2Vec
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary

# === Load dataset ===
df = pd.read_excel('/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx')
# fillna('') before astype(str): astype(str) alone turns missing cells into the
# literal text 'nan', which every vectorizer below would learn as a real token.
my_texts = df['Text'].fillna('').astype(str).tolist()
labels = df['Label']
# Integer code per distinct label, numbered in order of first appearance.
numeric_labels = pd.factorize(labels)[0]
# print(numeric_labels)

# === Apply and Save Feature Sets ===
def _save_feature_frame(matrix, prefix, path):
    """Wrap ``matrix`` in a labeled DataFrame and pickle it to ``path``.

    Columns are named ``{prefix}_{i}``. When the module-level ``labels`` is
    set, the ``label`` and ``label_num`` columns are appended from ``labels``
    and ``numeric_labels``. The frame is local to this helper, so it can be
    garbage-collected as soon as it has been written.
    """
    n_features = len(matrix[0])
    frame = pd.DataFrame(matrix, columns=[f'{prefix}_{i}' for i in range(n_features)])
    if labels is not None:
        frame['label'] = labels
        frame['label_num'] = numeric_labels
    frame.to_pickle(path)


def apply_and_save_features():
    """Build every feature set from ``my_texts`` and save it to Google Drive.

    For Bag of Words, TF-IDF, bigrams, LDA (5 topics), Word2Vec and Doc2Vec
    this writes a pickled feature DataFrame (with label columns) plus the
    fitted vectorizer or model into the STEP2 output folder, which is created
    if missing. Reads the module-level ``my_texts``, ``labels`` and
    ``numeric_labels``; returns nothing.
    """
    # Imported locally because the cell defining this function does not
    # import os itself; it previously relied on a later cell doing so.
    import os

    output_dir = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering'
    os.makedirs(output_dir, exist_ok=True)

    # 1. Bag of Words
    bow_matrix, bow_vectorizer = bag_of_words(my_texts)
    _save_feature_frame(bow_matrix, 'bow', f'{output_dir}/features_bow.pkl')
    joblib.dump(bow_vectorizer, f'{output_dir}/vectorizer_bow.pkl')
    del bow_matrix  # dense matrix is no longer needed; release it before the next stage

    # 2. TF-IDF
    tfidf_matrix, tfidf_vectorizer = tfidf(my_texts)
    _save_feature_frame(tfidf_matrix, 'tfidf', f'{output_dir}/features_tfidf.pkl')
    joblib.dump(tfidf_vectorizer, f'{output_dir}/vectorizer_tfidf.pkl')
    del tfidf_matrix

    # 3. N-Grams (bi-gram)
    ngram_matrix, ngram_vectorizer = n_grams(my_texts, n=2)
    _save_feature_frame(ngram_matrix, 'ngram2', f'{output_dir}/features_ngram2.pkl')
    joblib.dump(ngram_vectorizer, f'{output_dir}/vectorizer_ngram2.pkl')
    del ngram_matrix

    # 4. LDA
    lda_vectors, lda_model, dictionary = lda_features_gensim(my_texts, n_topics=5, passes=20, iterations=100)
    _save_feature_frame(lda_vectors, 'lda', f'{output_dir}/features_lda.pkl')
    lda_model.save(f'{output_dir}/lda_model_5topics.model')
    dictionary.save(f'{output_dir}/lda_dictionary.dict')

    # 5. Word2Vec
    w2v_matrix, w2v_model = word2vec_encoding(my_texts)
    _save_feature_frame(w2v_matrix, 'w2v', f'{output_dir}/features_word2vec.pkl')
    w2v_model.save(f'{output_dir}/word2vec.model')

    # 6. Doc2Vec
    d2v_matrix, d2v_model = doc2vec_encoding(my_texts)
    _save_feature_frame(d2v_matrix, 'd2v', f'{output_dir}/features_doc2vec.pkl')
    d2v_model.save(f'{output_dir}/doc2vec.model')



In [None]:
import os  # imported here because this cell runs before the cell that imports os

# Save the raw texts together with their labels next to the engineered features.
raw_output_dir = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering'
# The folder is otherwise only created later by apply_and_save_features(), so
# on a fresh Drive the pickle write below would fail without this.
os.makedirs(raw_output_dir, exist_ok=True)

pd.DataFrame({
    # fillna('') first so missing cells stay empty instead of becoming the text 'nan'.
    'text': df['Text'].fillna('').astype(str).tolist(),
    'label': df['Label'],
    'label_num': pd.factorize(labels)[0]
}).to_pickle(f'{raw_output_dir}/raw_labeled.pkl')


In [None]:
import os

# Build all feature sets and write them, with their fitted models, to Drive.
# `os` must be imported before this call: apply_and_save_features uses
# os.makedirs but its defining cell does not import os.
apply_and_save_features()

In [None]:
import os

# Verify that every artifact apply_and_save_features() is expected to write exists.
output_dir = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering'
# Named `expected_files` rather than `files` so this list does not overwrite the
# `google.colab.files` module imported earlier in the notebook; with the old
# name, re-running the download cell after this one would fail.
expected_files = [
    'features_bow.pkl', 'vectorizer_bow.pkl',
    'features_tfidf.pkl', 'vectorizer_tfidf.pkl',
    'features_ngram2.pkl', 'vectorizer_ngram2.pkl',
    'features_lda.pkl', 'lda_model_5topics.model', 'lda_dictionary.dict',
    'features_word2vec.pkl', 'word2vec.model',
    'features_doc2vec.pkl', 'doc2vec.model'
]

for filename in expected_files:
    full_path = os.path.join(output_dir, filename)
    print(f"{filename}: {' Found' if os.path.exists(full_path) else ' Missing'}")


features_bow.pkl:  Found
vectorizer_bow.pkl:  Found
features_tfidf.pkl:  Found
vectorizer_tfidf.pkl:  Found
features_ngram2.pkl:  Found
vectorizer_ngram2.pkl:  Found
features_lda.pkl:  Found
lda_model_5topics.model:  Found
lda_dictionary.dict:  Found
features_word2vec.pkl:  Found
word2vec.model:  Found
features_doc2vec.pkl:  Found
doc2vec.model:  Found


In [None]:
import pandas as pd
from scipy.sparse import save_npz
import joblib

# Load dataset with 'Text' column (your cleaned and preprocessed text)
df = pd.read_excel('/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx')
# Blank out missing cells, then coerce everything to str. The vectorizer was
# fitted on string-converted text, and transforming non-string cells (e.g.
# numbers) would fail when the vectorizer tries to preprocess them.
df['Text'] = df['Text'].fillna('').astype(str)

# Load existing vectorizer
vectorizer = joblib.load('/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/vectorizer_tfidf.pkl')

# Transform text data (do NOT fit again!) so the columns match the saved vocabulary
tfidf_matrix = vectorizer.transform(df['Text'])

# Save the sparse matrix
save_npz('/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering/tfidf_matrix.npz', tfidf_matrix)

print("TF-IDF matrix created and saved successfully.")


TF-IDF matrix created and saved successfully.
