In [6]:
!pip install swifter
!pip install gensim
!pip install nltk
!pip install spacy
!pip install pandas
!pip install matplotlib
!pip install owlready2
!pip install vaderSentiment

Collecting owlready2
  Downloading owlready2-0.49.tar.gz (27.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: owlready2
  Building wheel for owlready2 (pyproject.toml) ... [?25l[?25hdone
  Created wheel for owlready2: filename=owlready2-0.49-py3-none-any.whl size=23742426 sha256=9694f4903ce90cd38609280dcad24e35483167bf75139cbdb52d682a853868fb
  Stored in directory: /root/.cache/pip/wheels/43/fe/dc/a0de3c289cfd5923ece6524469d328950e14fa0c90b1088ffa
Successfully built owlready2
Installing collected packages: owlready2
Successfully installed owlready2-0.49


In [17]:
import pandas as pd
import swifter
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import nltk

import data_loader
import preprocessor
import visualization
import modeling
import ontology_manager
import embeddings_handler

In [14]:
# Load the mini review dataset; lines=True parses the file as one JSON record per line.
MINI_REVIEWS_PATH = "/content/mini_hotel_reviews.json"
df = pd.read_json(MINI_REVIEWS_PATH, lines=True)

In [None]:
# # Cell 2: Load Data
# df = data_loader.load_and_create_mini_dataset(
#     file_path='hotel_reviews.json',
#     mini_file_path='mini_hotel_reviews.json'
# )

In [18]:
# Cell 3: Preprocessing (NER, POS, Cleaning)
# Only the plain cleaning pass is active. The NER / POS variants are kept
# commented out below so they can be switched back on for later experiments.
# `.swifter.apply` is used in place of a plain `.apply` on the text column.
# print("Applying NER, POS, and Cleaning...")
# df['ner_text'] = df['text'].swifter.apply(preprocessor.replace_entities)
# df['pos_text'] = df['text'].swifter.apply(preprocessor.replace_with_pos)

# Fetch the 'punkt_tab' NLTK resource before cleaning
# (presumably needed by preprocessor.preprocess_text — confirm).
nltk.download('punkt_tab')

# Result is stored as a new 'clean_tokens' column next to the raw 'text'.
review_texts = df['text']
df['clean_tokens'] = review_texts.swifter.apply(preprocessor.preprocess_text)
# df['clean_tokens_ner'] = df['ner_text'].swifter.apply(preprocessor.preprocess_text)
# df['clean_tokens_pos'] = df['pos_text'].swifter.apply(preprocessor.preprocess_text)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Pandas Apply:   0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
# # Cell 4: N-Grams creation
# df['bigrams'] = df['clean_tokens'].swifter.apply(lambda x: preprocessor.generate_ngrams(x, 2))
# df['trigrams'] = df['clean_tokens'].swifter.apply(lambda x: preprocessor.generate_ngrams(x, 3))

In [None]:
# # Cell 5: Stemming
# df['porter_stem'] = df['clean_tokens'].swifter.apply(preprocessor.get_porter_tokens)
# df['snowball_stem'] = df['clean_tokens'].swifter.apply(preprocessor.get_snowball_tokens)
# df['lancaster_stem'] = df['clean_tokens'].swifter.apply(preprocessor.get_lancaster_tokens)

In [20]:
# Define Target Class
# Binary label: 1 when the review has stars >= 4, otherwise 0.
# A vectorised comparison replaces the per-row Python lambda; a missing (NaN)
# rating compares False and therefore maps to 0, exactly as the lambda did.
df['class'] = (df['stars'] >= 4).astype(int)

In [None]:
# # Cell 6: Visualization (EDA)
# print("Visualizing Top Unigrams:")
# visualization.plot_top_ngrams(df['clean_tokens'], "Unigrams")

# print("Visualizing Top Bigrams:")
# visualization.plot_top_ngrams(df['bigrams'], "Bigrams", palette="plasma")

In [None]:
# # Cell 7: Feature Engineering (Vectorization)
# token_cols = ['clean_tokens', 'clean_tokens_ner', 'clean_tokens_pos',
#               'porter_stem', 'snowball_stem']
# ngram_settings = [(1, 1), (1, 2)] # reduced for speed, add (2,2) or (3,3) if needed

# vectorized_data = modeling.vectorize_all_datasets(df, token_cols, ngram_settings)

In [None]:
# # Cell 8: Model Training & GridSearch
# # Define models
# models_config = {
#     'DecisionTree': (DecisionTreeClassifier(random_state=42), {'max_depth': [10, 20]}),
#     'LogisticRegression': (LogisticRegression(max_iter=1000), {'C': [0.1, 1.0]}),
#     'MLP_NeuralNet': (MLPClassifier(max_iter=500), {'hidden_layer_sizes': [(50,)]})
# }

# modeling.run_model_experiments(vectorized_data, models_config)

In [None]:
# # Cell 9: Ontology Definition & Population
# ontology_manager.define_ontology()
# # We use head(50) to save time, remove .head(50) to do full dataset
# populated_onto = ontology_manager.populate_ontology(df.head(50))

In [None]:
# # Cell 10: Ontology Visualization
# dot_graph = ontology_manager.visualize_ontology_structure()
# dot_graph # This will display the graph in the notebook

In [25]:
import os
import zipfile
import urllib.request
import gzip
import shutil
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# SETUP: PRE-TRAINED MODEL DOWNLOADER
def download_file(url, filename):
    """Download ``url`` to ``filename`` unless ``filename`` already exists.

    The payload is first written to ``<filename>.part`` and only renamed to
    ``filename`` once the transfer has finished. An interrupted or failed
    download therefore never leaves a truncated file behind that a later run
    would mistake for a complete one and skip.

    Args:
        url: Location of the resource to fetch.
        filename: Destination path on the local filesystem.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises (e.g. ``URLError``,
        ``ContentTooShortError``); the partial file is removed before the
        exception propagates.
    """
    if os.path.exists(filename):
        print(f"Found {filename}, skipping download.")
        return

    partial = f"{filename}.part"
    print(f"Downloading {filename}...")
    try:
        urllib.request.urlretrieve(url, partial)
        # Atomic on the same filesystem: the final name only ever points at a
        # fully written file.
        os.replace(partial, filename)
    except BaseException:
        # BaseException on purpose: a KeyboardInterrupt (kernel "stop" button)
        # is the most common way a long download gets cut short.
        if os.path.exists(partial):
            os.remove(partial)
        raise
    print("Download complete!")

# EXPERIMENT SETUP
# Two registries, filled in by every phase of this cell:
#   embedding_datasets: experiment name -> result of train_test_split(X, y, ...)
#   handlers:           experiment name -> EmbeddingsHandler instance
embedding_datasets = dict()
handlers = dict()

# --- 1. LOCAL MODELS (Trained on YOUR Hotel Data) ---
print(f"\n{'=' * 60}")
print("PHASE 1: Local Training (Domain Specific)")
print("=" * 60)

# Local Word2Vec: train on the cleaned tokens, register the handler, then
# build the dataset and keep a stratified 70/30 split.
print("Training Local Word2Vec...")
h_w2v = embeddings_handler.EmbeddingsHandler()
h_w2v.train_word2vec(df['clean_tokens'], vector_size=100, min_count=2)
handlers['Local_W2V'] = h_w2v
X, y = modeling.prepare_embedding_dataset(df, 'clean_tokens', h_w2v)
embedding_datasets['Local_W2V'] = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Local FastText: same recipe with the FastText trainer.
# Note the handler is registered as 'Local_FT' while its dataset is stored
# under 'Local_FastText'.
print("Training Local FastText...")
h_ft = embeddings_handler.EmbeddingsHandler()
h_ft.train_fasttext(df['clean_tokens'], vector_size=100, min_count=2)
handlers['Local_FT'] = h_ft
X, y = modeling.prepare_embedding_dataset(df, 'clean_tokens', h_ft)
embedding_datasets['Local_FastText'] = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


# --- 2. PRE-TRAINED GLOVE (Wikipedia + Gigaword) ---
print("\n" + "="*60)
print("PHASE 2: Pre-trained GloVe (Wikipedia)")
print("="*60)

glove_zip = "glove.6B.zip"
glove_txt = "glove.6B.100d.txt"

# Fetch the archive only when the extracted vectors are missing: previously the
# zip was downloaded on every run where it was absent, even if the .txt it
# exists to provide was already on disk.
if not os.path.exists(glove_txt):
    download_file("http://nlp.stanford.edu/data/glove.6B.zip", glove_zip)
    print("Extracting GloVe...")
    with zipfile.ZipFile(glove_zip, 'r') as z:
        # Only the 100-dimensional file is pulled out of the archive.
        z.extract(glove_txt)

h_glove = embeddings_handler.EmbeddingsHandler()
h_glove.load_glove(glove_txt)
handlers['Pre_GloVe'] = h_glove

# Build the dataset from the GloVe handler and keep a stratified 70/30 split.
X, y = modeling.prepare_embedding_dataset(df, 'clean_tokens', h_glove)
embedding_datasets['Pre_GloVe_100d'] = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)


# --- 3. PRE-TRAINED WORD2VEC (Google News - 300 dimensions) ---
print(f"\n{'=' * 60}")
print("PHASE 3: Pre-trained Word2Vec (Google News 100 Billion Words)")
print("=" * 60)

# Large download (the original note puts it at 1.5GB); download_file skips it
# when gn_path is already present.
gn_path = "GoogleNews-vectors-negative300.bin"
gn_url = "https://figshare.com/ndownloader/files/10798046"
download_file(url=gn_url, filename=gn_path)

# The handler is populated by hand: the vectors are a binary word2vec file,
# so they are read with KeyedVectors directly and the dimensionality is set
# explicitly.
h_gn = embeddings_handler.EmbeddingsHandler()
print("Loading GoogleNews Model (This takes a moment)...")
h_gn.model = KeyedVectors.load_word2vec_format(gn_path, binary=True)
h_gn.vector_size = 300  # Google News is 300d
handlers['Pre_GoogleNews'] = h_gn

# Build the dataset from the GoogleNews handler and keep a stratified 70/30 split.
X, y = modeling.prepare_embedding_dataset(df, 'clean_tokens', h_gn)
embedding_datasets['Pre_W2V_GoogleNews'] = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


# --- 4. PRE-TRAINED FASTTEXT (Wiki News - 300 dimensions) ---
print("\n" + "="*60)
print("PHASE 4: Pre-trained FastText (Wiki News)")
print("="*60)

ft_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip"
ft_zip = "wiki-news-300d-1M.vec.zip"
ft_vec = "wiki-news-300d-1M.vec"

# Fetch the archive only when the extracted .vec file is missing: previously
# the zip was downloaded on every run where it was absent, even if the vectors
# it exists to provide were already on disk.
if not os.path.exists(ft_vec):
    download_file(ft_url, ft_zip)
    print("Extracting FastText...")
    with zipfile.ZipFile(ft_zip, 'r') as z:
        z.extract(ft_vec)

# The handler is populated by hand with the loaded vectors and their size.
h_pre_ft = embeddings_handler.EmbeddingsHandler()
print("Loading Pre-trained FastText...")
# Standard .vec format is text-based (binary=False)
h_pre_ft.model = KeyedVectors.load_word2vec_format(ft_vec, binary=False)
h_pre_ft.vector_size = 300
handlers['Pre_FastText'] = h_pre_ft

# Build the dataset from the FastText handler and keep a stratified 70/30 split.
X, y = modeling.prepare_embedding_dataset(df, 'clean_tokens', h_pre_ft)
embedding_datasets['Pre_FastText_Wiki'] = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)


# --- 5. SEMANTIC COMPARISON ---
# Print the 3 nearest neighbours of one probe word for every registered handler.
print("\n" + "="*60)
print("SEMANTIC COMPARISON: Local vs Global")
print("="*60)
test_word = "staff"
print(f"Neighbors for '{test_word}':")
for name, h in handlers.items():
    try:
        neighbors = h.find_similar_words(test_word, topn=3)
        # A None result is reported the same way as a missing word, as before.
        words = None if neighbors is None else [w[0] for w in neighbors]
    except KeyError:
        # Presumably what the underlying model raises for an unknown word — confirm.
        words = None
    except Exception as exc:
        # Still best-effort (the loop keeps going), but no longer mislabels
        # unrelated failures as a vocabulary miss, and — unlike the previous
        # bare `except:` — does not swallow KeyboardInterrupt.
        print(f"{name:>15}: lookup failed ({type(exc).__name__}: {exc})")
        continue

    if words is None:
        print(f"{name:>15}: Word not in vocabulary")
    else:
        print(f"{name:>15}: {words}")


PHASE 1: Local Training (Domain Specific)
Training Local Word2Vec...
Training Word2Vec (size=100) on 10000 documents...
Generating document vectors using clean_tokens...
Training Local FastText...
Training FastText (size=100) on 10000 documents...
Generating document vectors using clean_tokens...

PHASE 2: Pre-trained GloVe (Wikipedia)
Found glove.6B.zip, skipping download.
Loading GloVe model from glove.6B.100d.txt.w2v...
Generating document vectors using clean_tokens...

PHASE 3: Pre-trained Word2Vec (Google News 100 Billion Words)
Found GoogleNews-vectors-negative300.bin, skipping download.
Loading GoogleNews Model (This takes a moment)...
Generating document vectors using clean_tokens...

PHASE 4: Pre-trained FastText (Wiki News)
Found wiki-news-300d-1M.vec.zip, skipping download.
Loading Pre-trained FastText...
Generating document vectors using clean_tokens...

SEMANTIC COMPARISON: Local vs Global
Neighbors for 'staff':
      Local_W2V: ['helpful', 'employees', 'professional']
  

In [24]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

print(f"\n{'=' * 60}")
print("FINAL RESULTS: Training Multiple Classifiers")
print("=" * 60)

# Each entry maps a display name to (estimator, parameter grid).
# Insertion order is kept the same as before: LogReg, LinearSVC, MLP_Net,
# RandomForest.
models_emb_config = {}

# 1. Logistic Regression — grid over the regularisation strength C.
models_emb_config['LogReg'] = (
    LogisticRegression(max_iter=1000),
    {'C': [1.0, 10.0]},
)

# 2. Linear SVM — grid over C; seeded.
models_emb_config['LinearSVC'] = (
    LinearSVC(dual="auto", max_iter=2000, random_state=42),
    {'C': [0.1, 1.0]},
)

# 3. Neural network — compares one hidden layer (100) against two (100, 50); seeded.
models_emb_config['MLP_Net'] = (
    MLPClassifier(max_iter=500, random_state=42),
    {'hidden_layer_sizes': [(100,), (100, 50)]},
)

# 4. Random forest — 100 trees, grid over the maximum depth; seeded.
models_emb_config['RandomForest'] = (
    RandomForestClassifier(n_estimators=100, random_state=42),
    {'max_depth': [10, 20]},
)

# Run every model configuration against every embedding dataset.
modeling.run_model_experiments(embedding_datasets, models_emb_config)


FINAL RESULTS: Training Multiple Classifiers

MODEL: LogReg   |   DATASET: Local_W2V
Using GridSearchCV...
Best params: {'C': 10.0}

--- TEST SET Evaluation ---
              precision    recall  f1-score   support

Negative (0)     0.7722    0.6711    0.7181       894
Positive (1)     0.8677    0.9160    0.8912      2106

    accuracy                         0.8430      3000
   macro avg     0.8200    0.7935    0.8047      3000
weighted avg     0.8393    0.8430    0.8396      3000

Time: 9.91s

MODEL: LinearSVC   |   DATASET: Local_W2V
Using GridSearchCV...
Best params: {'C': 1.0}

--- TEST SET Evaluation ---
              precision    recall  f1-score   support

Negative (0)     0.7763    0.6600    0.7134       894
Positive (1)     0.8643    0.9193    0.8909      2106

    accuracy                         0.8420      3000
   macro avg     0.8203    0.7896    0.8022      3000
weighted avg     0.8381    0.8420    0.8380      3000

Time: 4.25s

MODEL: MLP_Net   |   DATASET: Local_W2V
U



Best params: {'hidden_layer_sizes': (100,)}

--- TEST SET Evaluation ---
              precision    recall  f1-score   support

Negative (0)     0.7317    0.6253    0.6743       894
Positive (1)     0.8502    0.9027    0.8756      2106

    accuracy                         0.8200      3000
   macro avg     0.7909    0.7640    0.7750      3000
weighted avg     0.8149    0.8200    0.8156      3000

Time: 233.33s

MODEL: RandomForest   |   DATASET: Local_W2V
Using GridSearchCV...
Best params: {'max_depth': 20}

--- TEST SET Evaluation ---
              precision    recall  f1-score   support

Negative (0)     0.7702    0.5213    0.6217       894
Positive (1)     0.8213    0.9340    0.8740      2106

    accuracy                         0.8110      3000
   macro avg     0.7958    0.7276    0.7479      3000
weighted avg     0.8061    0.8110    0.7988      3000

Time: 80.67s

MODEL: LogReg   |   DATASET: Local_FastText
Using GridSearchCV...
Best params: {'C': 10.0}

--- TEST SET Evaluation -



Best params: {'hidden_layer_sizes': (100,)}

--- TEST SET Evaluation ---
              precision    recall  f1-score   support

Negative (0)     0.8086    0.6096    0.6952       894
Positive (1)     0.8500    0.9387    0.8921      2106

    accuracy                         0.8407      3000
   macro avg     0.8293    0.7742    0.7937      3000
weighted avg     0.8376    0.8407    0.8334      3000

Time: 267.82s

MODEL: RandomForest   |   DATASET: Pre_GloVe_100d
Using GridSearchCV...
Best params: {'max_depth': 20}

--- TEST SET Evaluation ---
              precision    recall  f1-score   support

Negative (0)     0.8282    0.4206    0.5579       894
Positive (1)     0.7965    0.9630    0.8719      2106

    accuracy                         0.8013      3000
   macro avg     0.8124    0.6918    0.7149      3000
weighted avg     0.8060    0.8013    0.7783      3000

Time: 70.92s

MODEL: LogReg   |   DATASET: Pre_W2V_GoogleNews
Using GridSearchCV...
Best params: {'C': 10.0}

--- TEST SET Eva



Best params: {'hidden_layer_sizes': (100,)}

--- TEST SET Evaluation ---
              precision    recall  f1-score   support

Negative (0)     0.7790    0.7215    0.7491       894
Positive (1)     0.8854    0.9131    0.8990      2106

    accuracy                         0.8560      3000
   macro avg     0.8322    0.8173    0.8241      3000
weighted avg     0.8537    0.8560    0.8544      3000

Time: 451.50s

MODEL: RandomForest   |   DATASET: Pre_W2V_GoogleNews
Using GridSearchCV...
Best params: {'max_depth': 20}

--- TEST SET Evaluation ---
              precision    recall  f1-score   support

Negative (0)     0.8656    0.4754    0.6137       894
Positive (1)     0.8131    0.9687    0.8841      2106

    accuracy                         0.8217      3000
   macro avg     0.8393    0.7220    0.7489      3000
weighted avg     0.8287    0.8217    0.8035      3000

Time: 119.62s

MODEL: LogReg   |   DATASET: Pre_FastText_Wiki
Using GridSearchCV...
Best params: {'C': 10.0}

--- TEST SET



Best params: {'hidden_layer_sizes': (100,)}

--- TEST SET Evaluation ---
              precision    recall  f1-score   support

Negative (0)     0.7805    0.7360    0.7576       894
Positive (1)     0.8906    0.9122    0.9012      2106

    accuracy                         0.8597      3000
   macro avg     0.8356    0.8241    0.8294      3000
weighted avg     0.8578    0.8597    0.8584      3000

Time: 542.72s

MODEL: RandomForest   |   DATASET: Pre_FastText_Wiki
Using GridSearchCV...
Best params: {'max_depth': 20}

--- TEST SET Evaluation ---
              precision    recall  f1-score   support

Negative (0)     0.8661    0.4485    0.5910       894
Positive (1)     0.8057    0.9706    0.8805      2106

    accuracy                         0.8150      3000
   macro avg     0.8359    0.7096    0.7357      3000
weighted avg     0.8237    0.8150    0.7942      3000

Time: 122.97s
