In [None]:
# Install dependencies: pip install -r requirements.txt

In [None]:
import pandas as pd
import swifter
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import nltk

import data_loader
import preprocessor
import visualization
import modeling
import ontology_manager
import embeddings_handler

In [None]:
# df = pd.read_json("/content/mini_hotel_reviews.json", lines=True)
# Load the mini dataset through the project's data_loader helper.
# The path is relative to the notebook's working directory.
mini_reviews_path = "mini_hotel_reviews.json"
df = data_loader.load_mini_dataset(mini_reviews_path)

In [None]:
# # Cell 2: Load Data
# df = data_loader.load_and_create_mini_dataset(
#     file_path='hotel_reviews.json',
#     mini_file_path='mini_hotel_reviews.json'
# )

In [None]:
# Cell 3: Preprocessing (NER, POS, Cleaning)
# # Note: swifter speeds up the apply function
# print("Applying NER, POS, and Cleaning...")
# df['ner_text'] = df['text'].swifter.apply(preprocessor.replace_entities)
# df['pos_text'] = df['text'].swifter.apply(preprocessor.replace_with_pos)

# Make sure the NLTK 'punkt_tab' resource is available before preprocessing
# (presumably needed by preprocessor.preprocess_text -- TODO confirm).
nltk.download('punkt_tab')

# Apply the project's text preprocessing to every review via swifter's apply
# and store the result next to the raw text.
raw_reviews = df['text']
df['clean_tokens'] = raw_reviews.swifter.apply(preprocessor.preprocess_text)
# df['clean_tokens_ner'] = df['ner_text'].swifter.apply(preprocessor.preprocess_text)
# df['clean_tokens_pos'] = df['pos_text'].swifter.apply(preprocessor.preprocess_text)

In [None]:
# # Cell 4: N-Grams creation
# df['bigrams'] = df['clean_tokens'].swifter.apply(lambda x: preprocessor.generate_ngrams(x, 2))
# df['trigrams'] = df['clean_tokens'].swifter.apply(lambda x: preprocessor.generate_ngrams(x, 3))

In [None]:
# # Cell 5: Stemming
# df['porter_stem'] = df['clean_tokens'].swifter.apply(preprocessor.get_porter_tokens)
# df['snowball_stem'] = df['clean_tokens'].swifter.apply(preprocessor.get_snowball_tokens)
# df['lancaster_stem'] = df['clean_tokens'].swifter.apply(preprocessor.get_lancaster_tokens)

In [None]:
# Define Target Class
# df['class'] = df['stars'].apply(lambda x: 1 if x >= 4 else 0)
# Rebind df to the frame returned by the project helper. The commented-out
# line above suggests it labels reviews with stars >= 4 as class 1 and the rest
# as 0 -- TODO confirm against preprocessor.create_target_class.
df = preprocessor.create_target_class(df)

In [None]:
# # Cell 6: Visualization (EDA)
# print("Visualizing Top Unigrams:")
# visualization.plot_top_ngrams(df['clean_tokens'], "Unigrams")

# print("Visualizing Top Bigrams:")
# visualization.plot_top_ngrams(df['bigrams'], "Bigrams", palette="plasma")

In [None]:
# # Cell 7: Feature Engineering (Vectorization)
# token_cols = ['clean_tokens', 'clean_tokens_ner', 'clean_tokens_pos',
#               'porter_stem', 'snowball_stem']
# ngram_settings = [(1, 1), (1, 2)] # reduced for speed, add (2,2) or (3,3) if needed

# vectorized_data = modeling.vectorize_all_datasets(df, token_cols, ngram_settings)

In [None]:
# # Cell 8: Model Training & GridSearch
# # Define models
# models_config = {
#     'DecisionTree': (DecisionTreeClassifier(random_state=42), {'max_depth': [10, 20]}),
#     'LogisticRegression': (LogisticRegression(max_iter=1000), {'C': [0.1, 1.0]}),
#     'MLP_NeuralNet': (MLPClassifier(max_iter=500), {'hidden_layer_sizes': [(50,)]})
# }

# modeling.run_model_experiments(vectorized_data, models_config)

In [None]:
# # Cell 9: Ontology Definition & Population
# ontology_manager.define_ontology()
# # We use head(50) to save time, remove .head(50) to do full dataset
# populated_onto = ontology_manager.populate_ontology(df.head(50))

In [None]:
# # Cell 10: Ontology Visualization
# dot_graph = ontology_manager.visualize_ontology_structure()
# dot_graph # This will display the graph in the notebook

In [None]:
import os
import zipfile
import urllib.request
import gzip
import shutil
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# SETUP: PRE-TRAINED MODEL DOWNLOADER
def download_file(url, filename):
    """Download ``url`` to ``filename`` unless ``filename`` already exists.

    The payload is first written to ``<filename>.part`` and only renamed to
    ``filename`` once the transfer has finished. An interrupted or failed
    download therefore never leaves a truncated file under the final name,
    which a later run would otherwise treat as "already downloaded".

    Args:
        url: Location to fetch; anything ``urllib.request.urlretrieve`` accepts.
        filename: Destination path on disk.

    Returns:
        None.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``os.replace`` raise; the
        partial file is removed before the exception propagates.
    """
    if os.path.exists(filename):
        print(f"Found {filename}, skipping download.")
        return

    print(f"Downloading {filename}...")
    partial_path = f"{filename}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Rename only after a complete transfer so the final name always
        # refers to a whole file.
        os.replace(partial_path, filename)
    finally:
        # After a successful rename the partial file no longer exists; this
        # only cleans up when the transfer or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Download complete!")

# Registries filled in by the phases below:
#   embedding_datasets: label -> output of train_test_split
#   handlers:           label -> EmbeddingsHandler instance
embedding_datasets = dict()
handlers = dict()

# ---- Phase 1: embeddings trained on this notebook's own review tokens ----
print(
    "\n" + "=" * 60,
    "PHASE 1: Local Training (Domain Specific)",
    "=" * 60,
    sep="\n",
)

# Word2Vec trained on df['clean_tokens']
print("Training Local Word2Vec...")
h_w2v = embeddings_handler.EmbeddingsHandler()
h_w2v.train_word2vec(df['clean_tokens'], min_count=2, vector_size=100)
handlers['Local_W2V'] = h_w2v
X, y = modeling.prepare_embedding_dataset(df, 'clean_tokens', h_w2v)
embedding_datasets['Local_W2V'] = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

# FastText trained on df['clean_tokens']
print("Training Local FastText...")
h_ft = embeddings_handler.EmbeddingsHandler()
h_ft.train_fasttext(df['clean_tokens'], min_count=2, vector_size=100)
handlers['Local_FT'] = h_ft
X, y = modeling.prepare_embedding_dataset(df, 'clean_tokens', h_ft)
embedding_datasets['Local_FastText'] = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)


# --- 2. PRE-TRAINED GLOVE (Wikipedia + Gigaword) ---
print("\n" + "="*60)
print("PHASE 2: Pre-trained GloVe (Wikipedia)")
print("="*60)

# Fetch over HTTPS: the archive is unpacked and parsed below, so it should not
# travel over plain HTTP where it could be altered in transit.
download_file("https://nlp.stanford.edu/data/glove.6B.zip", "glove.6B.zip")
# Only the 100d vector file is used; unpack that single member, and only once.
if not os.path.exists("glove.6B.100d.txt"):
    with zipfile.ZipFile("glove.6B.zip", 'r') as z:
        z.extract("glove.6B.100d.txt")

# Load the extracted vectors through the project's handler and register it
# for the semantic comparison later in this cell.
h_glove = embeddings_handler.EmbeddingsHandler()
h_glove.load_glove("glove.6B.100d.txt")
handlers['Pre_GloVe'] = h_glove

# Build features/labels with the GloVe handler and store the stratified
# 70/30 split under its own key.
X, y = modeling.prepare_embedding_dataset(df, 'clean_tokens', h_glove)
embedding_datasets['Pre_GloVe_100d'] = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)


# ---- Phase 3: pre-trained Word2Vec (Google News, 300 dimensions) ----
print(
    "\n" + "=" * 60,
    "PHASE 3: Pre-trained Word2Vec (Google News 100 Billion Words)",
    "=" * 60,
    sep="\n",
)
# Large download (the original note puts it at about 1.5GB).
gn_url = "https://figshare.com/ndownloader/files/10798046"
gn_path = "GoogleNews-vectors-negative300.bin"

download_file(gn_url, gn_path)

h_gn = embeddings_handler.EmbeddingsHandler()
# The file is in the binary word2vec format, so the vectors are loaded with
# gensim directly and attached to the handler by hand.
print("Loading GoogleNews Model (This takes a moment)...")
h_gn.model = KeyedVectors.load_word2vec_format(gn_path, binary=True)
h_gn.vector_size = 300  # dimensionality of the Google News vectors
handlers['Pre_GoogleNews'] = h_gn

X, y = modeling.prepare_embedding_dataset(df, 'clean_tokens', h_gn)
embedding_datasets['Pre_W2V_GoogleNews'] = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)


# ---- Phase 4: pre-trained FastText (Wiki News, 300 dimensions) ----
print(
    "\n" + "=" * 60,
    "PHASE 4: Pre-trained FastText (Wiki News)",
    "=" * 60,
    sep="\n",
)

ft_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip"
ft_zip = "wiki-news-300d-1M.vec.zip"
ft_vec = "wiki-news-300d-1M.vec"

download_file(ft_url, ft_zip)
# Unpack the .vec member only when it has not been extracted before.
if not os.path.exists(ft_vec):
    print("Extracting FastText...")
    with zipfile.ZipFile(ft_zip, 'r') as archive:
        archive.extract(ft_vec)

h_pre_ft = embeddings_handler.EmbeddingsHandler()
print("Loading Pre-trained FastText...")
# .vec is the text flavour of the word2vec format, hence binary=False.
h_pre_ft.model = KeyedVectors.load_word2vec_format(ft_vec, binary=False)
h_pre_ft.vector_size = 300
handlers['Pre_FastText'] = h_pre_ft

X, y = modeling.prepare_embedding_dataset(df, 'clean_tokens', h_pre_ft)
embedding_datasets['Pre_FastText_Wiki'] = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)


# ---- Phase 5: nearest neighbours of one probe word in every embedding ----
print(
    "\n" + "=" * 60,
    "SEMANTIC COMPARISON: Local vs Global",
    "=" * 60,
    sep="\n",
)
test_word = "staff"
print(f"Neighbors for '{test_word}':")
# One line per registered handler, label right-aligned to 15 characters.
for label, handler in handlers.items():
    print(f"{label:>15}: {handler.get_semantic_neighbors(test_word, topn=3)}")

In [None]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

print(
    "\n" + "=" * 60,
    "FINAL RESULTS: Training Multiple Classifiers",
    "=" * 60,
    sep="\n",
)

# Classifier configurations come from the project's modeling module.
models_emb_config = modeling.get_embedding_models_config()

# Run every configured model against every embedding split collected above.
modeling.run_model_experiments(embedding_datasets, models_emb_config)