## Inits

In [None]:
!pip install xgboost tqdm tensorflow



In [None]:
!rm glove.6B.zip*
!wget -O glove.6B.zip https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip glove.6B.zip

--2025-05-22 08:36:11--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2025-05-22 08:38:50 (5.18 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Activation, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.optimizers import AdamW
from sklearn.preprocessing import LabelBinarizer

from xgboost import XGBClassifier
import tempfile
import re
from tqdm import tqdm, trange
import os
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Classes

In [None]:
class TextPreprocessor:
    """Normalise the free-text column of a DataFrame before vectorisation.

    Cleaning lower-cases the text, deletes every character that is not a
    lowercase ASCII letter, a digit or whitespace, drops English stop words
    and lemmatises the remaining tokens.

    Parameters
    ----------
    text_column : str
        Name of the DataFrame column that holds the raw text.
    """

    # Matches everything except lowercase ASCII letters, digits and whitespace.
    _NON_ALNUM = re.compile(r"[^a-z0-9\s]")

    def __init__(self, text_column):
        self.text_column = text_column
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Return the cleaned, space-joined tokens of a single string.

        Characters are removed (not replaced by a space), so "well-known"
        becomes the single token "wellknown". Stop words are filtered after
        that removal and before lemmatisation.
        """
        stripped = self._NON_ALNUM.sub("", text.lower())
        return " ".join(
            self.lemmatizer.lemmatize(token)
            for token in stripped.split()
            if token not in self.stop_words
        )

    def process(self, df):
        """Return a copy of ``df`` whose text column has been cleaned.

        The input frame is not modified. Missing values in the text column
        become empty strings instead of being stringified into tokens such as
        "nan" or "none", which would otherwise leak into the features.
        """
        df = df.copy()
        column = df[self.text_column]
        # Stringify first, then blank out the rows that were missing originally.
        text = column.astype(str).where(column.notna(), "")
        df[self.text_column] = text.apply(self.clean_text)
        return df

class XGBoostTFIDFClassifier:
    """TF-IDF features fed into an XGBoost classifier.

    ``fit`` learns the vocabulary and trains the model; ``predict`` reuses the
    learned vocabulary to score new texts.
    """

    def __init__(self):
        # NOTE(review): 'logloss' is XGBoost's binary metric; for multi-class
        # targets 'mlogloss' is the usual choice — confirm this is intended.
        self.model = XGBClassifier(eval_metric='logloss')
        self.vectorizer = TfidfVectorizer()

    def _featurize(self, texts, learn_vocabulary=False):
        """Turn texts into a TF-IDF matrix, optionally (re)fitting the vectoriser."""
        if learn_vocabulary:
            encode = self.vectorizer.fit_transform
        else:
            encode = self.vectorizer.transform
        return encode(texts)

    def fit(self, X_train, y_train):
        """Fit the vectoriser on ``X_train`` and train the model on the result."""
        self.model.fit(self._featurize(X_train, learn_vocabulary=True), y_train)

    def predict(self, X_test):
        """Return the model's predictions for ``X_test``."""
        return self.model.predict(self._featurize(X_test))

class LSTMEmbeddingClassifier:
    """Bidirectional-LSTM text classifier whose embedding layer starts from GloVe vectors.

    Parameters
    ----------
    max_words : int
        Vocabulary cap passed to the tokenizer; also the embedding input dimension.
    max_len : int
        Length every token sequence is padded or truncated to.
    embed_dim : int
        Embedding width. Must equal the number of coefficients per token in
        the file at ``glove_path``.
    glove_path : str
        Path to a GloVe text file (a token followed by its coefficients on each line).
    epochs, batch_size, validation_split
        Training settings forwarded to ``model.fit``. The defaults are the
        values that used to be hard-coded.
    """

    def __init__(self, max_words=10000, max_len=200, embed_dim=200, glove_path='glove.6B.200d.txt',
                 epochs=50, batch_size=64, validation_split=0.2):
        self.max_words = max_words
        self.max_len = max_len
        self.embed_dim = embed_dim
        self.glove_path = glove_path
        self.epochs = epochs
        self.batch_size = batch_size
        self.validation_split = validation_split
        self.tokenizer = Tokenizer(num_words=self.max_words, oov_token="<OOV>")
        self.model = None
        self.lb = LabelBinarizer()
        self.embedding_matrix = None

    def load_glove_embeddings(self):
        """Build ``self.embedding_matrix`` from the GloVe file.

        The matrix has shape ``(max_words, embed_dim)``. Row ``i`` holds the
        vector of the tokenizer word with index ``i``; rows for words that are
        absent from the file (and row 0) stay zero. Words whose index is
        ``>= max_words`` are ignored.

        Raises
        ------
        ValueError
            If the file's vector width does not match ``embed_dim``.
        """
        # Only these rows exist in the matrix, so only these words need parsing.
        wanted = {
            word: idx
            for word, idx in self.tokenizer.word_index.items()
            if idx < self.max_words
        }
        embedding_matrix = np.zeros((self.max_words, self.embed_dim))

        width_checked = False
        with open(self.glove_path, encoding='utf8') as f:
            for line_no, line in enumerate(f, start=1):
                line = line.rstrip()
                if not line:
                    continue
                if not width_checked:
                    # Catches a file that is wider than embed_dim, which the
                    # right-split below could not detect on its own.
                    # Assumes the first token contains no spaces — TODO confirm
                    # for vector files other than glove.6B.
                    found = len(line.split()) - 1
                    if found != self.embed_dim:
                        raise ValueError(
                            f"{self.glove_path} holds {found}-dimensional vectors "
                            f"but embed_dim is {self.embed_dim}"
                        )
                    width_checked = True
                # Split from the right so a token that itself contains spaces
                # does not end up in the coefficient list.
                parts = line.rsplit(' ', self.embed_dim)
                if len(parts) != self.embed_dim + 1:
                    raise ValueError(
                        f"{self.glove_path}, line {line_no}: expected a token followed by "
                        f"{self.embed_dim} coefficients"
                    )
                row = wanted.get(parts[0])
                if row is not None:
                    embedding_matrix[row] = np.asarray(parts[1:], dtype='float32')

        self.embedding_matrix = embedding_matrix

    def _to_padded(self, texts):
        """Convert texts to integer sequences padded/truncated to ``max_len``."""
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=self.max_len)

    def _one_hot(self, y_train):
        """Fit the label binarizer and return one column per class."""
        y_encoded = self.lb.fit_transform(y_train)
        # For exactly two classes LabelBinarizer returns a single 0/1 column,
        # which would not match the two-unit softmax output layer.
        if len(self.lb.classes_) == 2 and y_encoded.shape[1] == 1:
            y_encoded = np.hstack([1 - y_encoded, y_encoded])
        return y_encoded

    def fit(self, X_train, y_train):
        """Tokenise ``X_train``, load GloVe vectors, then build and train the network.

        Training holds out ``validation_split`` of the data and uses early
        stopping plus learning-rate reduction, both monitoring ``val_loss``.
        """
        # Fresh tokenizer so refitting this instance does not accumulate
        # vocabulary statistics from an earlier fit.
        self.tokenizer = Tokenizer(num_words=self.max_words, oov_token="<OOV>")
        self.tokenizer.fit_on_texts(X_train)
        X_pad = self._to_padded(X_train)
        y_encoded = self._one_hot(y_train)

        self.load_glove_embeddings()

        self.model = Sequential([
            # Embedding starts from the GloVe matrix and is fine-tuned during training.
            Embedding(input_dim=self.max_words, output_dim=self.embed_dim,
                      weights=[self.embedding_matrix], trainable=True),
            Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.2)),
            Dropout(0.3),
            Bidirectional(LSTM(32, return_sequences=False, recurrent_dropout=0.2)),
            Dropout(0.5),
            Dense(64),
            BatchNormalization(),
            Activation('relu'),
            Dropout(0.5),
            Dense(len(self.lb.classes_), activation='softmax')
        ])

        optimizer = AdamW(learning_rate=3e-4, weight_decay=5e-4)
        self.model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        self.model.fit(
            X_pad, y_encoded,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_split=self.validation_split,
            callbacks=[
                EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True),
                ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)
            ],
            verbose=1
        )

    def predict(self, X_test):
        """Return the most probable class label for each text in ``X_test``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError("fit() must be called before predict()")
        preds = self.model.predict(self._to_padded(X_test), verbose=0)
        # Decode by arg-max so two-class and multi-class models share one path.
        return self.lb.classes_[np.argmax(preds, axis=1)]

class TextClassifierFramework:
    """Preprocess, split, train and evaluate one text classifier.

    Parameters
    ----------
    model_type : str
        ``'xgboost'`` or ``'lstm'``. The model itself is created by
        ``load_data``, which must be called before ``train`` or ``evaluate``.
    """

    _SUPPORTED_MODELS = ('xgboost', 'lstm')

    def __init__(self, model_type='xgboost'):
        self.model_type = model_type
        self.model = None
        self.text_column = None
        self.target_column = None
        self.preprocessor = None
        self.label_encoder = None
        self.y_train_str = None
        self.y_test_str = None

    def load_data(self, df, text_column, target_column, test_size=0.2, random_state=None):
        """Clean the text, make a stratified split and create the model.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``. The X values are the
            cleaned text with a reset index; the y values are the targets
            integer-encoded by a ``LabelEncoder`` fitted on the training
            labels only.

        Raises
        ------
        ValueError
            If ``model_type`` is not supported.
        """
        # Reject a bad model type up front rather than after cleaning the
        # whole dataset.
        if self.model_type not in self._SUPPORTED_MODELS:
            raise ValueError("Unsupported model type")

        self.text_column = text_column
        self.target_column = target_column
        self.preprocessor = TextPreprocessor(text_column)
        self.label_encoder = LabelEncoder()

        df = self.preprocessor.process(df)

        X_train, X_test, y_train_str, y_test_str = train_test_split(
            df[text_column], df[target_column],
            test_size=test_size, random_state=random_state, stratify=df[target_column]
        )

        # Keep the original labels so evaluation can report them.
        self.y_train_str = y_train_str.reset_index(drop=True)
        self.y_test_str = y_test_str.reset_index(drop=True)

        y_train = self.label_encoder.fit_transform(self.y_train_str)
        y_test = self.label_encoder.transform(self.y_test_str)

        if self.model_type == 'xgboost':
            self.model = XGBoostTFIDFClassifier()
        else:
            self.model = LSTMEmbeddingClassifier()

        return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train, y_test

    def _require_model(self, action):
        """Raise a clear error when ``action`` is attempted before ``load_data``."""
        if self.model is None:
            raise RuntimeError(f"load_data() must be called before {action}()")

    def train(self, X_train, y_train):
        """Fit the underlying model on the training split."""
        self._require_model("train")
        self.model.fit(X_train, y_train)

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and score against the true labels.

        Returns a dict with ``accuracy``, macro-averaged ``precision`` /
        ``recall`` / ``f1``, the full ``classification_report`` (as a dict)
        and the ``y_true`` / ``y_pred`` label lists.
        """
        self._require_model("evaluate")
        preds = self.model.predict(X_test)
        if isinstance(preds[0], str):
            # The model already returned original labels; compare with the
            # labels stored by load_data (``y_test`` is not used here).
            y_test_str = self.y_test_str
            preds_str = preds
        else:
            y_test_str = self.label_encoder.inverse_transform(y_test)
            preds_str = self.label_encoder.inverse_transform(preds)

        acc = accuracy_score(y_test_str, preds_str)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test_str, preds_str, average='macro', zero_division=0)
        report = classification_report(y_test_str, preds_str, output_dict=True, zero_division=0)
        return {
            'accuracy': acc,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'classification_report': report,
            'y_true': list(y_test_str),
            'y_pred': list(preds_str)
        }

class RepeatedEvaluator:
    """Train and evaluate the same model type several times on different splits.

    Parameters
    ----------
    df : DataFrame
        Full dataset; it is split again on every run.
    text_column, target_column : str
        Column names for the input text and the label.
    model_type : str
        Passed through to ``TextClassifierFramework``.
    runs : int
        Number of train/evaluate repetitions.
    """

    def __init__(self, df, text_column, target_column, model_type='xgboost', runs=10):
        self.df = df
        self.text_column = text_column
        self.target_column = target_column
        self.model_type = model_type
        self.runs = runs
        self.results = []

    def run(self):
        """Execute all runs and return the list of per-run evaluation dicts.

        The run index is used as the split's ``random_state``, so run ``k``
        always sees the same train/test partition. Results from a previous
        call are discarded, so calling ``run()`` again does not stack onto
        earlier results.
        """
        self.results = []
        for run in range(self.runs):
            print(f"Run {run + 1}/{self.runs}")
            clf = TextClassifierFramework(model_type=self.model_type)
            X_train, X_test, y_train, y_test = clf.load_data(
                self.df, self.text_column, self.target_column, random_state=run
            )
            clf.train(X_train, y_train)
            self.results.append(clf.evaluate(X_test, y_test))
        return self.results

## Import Dataset

In [None]:
shared_link = '*insert file sharing link from your local google drive, or import the csv another way*'

# Pull the file id out of either a ".../d/<id>/..." link (trailing slash optional)
# or a "...?id=<id>" link, and fail with a readable message otherwise.
id_match = re.search(r'/d/([^/?#]+)', shared_link) or re.search(r'[?&]id=([^&#]+)', shared_link)
if id_match is None:
    raise ValueError(
        "shared_link is not a recognisable Google Drive sharing link; "
        "replace the placeholder with your own link or load the CSV another way."
    )
file_id = id_match.group(1)
download_url = f'https://drive.google.com/uc?id={file_id}'
df = pd.read_csv(download_url)

## Running

In [None]:
# Repeated train/evaluate cycles of a single model family on the loaded dataset.
evaluator = RepeatedEvaluator(
    df, 'rawText', 'Reviewer_Location',
    model_type='xgboost',  # xgboost or lstm
    runs=1,
)
results = evaluator.run()

Run 1/1


## Results Analysis

In [None]:
# Collect each run's accuracy and report the average across runs.
accuracies = [run_metrics['accuracy'] for run_metrics in results]
print("Mean Accuracy:", np.mean(accuracies))

Mean Accuracy: 0.5055


In [None]:
# Optional: uncomment to pickle the raw per-run results for later analysis.
#with open("raw_xgboost.pkl", "wb") as f:
#    pickle.dump(results, f)