# Bag-of-Words Classifier Pipeline

In [37]:
import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import xgboost as xgb

RANDOM_STATE = 123

## Data prep

In [38]:
data_dir = '../data_reviews'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

print("Shape of data\n---------------")
print(f"x_train_df shape: {x_train_df.shape} ")
print(f"y_train_df shape: {y_train_df.shape} ")

# Get the text as a list of strings
x_train_text = x_train_df['text'].values
y_train = y_train_df['is_positive_sentiment'].values

Shape of data
---------------
x_train_df shape: (2400, 2) 
y_train_df shape: (2400, 1) 


## Dataset Exploration

In [39]:
num_train_samples = x_train_text.shape[0]
num_positive_train_samples = np.count_nonzero(y_train == 1)
fraction_positive_train = float(num_positive_train_samples) / float(num_train_samples)


print(f"Total number of training samples = {num_train_samples}")
print(f"Fraction positive training samples = {fraction_positive_train}")


Total number of training samples = 2400
Fraction positive training samples = 0.5


## Basic comparison of two vectorizers - one with counts and one using tfidf

In [29]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit_transform(x_train_text)
x_train_text_count = count_vectorizer.transform(x_train_text).toarray()
# count_vectorizer.vocabulary_

In [30]:
x_train_text_count.shape

(2400, 4255)

## Cross validation 

In [6]:
cv_splitter = sklearn.model_selection.StratifiedKFold(n_splits=5)
#cv_splitter.get_n_splits(X, y)

## XGBoost with BoW

In [10]:
from sklearn.model_selection import train_test_split
bow_X_train, bow_X_test, bow_y_train, bow_y_test = train_test_split(x_train_text_count, y_train, test_size=480, random_state=1234)

In [11]:
xgboost_tree = xgb.XGBClassifier()

In [13]:
xgboost_tree.fit(bow_X_train, bow_y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [17]:
# make predictions for test data
from sklearn.metrics import balanced_accuracy_score

y_pred = xgboost_tree.predict(bow_X_test)
predictions = [round(value) for value in y_pred]
accuracy = balanced_accuracy_score(bow_y_test, predictions)
accuracy

0.7677715238690849

## XGBoost with Word Embeddings

In [40]:
from collections import OrderedDict

zip_file_path = os.path.join('..',
    'pretrained_embedding_vectors/',
    'glove.6B.50d.txt.zip')

word_embeddings = pd.read_csv(
    zip_file_path,
    header=None, sep=' ', index_col=0,
    nrows=100000, compression='zip', encoding='utf-8', quoting=3)

# Build a dict that will map from string word to 50-dim vector
word_list = word_embeddings.index.values.tolist()
word2vec = OrderedDict(zip(word_list, word_embeddings.values))

In [41]:
import re
from sklearn.feature_extraction import text

x_train_embeddings = np.zeros((num_train_samples, 50))

for ind in range(num_train_samples):
    sample = x_train_text[ind]
    stripped_sample = re.sub(r'[^\w\s]', '', sample).lower()
    sample_embedding = []
    #print(f"Current sample = {stripped_sample}")
    for word in stripped_sample.split(' '):
        if word in word2vec.keys() and word not in text.ENGLISH_STOP_WORDS:
            sample_embedding.append(word2vec[word])
    
    if len(sample_embedding) == 0:
        sample_embedding = [0] * 50
        
    sample_embedding = np.array(sample_embedding)
    avg_sample_embedding = np.nanmean(sample_embedding, axis=0)
    x_train_embeddings[ind] = avg_sample_embedding
x_train_embeddings.shape
y_train.shape

(2400,)

In [42]:
emb_X_train, emb_X_test, emb_y_train, emb_y_test = train_test_split(x_train_embeddings, y_train, test_size=480, random_state=1234)

In [43]:
xgboost_tree_emb = xgb.XGBClassifier()

In [44]:
xgboost_tree_emb.fit(emb_X_train, emb_y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [46]:
 emb_X_test.shape

(480, 50)

In [47]:
emb_y_test.shape

(480,)

In [48]:
# make predictions for test data
y_pred = xgboost_tree_emb.predict(emb_X_test)
predictions = [round(value) for value in y_pred]
accuracy = balanced_accuracy_score(emb_y_test, predictions)
accuracy

0.7634980195955806