# Bag-of-Words Classifier Pipeline

In [18]:
import os
from collections import OrderedDict

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import xgboost as xgb

from utils import print_gridsearch_results, test_on_estimator, plot_cv_single_param

RANDOM_STATE = 123

## Data prep

In [25]:
data_dir = '../data_reviews'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

print("Shape of data\n---------------")
print(f"x_train_df shape: {x_train_df.shape} ")
print(f"y_train_df shape: {y_train_df.shape} ")

# Get the text as a list of strings
x_train_text = x_train_df['text'].values
y_train = y_train_df['is_positive_sentiment'].values

Shape of data
---------------
x_train_df shape: (2400, 2) 
y_train_df shape: (2400, 1) 


## Dataset Exploration

In [26]:
num_train_samples = x_train_text.shape[0]
num_positive_train_samples = np.count_nonzero(y_train == 1)
fraction_positive_train = float(num_positive_train_samples) / float(num_train_samples)


print(f"Total number of training samples = {num_train_samples}")
print(f"Fraction positive training samples = {fraction_positive_train}")


Total number of training samples = 2400
Fraction positive training samples = 0.5


## Basic comparison of two vectorizers - one with counts and one using tfidf

In [29]:
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit_transform(x_train_text)
x_train_text_count = count_vectorizer.transform(x_train_text).toarray()
# count_vectorizer.vocabulary_

In [30]:
x_train_text_count.shape

(2400, 4255)

## Cross validation 

In [14]:
cv_splitter = sklearn.model_selection.StratifiedKFold(n_splits=5)
#cv_splitter.get_n_splits(X, y)

## XGBoost with BoW

In [10]:
bow_X_train, bow_X_test, bow_y_train, bow_y_test = train_test_split(x_train_text_count, y_train, test_size=480, random_state=1234)

In [11]:
xgboost_tree = xgb.XGBClassifier()

In [13]:
xgboost_tree.fit(bow_X_train, bow_y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [17]:
# make predictions for test data
from sklearn.metrics import balanced_accuracy_score

y_pred = xgboost_tree.predict(bow_X_test)
predictions = [round(value) for value in y_pred]
accuracy = balanced_accuracy_score(bow_y_test, predictions)
accuracy

0.7677715238690849

## XGBoost with Word Embeddings

In [27]:


zip_file_path = os.path.join('..',
    'pretrained_embedding_vectors/',
    'glove.6B.50d.txt.zip')

word_embeddings = pd.read_csv(
    zip_file_path,
    header=None, sep=' ', index_col=0,
    nrows=100000, compression='zip', encoding='utf-8', quoting=3)

# Build a dict that will map from string word to 50-dim vector
word_list = word_embeddings.index.values.tolist()
word2vec = OrderedDict(zip(word_list, word_embeddings.values))

In [28]:
import re
from sklearn.feature_extraction import text

x_train_embeddings = np.zeros((num_train_samples, 50))

for ind in range(num_train_samples):
    sample = x_train_text[ind]
    stripped_sample = re.sub(r'[^\w\s]', '', sample).lower()
    sample_embedding = []
    #print(f"Current sample = {stripped_sample}")
    for word in stripped_sample.split(' '):
        if word in word2vec.keys() and word not in text.ENGLISH_STOP_WORDS:
            sample_embedding.append(word2vec[word])
    
    if len(sample_embedding) == 0:
        sample_embedding = [0] * 50
        
    sample_embedding = np.array(sample_embedding)
    avg_sample_embedding = np.nanmean(sample_embedding, axis=0)
    x_train_embeddings[ind] = avg_sample_embedding
x_train_embeddings.shape
y_train.shape

(2400,)

In [42]:
emb_X_train, emb_X_test, emb_y_train, emb_y_test = train_test_split(x_train_embeddings, y_train, test_size=480, random_state=1234)

In [43]:
xgboost_tree_emb = xgb.XGBClassifier()

In [44]:
xgboost_tree_emb.fit(emb_X_train, emb_y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [46]:
 emb_X_test.shape

(480, 50)

In [47]:
emb_y_test.shape

(480,)

In [48]:
# make predictions for test data
y_pred = xgboost_tree_emb.predict(emb_X_test)
predictions = [round(value) for value in y_pred]
accuracy = balanced_accuracy_score(emb_y_test, predictions)
accuracy

0.7634980195955806

# XGBoost with Pipeline

In [6]:
count_vectorizer = CountVectorizer(stop_words='english')

# Static Parameters based on earlier grid_search 
count_vectorizer.set_params(min_df = 2, ngram_range = (1,2))

# Set the tuple for the pipeline
count_pipeline_tuple = ("count", count_vectorizer)

In [7]:
tf_vectorizer = TfidfVectorizer(stop_words='english')

# Static Parameters based on earlier grid_search 
tf_vectorizer.set_params(min_df = 2, ngram_range = (1,2))

# Set the tuple for the pipeline
tf_pipeline_tuple = ("tf", tf_vectorizer)

In [31]:
xgb_classifier = xgb.XGBClassifier()
# XGBoost hyperparameter grid
xgb_parameters = {
    "xgb__max_depth": [3, 6, 12],
    "xgb__n_estimxators": [100,200, 500],
}
xgb_pipeline_tuple = ('xgb', xgb_classifier)

In [15]:
tf_xgb_pipeline = Pipeline([
    tf_pipeline_tuple,
    xgb_pipeline_tuple,
 ])
tf_xgb_full_grid = { 
#     **tf_parameters,
    **xgb_parameters
}


tf_xgb_grid_searcher = GridSearchCV(
    tf_xgb_pipeline, 
    tf_xgb_full_grid, 
    cv=cv_splitter, 
    n_jobs=4,  
    verbose=3,
    scoring='balanced_accuracy',
    return_train_score=True
)
tf_xgb_grid_searcher.fit(x_train_text, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   11.8s finished


Parameters: { n_estimxators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('tf',
                                        TfidfVectorizer(min_df=2,
                                                        ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_c...
       

In [20]:
print_gridsearch_results(tf_xgb_grid_searcher, list(tf_xgb_full_grid.keys()))

Dataframe has shape: (6, 22)
Number of trials used in grid search:  6


Unnamed: 0,param_xgb__max_depth,param_xgb__n_estimxators,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
2,6,100,0.870625,0.754167,0.954478,1
3,6,200,0.870625,0.754167,0.939046,1
0,3,100,0.833021,0.7525,0.645268,3
1,3,200,0.833021,0.7525,0.62444,3
4,12,100,0.910729,0.740417,1.35152,5
5,12,200,0.910729,0.740417,1.200302,5


In [21]:
count_xgb_pipeline = Pipeline([
    count_pipeline_tuple,
    xgb_pipeline_tuple,
 ])
count_xgb_full_grid = { 
#     **count_parameters,
    **xgb_parameters
}


count_xgb_grid_searcher = GridSearchCV(
    count_xgb_pipeline, 
    count_xgb_full_grid, 
    cv=cv_splitter, 
    n_jobs=4,  
    verbose=3,
    scoring='balanced_accuracy',
    return_train_score=True
)
count_xgb_grid_searcher.fit(x_train_text, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    6.9s finished


Parameters: { n_estimxators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('count',
                                        CountVectorizer(min_df=2,
                                                        ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interactio...
       

In [23]:
print_gridsearch_results(count_xgb_grid_searcher, list(count_xgb_full_grid.keys()))

Dataframe has shape: (6, 22)
Number of trials used in grid search:  6


Unnamed: 0,param_xgb__max_depth,param_xgb__n_estimxators,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
2,6,100,0.845625,0.76375,0.772862,1
3,6,200,0.845625,0.76375,0.66586,1
4,12,100,0.86875,0.755833,0.961692,3
5,12,200,0.86875,0.755833,0.86955,3
0,3,100,0.808333,0.750417,0.467756,5
1,3,200,0.808333,0.750417,0.49057,5


In [32]:
embedding_xgb_pipeline = Pipeline([
    #count_pipeline_tuple,
    xgb_pipeline_tuple,
 ])
embedding_xgb_full_grid = { 
#     **count_parameters,
    **xgb_parameters
}


embedding_xgb_grid_searcher = GridSearchCV(
    embedding_xgb_pipeline, 
    embedding_xgb_full_grid, 
    cv=cv_splitter, 
    n_jobs=4,  
    verbose=3,
    scoring='balanced_accuracy',
    return_train_score=True
)
embedding_xgb_grid_searcher.fit(x_train_embeddings, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   12.5s
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:   29.8s finished


Parameters: { n_estimxators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None,
                                                      max_depth=None,
                                           

In [33]:
print_gridsearch_results(embedding_xgb_grid_searcher, list(embedding_xgb_full_grid.keys()))

Dataframe has shape: (9, 22)
Number of trials used in grid search:  9


Unnamed: 0,param_xgb__max_depth,param_xgb__n_estimxators,mean_train_score,mean_test_score,mean_fit_time,rank_test_score
3,6,100,0.996354,0.744167,2.48011,1
4,6,200,0.996354,0.744167,3.020294,1
5,6,500,0.996354,0.744167,2.831381,1
6,12,100,0.996354,0.73875,3.555784,4
7,12,200,0.996354,0.73875,3.540635,4
8,12,500,0.996354,0.73875,3.126286,4
0,3,100,0.978437,0.73625,1.28203,7
1,3,200,0.978437,0.73625,1.258519,7
2,3,500,0.978437,0.73625,1.199993,7
