In [126]:
import feather
import xgboost
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import joblib
import re
import pickle
stop_words = set(stopwords.words('english'))

In [127]:
df_rev_balanced = feather.read_dataframe("../../data/balanced_reviews.feather")

In [43]:
df_rev_balanced.head()

Unnamed: 0,level_1,stars,review_id,user_id,business_id,useful,funny,cool,text,date,len
0,35,2,wOv7JiO0EGcJ61P2JcDS1g,en0xQXlvRk-ZtKlaW4I8eQ,N0apJkxIem2E8irTBRKnHw,0,0,0,Went here last weekend and was pretty disappoi...,2015-01-18 15:30:50,815
1,57,2,UIvFkXLBxWxM6aekvaCp8Q,0FMte0z-repSVWSJ_BaQTg,WsdmzI2giWHcRN2plprxIg,5,14,4,Took my kids here to hang out on one fine 72 d...,2013-05-28 21:06:57,2072
2,75,5,qdQIIf6xuyubxEG05e02TA,XPZVfP7DQCSL3Nb9t2vxsA,1HD5iUUfVJDbfEBIn9yVhw,11,8,11,Yes... the Boba Tea explosion is in full force...,2017-03-15 02:02:13,1012
3,100,1,h7Rmb3EiXjajVfGYNa5CuQ,oAOE4UAC5ZbAjEGBEMCb4g,PycR_Mr5jA9jB4Xg3nX0Yw,14,1,1,They charged me twice what I expected to pay. ...,2014-01-17 02:15:25,709
4,105,5,VRlBw70YHxje2n42IEOw6w,n5lEgdrkMlQd0_myn51j9g,01o6K5ID_vW8tXZ7QAzPJg,0,0,0,PRC consulted on 2 tile roof replacements at a...,2017-11-06 17:58:01,442


# Model Training Pipeline

In [45]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.field]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.field]]

In [46]:
import nltk
def Tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    porter_stemmer=nltk.PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [47]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('colext', TextSelector('text')),
            ('tfidf', TfidfVectorizer(tokenizer=Tokenizer, stop_words=stop_words,
                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300)), #for XGB
        ])),
        ('words', Pipeline([
            ('wordext', NumberSelector('len')),
            ('wscaler', StandardScaler()),
        ])),
    ])),
    ('clf', XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)),
])

In [48]:
X = df_rev_balanced[['text', 'len']]
Y = df_rev_balanced['stars'].apply(lambda x: 1 if x > 3 else 0) 

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=9)

In [50]:
model = classifier.fit(X_train, y_train)

  'stop_words.' % sorted(inconsistent))


In [52]:
predictions = model.predict(X_test)

In [118]:
print(f"Accuracy: {round((predictions == y_test).mean() * 100, 2)}%")

Accuracy: 83.57%


# Manual Testing

In [124]:
samples = ["I love my mother"]
import pandas as pd
sample_X = pd.DataFrame()
sample_X['text'] = samples
sample_X['len'] = [len(x.split()) for x in samples]
# sample_X.head()
["Positive" if prediction == 1 else "Negative" for prediction in model.predict(sample_X)]

['Positive']

# Export model

In [78]:
!mkdir -p ../../assets/sentiment_xgboost

In [79]:
joblib.dump(model, "../../assets/sentiment_xgboost/model.joblib.dat")

['../../assets/sentiment_xgboost/model.joblib.dat']

In [114]:
# Save the pipeline
pickle.dump(model, open("../../assets/sentiment_xgboost/model.pickle.dat", "wb"))

In [125]:
# Save the xgboost booster
classifier = model.steps[1][1]
classifier._Booster.save_model("../../assets/sentiment_xgboost/classifier.model")