In [None]:
import os
import pickle
import re

import feather
import joblib
import xgboost
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
# English stop words from NLTK, later handed to the TF-IDF vectorizer in the pipeline.
# The NLTK "stopwords" corpus must already be downloaded for this lookup to succeed.
stop_words = set(stopwords.words('english'))

In [None]:
# Load the review dataset from its Feather file; the path is relative to the
# notebook's working directory. Later cells read its 'text', 'len' and 'stars' columns.
df_rev_balanced = feather.read_dataframe("../../data/balanced_reviews.feather")

In [None]:
# Preview the first rows to sanity-check the loaded frame.
df_rev_balanced.head()

# Model Training Pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that pulls a single column out of the input.

    ``transform`` returns ``X[field]`` (single-bracket indexing); when ``X`` is a
    pandas DataFrame that is a 1-D Series, the form a text vectorizer consumes.

    Parameters
    ----------
    field : hashable
        Key of the column to select.
    """

    def __init__(self, field):
        # Kept under the same name as the constructor argument so that
        # BaseEstimator's parameter introspection can find it.
        self.field = field

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.field]``."""
        selected = X[self.field]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects one column while keeping it 2-D.

    ``transform`` returns ``X[[field]]`` (list indexing); when ``X`` is a pandas
    DataFrame that is a one-column DataFrame, the shape scalers expect.

    Parameters
    ----------
    field : hashable
        Key of the column to select.
    """

    def __init__(self, field):
        # Kept under the same name as the constructor argument so that
        # BaseEstimator's parameter introspection can find it.
        self.field = field

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[[self.field]]``."""
        columns = [self.field]
        return X[columns]

In [None]:
import nltk
def Tokenizer(str_input):
    """Split a string into lower-cased, Porter-stemmed tokens.

    Every character other than ASCII letters, digits and '-' is treated as a
    separator; the remaining runs are lower-cased and stemmed.

    Parameters
    ----------
    str_input : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # Replace every disallowed character with a space, then split on whitespace.
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    tokens = cleaned.lower().split()
    stemmer = nltk.PorterStemmer()
    return list(map(stemmer.stem, tokens))

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# End-to-end sentiment pipeline: TF-IDF features from the 'text' column reduced
# with truncated SVD, joined with the standardized 'len' column, then classified
# with XGBoost.
classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('colext', TextSelector('text')),
            # stop_words is passed as a list: recent scikit-learn releases validate
            # this parameter and reject a set.
            # NOTE(review): Tokenizer stems tokens but the stop list is unstemmed, so
            # some stop words may not be filtered — confirm whether that is intended.
            ('tfidf', TfidfVectorizer(tokenizer=Tokenizer, stop_words=sorted(stop_words),
                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300)), #for XGB
        ])),
        ('words', Pipeline([
            ('wordext', NumberSelector('len')),
            ('wscaler', StandardScaler()),
        ])),
    ])),
    ('clf', XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)),
])

In [None]:
# Features: the review text together with its 'len' column.
feature_columns = ['text', 'len']
X = df_rev_balanced[feature_columns]
# Binary target: 1 when the rating is above 3 stars, otherwise 0.
Y = (df_rev_balanced['stars'] > 3).astype('int64')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=9)

In [None]:
# Fit the whole pipeline on the training split. Pipeline.fit returns the pipeline
# itself, so `model` and `classifier` refer to the same fitted object.
model = classifier.fit(X_train, y_train)

In [None]:
# Predict the 0/1 labels for the held-out rows.
predictions = model.predict(X_test)

In [None]:
# Accuracy on the held-out split: share of predictions equal to the true label, as a percentage.
print(f"Accuracy: {round((predictions == y_test).mean() * 100, 2)}%")

# Export model

In [None]:
# Create the export directory (and any missing parents); no error if it already exists.
# Done in Python rather than via a shell escape so the cell also runs where `mkdir -p`
# is unavailable.
os.makedirs("../../assets/sentiment_xgboost", exist_ok=True)

In [None]:
 Save the pipeline
joblib.dump(model, "../../assets/sentiment_xgboost/model.joblib.dat")

In [None]:
# Save the xgboost booster
# The classifier step is looked up by name and the booster is obtained through the
# public get_booster() accessor, instead of positional indexing and the private
# _Booster attribute.
# NOTE: this rebinds `classifier` from the full pipeline to its XGBClassifier step.
classifier = model.named_steps['clf']
classifier.get_booster().save_model("../../assets/sentiment_xgboost/classifier.model")

# Load Model

In [None]:
import joblib
# Reload the pipeline written by the export cell. joblib.load unpickles the file, so
# only load files you trust; TextSelector, NumberSelector and Tokenizer must already
# be defined in this session for the load to succeed.
model = joblib.load('../../assets/sentiment_xgboost/model.joblib.dat')

# Manual Testing

In [None]:
import pandas as pd

# Free-text examples to score with the reloaded pipeline.
samples = ["I love my mother"]

# Build the two columns the pipeline selects: the text and a whitespace word count as 'len'.
# NOTE(review): assumes the training data's 'len' is also a word count — confirm.
sample_X = pd.DataFrame({
    'text': samples,
    'len': [len(sample.split()) for sample in samples],
})

# Map the 0/1 predictions to readable labels (shown as the cell output).
["Positive" if label == 1 else "Negative" for label in model.predict(sample_X)]