# Model Prototyping

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
np.random.seed(103)

# Model Training

In [3]:
import spacy

In [4]:
nlp = spacy.load('en_core_web_md')

In [5]:
def tokenizer_lemma(sentence):
    return [token.lemma_ for token in nlp(sentence)]

In [6]:
def tokenizer(sentence):
    return sentence.split()

In [7]:
stop = list(spacy.lang.en.stop_words.STOP_WORDS)

## Load the Data

In [8]:
df = pd.read_csv("../datalake/feed/version_2020-04-04.csv")

In [9]:
df.head()

Unnamed: 0,_id,review_text,pruned_rating
0,A1ZRXGT8QJXGET,easily installed on my parlor guitar the oval ...,2
1,AOY459LVUBKLO,so far still made in america i have been using...,2
2,A34WEXT7SIRFE4,these cans were decent back when they were jus...,0
3,A2N7F3MVCTAOYP,i had purchased one a while back for my electr...,2
4,A1LH6RF4UN9VI6,i ve tried these out in head to head excuse th...,2


In [10]:
df.groupby(["pruned_rating"]).size()

pruned_rating
0    9015
1    9015
2    9015
dtype: int64

In [11]:
df.shape

(27045, 3)

In [12]:
# def train_test_split(input_df, pct):
#     train_rows = int(input_df.shape[0]*(1-pct))
#     x_train = df.loc[:train_rows, 'review_text'] #.values
#     y_train = df.loc[:train_rows, 'pruned_rating']#.values
#     x_test = df.loc[train_rows:, 'review_text']#.values
#     y_test = df.loc[train_rows:, 'pruned_rating']#.values
#     return x_train, y_train, x_test, y_test

# X_train, y_train, X_test, y_test = train_test_split(df, 0.3)

In [13]:
# y_train.value_counts()

In [14]:
def train_test_split(input_df, pct):
    train_rows = int(input_df.shape[0]*(1-pct))
    x_train = df.loc[:train_rows, 'review_text'].values
    y_train = df.loc[:train_rows, 'pruned_rating'].values
    x_test = df.loc[train_rows:, 'review_text'].values
    y_test = df.loc[train_rows:, 'pruned_rating'].values
    return x_train, y_train, x_test, y_test

In [15]:
X_train, y_train, X_test, y_test = train_test_split(df, 0.3)
# X_train, y_train, X_test, y_test = train_test_split(df, 0.95)

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.svm import SVC

In [17]:
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None
)

scorer = make_scorer(score_func=f1_score, greater_is_better=True, average="micro")

In [18]:
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid = [
    {'vect__ngram_range': [(1, 1),(1, 2)],
     'vect__stop_words': [stop, None],
     'vect__tokenizer': [tokenizer, tokenizer_lemma],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': param_range},
    {'vect__ngram_range': [(1, 1),(1, 2)],
     'vect__stop_words': [stop, None],
     'vect__tokenizer': [tokenizer, tokenizer_lemma],
     'vect__use_idf':[False],
     'vect__norm':[None],
     'clf__penalty': ['l1', 'l2'],
     'clf__C': param_range},
]

In [19]:
lr_tfidf = Pipeline(
    [('vect', tfidf),
     ('clf', LogisticRegression(random_state=1, solver='liblinear'))
    ]
)

In [20]:
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf, 
    param_grid=param_grid,
    scoring=scorer,
    cv=7,
    verbose=2,
    n_jobs=-1
)

In [21]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 7 folds for each of 192 candidates, totalling 1344 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 31.7min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 56.2min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 87.9min
[Parallel(n_jobs=-1)]: Done 1344 out of 1344 | elapsed: 120.2min finished


GridSearchCV(cv=7, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [22]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 100.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 2), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7f948660c710>} 
CV Accuracy: 0.985


In [48]:
clf = gs_lr_tfidf.best_estimator_
print('F1 Score: %.3f' % clf.score(X_test, y_test))

F1 Score: 0.988


In [49]:
gs_lr_tfidf.best_params_

{'clf__C': 100.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 2),
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer(sentence)>}

In [24]:
param_grid = [
    {'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
     'vect__tokenizer': [tokenizer, tokenizer_lemma],
     'vect__stop_words': [stop, None],
     'svc__C': param_range,
     'svc__kernel': ['linear']
    },
    {'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
     'vect__tokenizer': [tokenizer, tokenizer_lemma],
     'vect__stop_words': [stop, None],
     'svc__gamma': param_range,
     'svc__C': param_range,
     'svc__kernel': ['rbf']
    },
]


svc_tfidf = Pipeline([
    ("vect", tfidf), 
    ("svc", SVC(random_state=1))
])

In [25]:
# Construct our grid search object
gs_svc_tfidf = GridSearchCV(
    estimator=svc_tfidf, 
    param_grid=param_grid,
    scoring=scorer,
    cv=7, 
    verbose=2, 
    n_jobs=-1
)

In [26]:
gs_svc_tfidf.fit(X_train, y_train)

Fitting 7 folds for each of 336 candidates, totalling 2352 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 52.3min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 92.8min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 207.0min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 352.7min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 529.3min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 687.0min
[Parallel(n_jobs=-1)]: Done 2352 out of 2352 | elapsed: 788.0min finished


GridSearchCV(cv=7, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [27]:
print('Best parameter set: %s ' % gs_svc_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_svc_tfidf.best_score_)

Best parameter set: {'svc__C': 1.0, 'svc__gamma': 10.0, 'svc__kernel': 'rbf', 'vect__ngram_range': (1, 2), 'vect__stop_words': ['nine', 'whole', 'of', 'whatever', 'often', 'sixty', 'bottom', 'our', 'being', 'with', 'a', 'no', 'regarding', 'my', 'its', 'none', 'always', 'on', 'over', 'why', 'may', 'against', 'rather', 'less', 'another', 'enough', 'everyone', 'very', 'few', 'either', 'below', 'now', 'where', 'does', 'down', 'something', 'already', 'next', 'five', 'under', 'neither', 'one', 'via', 'will', 'you', 'six', 'otherwise', 'else', 'behind', 'somehow', 'alone', 'once', 'us', 'they', 'noone', 'beside', 'has', 'all', 'other', 'perhaps', 'unless', 'might', 'whence', 'me', 'herself', 'done', "'ll", 'onto', 'had', 'keep', 'same', 'seeming', 'take', 'three', 'amount', 'latter', 'or', 'n’t', 'became', 'while', 'any', 'everywhere', '‘m', 'more', 'made', 'are', 'mine', 'whether', '’m', 'amongst', 'nobody', 'hers', 'out', 'her', 'he', 'about', 'hereby', 'if', 'please', 'first', 'themselves'

In [50]:
svc = gs_svc_tfidf.best_estimator_
print('F1 Score: %.3f' % svc.score(X_test, y_test))

F1 Score: 1.000


# Test Code

#### Bag Of Words

In [29]:
about_text = ('Gus Proto was a Python developer while ago. But now going to be a rockstars')

In [30]:
about_doc = nlp(about_text)

In [31]:
list(about_doc.sents)

[Gus Proto was a Python developer while ago., But now going to be a rockstars]

In [32]:
def tokenizer(sentence):
    doc = nlp(sentence)
    return [token.lemma_ for token in doc]

In [33]:
tokenizer("A letter has been written, asking him to be released!")

['a',
 'letter',
 'have',
 'be',
 'write',
 ',',
 'ask',
 '-PRON-',
 'to',
 'be',
 'release',
 '!']