<h1><center>Netflix Data: Modeling</center></h1>


In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [8]:
netflix = pd.read_csv('../data/to-model_desc_cln-netflix')
netflix.head(2)

Unnamed: 0.1,Unnamed: 0,type,title,listed_in,description,description_cln
0,0,0,Norm of the North: King Sized Adventure,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...,planning awesome wedding grandfather polar bea...
1,1,0,Jandino: Whatever it Takes,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...,jandino asporaat riff challenge raising kid se...


## Prep for Model

In [76]:
#split X and y 
X = netflix['description_cln']
y = netflix['type']

In [10]:
y.value_counts(normalize = True)

0    0.683746
1    0.316254
Name: type, dtype: float64

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)

## CountVectorizer

In [12]:
#Instantiate 
#cvec = CountVectorizer()

In [74]:
#fit cvec on corpus
#cvec.fit(X_train)

In [14]:
#transforming corpus
#X_train = cvec.transform(X_train)

In [15]:

# Convert X_train into a DataFrame.

# X_train_df = pd.DataFrame(X_train.toarray(),
#                           columns=cvec.get_feature_names())
# X_train_df

Unnamed: 0,aamir,aaron,abad,abagnale,abandon,abandoned,abandoning,abandonment,abargil,abbey,...,zoe,zombie,zone,zordon,zorro,zoya,zuhu,zulu,zumbo,zurich
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4664,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4665,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4666,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4667,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# X_test = cvec.transform(X_test)
# X_test_df = pd.DataFrame(X_test.toarray(),
#                          columns=cvec.get_feature_names())

# X_test_df.head()

Unnamed: 0,aamir,aaron,abad,abagnale,abandon,abandoned,abandoning,abandonment,abargil,abbey,...,zoe,zombie,zone,zordon,zorro,zoya,zuhu,zulu,zumbo,zurich
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# print(CountVectorizer(stop_words = 'english').get_stop_words())

frozenset({'above', 'against', 'only', 'someone', 'former', 'twelve', 'serious', 'thence', 'therefore', 'whereupon', 'here', 'yourselves', 'detail', 'anything', 'four', 'him', 'mostly', 'beforehand', 'too', 'every', 'eleven', 'on', 'whenever', 'around', 'cant', 'ltd', 'hereby', 'co', 'beside', 'these', 'if', 'have', 'further', 'since', 'done', 'somehow', 'those', 'off', 'whereafter', 'across', 'perhaps', 'hers', 'sometime', 'which', 'almost', 'few', 'five', 'neither', 'yourself', 'while', 'up', 'what', 'interest', 'be', 'also', 'seeming', 'first', 'become', 'amount', 'give', 'well', 'everything', 'is', 'because', 'sincere', 'own', 'could', 'again', 'toward', 'through', 'eg', 'whence', 'else', 'themselves', 'fill', 'anyone', 'whom', 'third', 're', 'thereby', 'hereafter', 'others', 'found', 'a', 'several', 'after', 'from', 'less', 'one', 'before', 'con', 'will', 'thereupon', 'becoming', 'everyone', 'both', 'throughout', 'this', 'to', 'for', 'find', 'there', 'made', 'thick', 'forty', 'wha

In [None]:
#cvec = CountVectorizer(stop_words=['list', 'of', 'words', 'to', 'stop'])


##  Logit and cvec

In [88]:
# ctf = make_column_transformer(
#     (CountVectorizer(), ['description_cln']),
#     remainder='passthrough'
# )

In [102]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

In [103]:
params = {
      'cvec__ngram_range' : [(1, 1), (1,2), (1, 3)],
      'cvec__max_features': [500, 1000],
      'cvec__min_df' : [2, 3],
      'cvec__stop_words': ['english', None]
}

In [104]:
gs = GridSearchCV(pipe,
                 params, 
                 cv = 5)

In [105]:
gs.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'cvec__max_features': [500, 1000], 'cvec__min_df': [2, 3], 'cvec__stop_words': ['english', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [106]:
gs.best_score_

0.7192118226600985

In [107]:
gs_model = gs.best_estimator_

In [108]:
gs_model.score(X_train, y_train)


0.7800385521524952

In [109]:
gs_model.score(X_test, y_test)


0.7315350032113038

In [110]:
gs.best_params_


{'cvec__max_features': 500,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None}

## CountVec & MultinomialNB

In [137]:
pipe2 = Pipeline([
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB())
])


In [141]:
params2 = {
    'cvec__ngram_range' : [(1, 1), (1,2), (1, 3)],
    'cvec__max_features': [500, 1000],
    'cvec__min_df' : [2, 3],
    'cvec__stop_words': ['english', None],
    'nb__alpha' : [.5, 1, 5]
      
}

In [142]:
gs2 = GridSearchCV(estimator=pipe2, 
                  param_grid=params2,  
                  cv = 5)

In [143]:
gs2.fit(X_train, y_train)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'cvec__max_features': [500, 1000], 'cvec__min_df': [2, 3], 'cvec__stop_words': ['english', None], 'nb__alpha': [0.5, 1, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [144]:
gs2.best_score_

0.7254230027843221

In [145]:
gs2_model = gs2.best_estimator_


In [146]:
gs2_model.score(X_train, y_train)


0.786463910901692

In [147]:
gs2_model.score(X_test, y_test)


0.7257546563904945

In [148]:
gs2.best_params_


{'cvec__max_features': 1000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'nb__alpha': 5}