In [2]:
import praw   # Python Reddit API Wrapper
import pandas as pd
import datetime as dt
import time

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier

from sklearn.metrics import confusion_matrix

In [18]:
positive_df = pd.read_csv('data_pos_class.csv')
positive_df.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,category,class,timestamp
0,"6 Months, 150,000 Pieces, 200 LEDs Later... My...",1261,b31lgu,https://i.redd.it/18yfvmxul4n21.jpg,46,1553023000.0,,top_subreddit,1,2019-03-19 14:20:28
1,Sorry if it’s been done before but i thought i...,790,aqb5c7,https://i.redd.it/6fqwr1q4ceg21.jpg,40,1550090000.0,,top_subreddit,1,2019-02-13 14:34:38
2,Star Wars Battlefront 2 Vardos Tower WIP. Me f...,763,axcpto,https://i.redd.it/ryvf1n4786k21.jpg,44,1551735000.0,,top_subreddit,1,2019-03-04 15:34:59
3,Current version of all my Box MOCs!,764,asz3n8,https://i.redd.it/lyhhr4x2uuh21.jpg,47,1550726000.0,,top_subreddit,1,2019-02-20 23:07:54
4,I hope you guys can appreciate my venator MOC,718,az818i,https://i.redd.it/46z1vsuiq5l21.jpg,27,1552165000.0,,top_subreddit,1,2019-03-09 14:59:58


In [19]:
negative_df = pd.read_csv('data_neg_class.csv')
negative_df['class'] = negative_df['class'].map(lambda x:0)
negative_df.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,category,class,timestamp
0,Guardians of the Front Page,283477,5gn8ru,http://i.imgur.com/OOFRJvr.gifv,5024,1480960000.0,,top_subreddit,0,2016-12-05 11:41:14
1,"Thanks, Obama.",230830,5bx4bx,https://i.reddituploads.com/58986555f545487c9d...,6116,1478651000.0,,top_subreddit,0,2016-11-08 18:27:25
2,"I am Barack Obama, President of the United Sta...",216141,z1c9z,https://www.reddit.com/r/IAmA/comments/z1c9z/i...,23255,1346270000.0,"Hi, I’m Barack Obama, President of the United ...",top_subreddit,0,2012-08-29 15:01:36
3,"This is Shelia Fredrick, a flight attendant. S...",222814,5sfexx,https://i.reddituploads.com/d1e77b5c62694624ba...,4370,1486401000.0,,top_subreddit,0,2017-02-06 11:06:40
4,1 dad reflex 2 children,204183,5jrlw1,http://i.imgur.com/Rum0zSz.gifv,5674,1482426000.0,,top_subreddit,0,2016-12-22 10:57:35


In [20]:
df = pd.concat([positive_df, negative_df], axis=0)

In [21]:
df.shape

(13081, 10)

In [22]:
X = df['title']
y = df['class']

In [23]:
X.head()

0    6 Months, 150,000 Pieces, 200 LEDs Later... My...
1    Sorry if it’s been done before but i thought i...
2    Star Wars Battlefront 2 Vardos Tower WIP. Me f...
3                  Current version of all my Box MOCs!
4        I hope you guys can appreciate my venator MOC
Name: title, dtype: object

In [24]:
# our classes are balanced
y.value_counts()

0    6541
1    6540
Name: class, dtype: int64

In [25]:
# the baseline accuracy we desire is 
max(y.value_counts(normalize=True))

0.5000382233774177

In [26]:
# train/test split (before doing any transformations or cleaning of the data)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42,
                                                    stratify=y)

In [27]:
X_train.shape

(9810,)

In [46]:
# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

In [49]:
start_time = time.time()

ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
ada_params = {
    'n_estimators': [100],
    'base_estimator__max_depth': [2],
    'learning_rate': [1.]
}
gs_ada = GridSearchCV(ada, param_grid=ada_params, cv=5)
gs_ada.fit(df_train, y_train)
print(gs_ada.best_score_)
print(gs_ada.best_params_)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')



KeyboardInterrupt: 

In [None]:
gs_ada.score(df_train, y_train)

In [43]:
gs_ada.score(df_test, y_test)

0.8122898196270254

In [44]:
start_time = time.time()

ada1 = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
ada1_params = {
    'base_estimator__max_depth': [None, 2],
    'n_estimators': [25, 50],
    'learning_rate': [.7, .8, 1.0]
}
gs_ada1 = GridSearchCV(ada1, param_grid=ada1_params, cv=5)
gs_ada1.fit(df_train, y_train)
print(gs_ada1.best_score_)
print(gs_ada1.best_params_)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')


KeyboardInterrupt: 

In [None]:
gs_ada1.score(df_test, y_test)

In [None]:
start_time = time.time()

ada_lr = AdaBoostClassifier(base_estimator=LogisticRegression())
ada_lr_params = {
    'n_estimators': [50,100],
    'base_estimator__max_depth': [1,2],
    'learning_rate': [.9, 1.]
}
gs_ada_lr = GridSearchCV(ada, param_grid=ada_lr_params, cv=3)
gs_ada_lr.fit(df_train, y_train)
print(gs_ada_lr.best_score_)
print(gs_ada_lr.best_params_)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')



In [None]:
gs_ada_lr.score(df_test, y_test)

In [None]:
start_time = time.time()

lr = LogisticRegression()
lr_params = {
    
}

gs_lr = GridSearchCV(lr, param_grid=lr_params, cv=5)
gs_lr.fit(df_train, y_train)
print(gs_lr.best_score_)
print(gs_lr.best_params_)


end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')

Unnamed: 0,00,000,001,10,100,1000,105,11,12,120,...,zealand,zebra,zelda,zepher,zero,zion,zip,zombie,zone,zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
start_time = time.time()

knn_params = {
    'n_neighbors': [1,3,4,5,15,21],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()
gs_knn = GridSearchCV(
    knn, 
    knn_params, 
    #scoring = scorer, 
    verbose = 1,
    cv=3
)
gs_knn.fit(df_train, y_train)
print(gs_knn.best_score_)
print(gs_knn.best_params_)


end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')



In [58]:
start_time = time.time()

rf_params = {
    'n_estimators': [10, 20, 30, 50, 100],
    'max_depth': [None, 2, 3, 5, 10],
    'max_features': [2]#'auto', 2, 3, 4, 5, 6]
}

rf = RandomForestClassifier()
gs_rf = GridSearchCV(
    rf, 
    param_grid=rf_params, 
    #scoring = scorer, 
    verbose = 1
)
gs_rf.fit(df_train, y_train)
print(gs_rf.best_score_)
print(gs_rf.best_params_)


end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  3.3min finished


0.800407747196738
{'max_depth': None, 'max_features': 2, 'n_estimators': 100}
time: 237.061 seconds
time: 3 minutes, 57.061 seconds


In [59]:
gs_rf.score(df_train, y_train)

0.980428134556575

In [60]:
gs_rf.score(df_test, y_test)

0.8208498929990828

In [63]:
# We are using a bagging algorithm because our model is high variance.

# rf_params = {
#     'base_estimator__n_estimators': [100],
#     'base_estimator__max_depth': [None],
#     'base_estimator__max_features': [2]
# }

bag = BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=100, max_depth=None, max_features=2), 
                        n_estimators=2)
bag.fit(df_train, y_train)
print(bag.score(df_train, y_train))
bag.score(df_test, y_test)

0.9389398572884812


0.81381840415775

In [64]:
# We are using a bagging algorithm because our model is high variance.

# rf_params = {
#     'base_estimator__n_estimators': [100],
#     'base_estimator__max_depth': [None],
#     'base_estimator__max_features': [2]
# }

bag = BaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=100, max_depth=None, max_features=2), 
                        n_estimators=4)
bag.fit(df_train, y_train)
print(bag.score(df_train, y_train))
bag.score(df_test, y_test)

0.9559633027522936


0.8287985325588505

In [66]:
start_time = time.time()

# We are using a bagging algorithm because our model is high variance.

bag_params = {
    'n_estimators': [1],
    'max_depth': [1],
    'base_estimator': [None, RandomForestClassifier(n_estimators=100, max_depth=None, max_features=2)]
}

bag = BaggingClassifier()

gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(gs_bag.score(df_train, y_train))
print(gs_bag.score(df_test, y_test))

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.6min finished


0.9229357798165138
0.8132069703454601
time: 123.209 seconds
time: 2 minutes, 3.209 seconds


In [70]:
gs_bag.best_estimator_

BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=1, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [71]:
gs_bag.best_params_

{'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features=2, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False), 'n_estimators': 1}

In [73]:
start_time = time.time()

# We are using a bagging algorithm because our model is high variance.

bag_params = {
    'base_estimator__n_estimators': [100],
    'base_estimator__max_depth': [2, None],
    'base_estimator__max_features': [2]
#     'base_estimator': [None, RandomForestClassifier(n_estimators=100, max_depth=None, max_features=2)]
}

bag = BaggingClassifier(RandomForestClassifier())

gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(gs_bag.score(df_train, y_train))
print(gs_bag.score(df_test, y_test))

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  8.7min finished


0.9668705402650357
0.8318557016202996
time: 793.912 seconds
time: 13 minutes, 13.912 seconds


4/4/2019

In [27]:
import praw   # Python Reddit API Wrapper
import pandas as pd
import datetime as dt
import time

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 3000

Getting the data

In [28]:
# thank you for your function, Heather

def metrics(y_test, y_predict):
    print('Accuracy score %s ' % accuracy_score(y_test, y_predict), '\n')
    print('----------------------------------------------------------------')
    print(pd.DataFrame(confusion_matrix(y_test, y_predict), 
                            index=['Actually_Negative', 'Actually_Positive'], 
                            columns=['Predicted_Negative', 'Predicted_Positive']), '\n')
    print('-----------------------------------------------------------------')
    print(classification_report(y_test, y_predict))
    print('-----------------------------------------------------------------')

In [29]:
positive_df = pd.read_csv('data_pos_class.csv')
# positive_df.head()

negative_df = pd.read_csv('data_neg_class.csv')
negative_df['class'] = negative_df['class'].map(lambda x:0)
# negative_df.head()

df = pd.concat([positive_df, negative_df], axis=0)
df.shape

X = df['title']
y = df['class']

In [33]:
# This train test split uses a much smaller testing set.
# train/test split (before doing any transformations or cleaning of the data)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 42,
                                                    stratify = y,
                                                    test_size = 0.20)

In [34]:
X_train.shape

(10464,)

In [67]:
# ngrams = 1 or 2
# We use less max features, to see if we cannot reduce some of the variance
# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=4000, ngram_range=(1,2), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

In [36]:
start_time = time.time()

# We are using a bagging algorithm because our model is high variance.

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator__n_estimators': [100],
    'base_estimator__max_depth': [2, None],
    'base_estimator__max_features': [2]
#     'base_estimator': [None, RandomForestClassifier(n_estimators=100, max_depth=None, max_features=2)]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier(RandomForestClassifier())

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  8.6min finished


The training score: 0.9625382262996942
The testing score: 0.8318685517768437
Generating predictions
time: 802.774 seconds
time: 13 minutes, 22.774 seconds
Accuracy score 0.8318685517768437  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1192                 117
Actually_Positive                 323                 985 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.79      0.91      0.84      1309
           1       0.89      0.75      0.82      1308

   micro avg       0.83      0.83      0.83      2617
   macro avg       0.84      0.83      0.83      2617
weighted avg       0.84      0.83      0.83      2617

-----------------------------------------------------------------


In [96]:
X = df['title']
y = df['class']

In [97]:
# This train test split uses a much smaller testing set.
# train/test split (before doing any transformations or cleaning of the data)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 42,
                                                    stratify = y,
                                                    test_size = 0.20)

In [98]:
nb = MultinomialNB() 
model = nb.fit(df_train, y_train)


In [99]:
predictions = nb.predict(df_test)

In [100]:
nb.score(df_train, y_train)

0.8800649847094801

In [101]:
nb.score(df_test, y_test)


0.8429499426824608

In [102]:
metrics(y_test, predictions)

Accuracy score 0.8429499426824608  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1173                 136
Actually_Positive                 275                1033 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.90      0.85      1309
           1       0.88      0.79      0.83      1308

   micro avg       0.84      0.84      0.84      2617
   macro avg       0.85      0.84      0.84      2617
weighted avg       0.85      0.84      0.84      2617

-----------------------------------------------------------------


In [103]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 1173
False Positives: 136
False Negatives: 275
True Positives: 1033


In [63]:
for i in range(200):
    print(df_train.columns[200 + i])


alliance
allow
allowance
allowed
alpha
alright
alternate
alternate build
alternative
alternative build
ama
amazing
amazing lego
amazon
america
american
americans
anakin
ancient
android
angle
angry
animal
animals
animated
animation
anniversary
anniversary sets
anniversary slave
announced
announcement
announces
answer
anti
anybody
anybody know
anybody remember
anymore
ap
apart
apartment
apocalypse
apollo
apologize
app
apparently
appear
apple
appreciate
appreciated
appropriate
apr
april
april 2019
april fools
arab
arabia
arc
arcade
architecture
area
aren
arizona
arm
armed
armor
arms
army
arrest
arrested
arrived
arrived today
art
article
artist
artists
artwork
ask
asked
asking
asks
asleep
ass
assault
assault walker
asshole
astromech
ate
athene
athletes
atm
attached
attack
attacked
attempt
attempting
attention
attic
audience
august
aunt
australia
australians
autism
autumn
availability
available
avengers
avengers infinity
average
awaiting
awakens
awakens review
award
awards
awareness
away
aw

<font color = blue>
    ngrams = 1, 2, or 3

In [120]:
# This train test split uses a much smaller testing set.
# train/test split (before doing any transformations or cleaning of the data)
X3_train, X3_test, y3_train, y3_test = train_test_split(X, y,
                                                    random_state = 17,
                                                    stratify = y,
                                                    test_size = 0.20)

In [121]:
X3_train.shape

(10464,)

In [122]:
# ngrams = 1, 2, or3
# We use less max features, to see if we cannot reduce some of the variance
# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec3 = CountVectorizer(max_features=4000, ngram_range=(1,3), stop_words='english')

# training dataframe
df3_train = pd.DataFrame(cvec3.fit_transform(X3_train).toarray(),
                        columns=cvec3.get_feature_names())

# testing dataframe
df3_test = pd.DataFrame(cvec3.transform(X3_test).toarray(),
                      columns=cvec3.get_feature_names())

In [127]:
nb3 = MultinomialNB() 
model3 = nb3.fit(df3_train, y3_train)


In [128]:
predictions3 = nb3.predict(df3_test)

In [129]:
nb3.score(df3_train, y3_train)

0.8798738532110092

In [130]:
nb3.score(df3_test, y3_test)

0.853649216660298

In [131]:
# checking various random states (20 gives the closest scores between the training and testing sets)

for i in range(25):

    X3_train, X3_test, y3_train, y3_test = train_test_split(X, y,
                                                    random_state = i,
                                                    stratify = y,
                                                    test_size = 0.20)
    cvec3 = CountVectorizer(max_features=4000, ngram_range=(1,3), stop_words='english')

    # training dataframe
    df3_train = pd.DataFrame(cvec3.fit_transform(X3_train).toarray(),
                            columns=cvec3.get_feature_names())

    # testing dataframe
    df3_test = pd.DataFrame(cvec3.transform(X3_test).toarray(),
                          columns=cvec3.get_feature_names())
    
    nb3 = MultinomialNB() 
    model3 = nb3.fit(df3_train, y3_train)
    
    print(f'Random_state: {i}')
    print(f'Training accuracy: {nb3.score(df3_train, y3_train)}')
    print(f'Testing accuracy: {nb3.score(df3_test, y3_test)}')
    print()

Random_state: 0
Training accuracy: 0.8798738532110092
Testing accuracy: 0.834543370271303

Random_state: 1
Training accuracy: 0.8778669724770642
Testing accuracy: 0.8479174627435996

Random_state: 2
Training accuracy: 0.8791093272171254
Testing accuracy: 0.8528849828047382

Random_state: 3
Training accuracy: 0.8812117737003058
Testing accuracy: 0.8353076041268628

Random_state: 4
Training accuracy: 0.880447247706422
Testing accuracy: 0.8418035918991211

Random_state: 5
Training accuracy: 0.8776758409785933
Testing accuracy: 0.8356897210546427

Random_state: 6
Training accuracy: 0.8805428134556575
Testing accuracy: 0.8414214749713412

Random_state: 7
Training accuracy: 0.8817851681957186
Testing accuracy: 0.8234619793656859

Random_state: 8
Training accuracy: 0.8837920489296636
Testing accuracy: 0.8269010317157051

Random_state: 9
Training accuracy: 0.8774847094801224
Testing accuracy: 0.838746656476882

Random_state: 10
Training accuracy: 0.8807339449541285
Testing accuracy: 0.83225066

In [132]:
# checking various random states (28 gives the closest scores between the training and testing sets)

for i in range(25, 50):

    X3_train, X3_test, y3_train, y3_test = train_test_split(X, y,
                                                    random_state = i,
                                                    stratify = y,
                                                    test_size = 0.20)
    cvec3 = CountVectorizer(max_features=4000, ngram_range=(1,3), stop_words='english')

    # training dataframe
    df3_train = pd.DataFrame(cvec3.fit_transform(X3_train).toarray(),
                            columns=cvec3.get_feature_names())

    # testing dataframe
    df3_test = pd.DataFrame(cvec3.transform(X3_test).toarray(),
                          columns=cvec3.get_feature_names())
    
    nb3 = MultinomialNB() 
    model3 = nb3.fit(df3_train, y3_train)
    
    print(f'Random_state: {i}')
    print(f'Training accuracy: {nb3.score(df3_train, y3_train)}')
    print(f'Testing accuracy: {nb3.score(df3_test, y3_test)}')
    print()

Random_state: 25
Training accuracy: 0.878822629969419
Testing accuracy: 0.8482995796713795

Random_state: 26
Training accuracy: 0.8793960244648318
Testing accuracy: 0.834543370271303

Random_state: 27
Training accuracy: 0.8805428134556575
Testing accuracy: 0.8314864348490638

Random_state: 28
Training accuracy: 0.8766246177370031
Testing accuracy: 0.8593809705769966

Random_state: 29
Training accuracy: 0.8814984709480123
Testing accuracy: 0.8391287734046619

Random_state: 30
Training accuracy: 0.8831230886850153
Testing accuracy: 0.8398930072602216

Random_state: 31
Training accuracy: 0.8800649847094801
Testing accuracy: 0.8418035918991211

Random_state: 32
Training accuracy: 0.8783448012232415
Testing accuracy: 0.8494459304547192

Random_state: 33
Training accuracy: 0.8771980122324159
Testing accuracy: 0.8333970194879633

Random_state: 34
Training accuracy: 0.8812117737003058
Testing accuracy: 0.8318685517768437

Random_state: 35
Training accuracy: 0.8778669724770642
Testing accuracy:

In [139]:
# checking various random states (28 gives the closest scores between the training and testing sets)
start_time = time.time()


for i in range(0, 1000):
    if i % 20 == 0:
        print(f'Step {i} out of {len(range(0, 1000))}')
        print()
    
    X3_train, X3_test, y3_train, y3_test = train_test_split(X, y,
                                                    random_state = i,
                                                    stratify = y,
                                                    test_size = 0.20)
    cvec3 = CountVectorizer(max_features=4000, ngram_range=(1,3), stop_words='english')

    # training dataframe
    df3_train = pd.DataFrame(cvec3.fit_transform(X3_train).toarray(),
                            columns=cvec3.get_feature_names())

    # testing dataframe
    df3_test = pd.DataFrame(cvec3.transform(X3_test).toarray(),
                          columns=cvec3.get_feature_names())
    
    nb3 = MultinomialNB() 
    model3 = nb3.fit(df3_train, y3_train)
    
    if nb3.score(df3_test, y3_test) > 0.85:
        
        print(f'Random_state: {i}')
        print(f'Training accuracy: {nb3.score(df3_train, y3_train)}')
        print(f'Testing accuracy: {nb3.score(df3_test, y3_test)}')
        print()
    
end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)  

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')


Step 0 out of 1000

Random_state: 2
Training accuracy: 0.8791093272171254
Testing accuracy: 0.8528849828047382

Random_state: 17
Training accuracy: 0.8798738532110092
Testing accuracy: 0.853649216660298

Step 20 out of 1000

Random_state: 20
Training accuracy: 0.8784403669724771
Testing accuracy: 0.8540313335880779

Random_state: 21
Training accuracy: 0.8751911314984709
Testing accuracy: 0.8505922812380589

Random_state: 22
Training accuracy: 0.8759556574923547
Testing accuracy: 0.8521207489491784

Random_state: 28
Training accuracy: 0.8766246177370031
Testing accuracy: 0.8593809705769966

Step 40 out of 1000

Random_state: 53
Training accuracy: 0.8778669724770642
Testing accuracy: 0.850210164310279

Step 60 out of 1000

Step 80 out of 1000

Random_state: 85
Training accuracy: 0.8779625382262997
Testing accuracy: 0.8513565150936186

Random_state: 96
Training accuracy: 0.8773891437308868
Testing accuracy: 0.8517386320213985

Step 100 out of 1000

Step 120 out of 1000

Random_state: 124


In [165]:
# AdaBoost to try and eliminate some of the bias within the Bayes model

start_time = time.time()

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 26,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=4000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())


ada = AdaBoostClassifier()  # we are not limited to only decision trees with this model 'base_estimator' max_depth=1 is high bias
                            # we are not limited to the number of 

ada_params = {
    'base_estimator': [MultinomialNB(), RandomForestClassifier(), None],
    'n_estimators'  : [5],
    'learning_rate' : [1.0]
}

gs = GridSearchCV(ada, 
                  param_grid=ada_params, 
                  verbose = 1,
                  cv=5)
gs.fit(df_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

print(f'The training score: {gs.score(df_train, y_train)}')
print(f'The testing score: {gs.score(df_test, y_test)}')

preds = gs.predict(df_test)

metrics(y_test, preds)

tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')




0.7799120795107034
{'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'learning_rate': 1.0, 'n_estimators': 5}
The training score: 0.9765863914373089
The testing score: 0.7905999235766145
Accuracy score 0.7905999235766145  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1036                 273
Actually_Positive                 275                1033 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0    

In [1]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=4000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB()],
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs.best_params_}')
print(f'The best score from the grid seacrh: {gs.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


NameError: name 'time' is not defined

In [176]:
df_train.shape

(10464, 4000)

In [177]:
df_train.head()

Unnamed: 0,00,000,000 pieces,10,10 year,10 year old,10 years,100,100 000,1000,105,11,12,12 year,12 year old,120,13,14,15,150,16,16 years,17,18,19,1967,1980s,1989,1990,1996,1998,1999,1st,20,20 year,20 years,200,2000,2003,2005,2006,2008,2009,2010,2012,2013,2014,2015,2016,2017,2018,2018 lego,2019,2020,20th,20th anniversary,20th anniversary sets,21,22,23,24,25,250,26,27,28,2nd,30,300,32,34,35,3d,40,400,41,45,4th,50,5000,501st,55,59,5th,60,64,66,70,75,75105,75131,75155,75179,75181,75189,75190,75191,75192,75211,75222,76,80,800,80s,90,90s,90s kids,911,99,able,absolute,absolutely,access,accident,accidentally,according,account,accounts,accurate,accused,achieved,achievement,act,act scene,action,actor,actual,actually,ad,adam,adam west,add,added,addiction,adding,addition,addition collection,additions,admiral,adorable,adult,advanced,advent,advent calendar,adventures,advertising,advice,afford,afraid,african,age,ages,ago,agree,ahch,ahead,aid,ain,air,airlines,airplane,airport,al,alarm,alaska,album,album comments,alcohol,alderaan,alderaan playset,ali,alien,alive,alley,alliance,allow,allowance,allowed,alpha,alright,alternate,alternate build,alternative,ama,amazing,amazing lego,amazon,america,america 2016,american,americans,amigos,anakin,ancient,android,angeles,anger,angle,angry,animal,animals,animated,animation,anniversary,anniversary sets,anniversary slave,announce,announced,announcement,announces,anon,answer,anti,ants,anybody,anybody know,anybody remember,anymore,anyways,ap,apart,apartment,apocalypse,apocalyptic,apollo,apologize,app,apparently,appeal,appear,apple,appreciate,appreciated,approaches,appropriate,apr,apr 2019,april,april 2019,april fools,aquarium,arab,arabia,arc,arc 170,arcade,arcade machines,architect,architecture,area,aren,arizona,arm,armed,armor,arms,army,arrest,arrested,arrived,arrived today,art,article,artist,ask,asked,asking,asks,asleep,ass,assault,asshole,astromech,attack,attempt,attention,august,aunt,aurora,austin,australia,australian,autism,automatic,autumn,availability,available,avengers,avengers endgame,avengers infinity,avengers infinity war,average,awakens,awakens review,award,awards,awareness,away,awesome,awesome lego,awhile,aww,babies,baby,background,backlog,backwards,backyard,bacon,bad,bad ass,bad boy,bad quality,badass,bag,bags,bakery,balcony,ball,balls,ban,banana,banana scale,band,bang,bank,banksy,banned,bans,bar,barbie,barc,barc speeder,barcelona,bardstown,bardstown ky,barnes,barry,bars,base,base moc,baseball,based,basement,basically,basketball,bass,bat,batcave,bathroom,batman,batman movie,batmobile,battle,battle droid,battle pack,battle packs,battlefront,battlefront moc,battlepack,bay,bb,beach,bean,bear,beard,beast,beatles,beats,beautiful,beauty,bed,beer,beetle,begin,beginning,begins,behold,believe,bell,belonged,belongs,ben,bernie,bernie sanders,bespin,best,best friend,best friends,best lego,best picture,best place,best thing,best war,best war stream,best way,betrayal,betrayal cloud,betrayal cloud city,better,beware,bf,bible,bieber,big,big fan,big lego,bigger,biggest,bike,bike moc,bikes,biking,billion,billund,bin,bird,birds,birth,birthday,bit,bitch,black,black friday,black lion,blacktron,blade,blanket,bleeding,blind,block,blocks,blog,blood,blow,blue,board,boat,bob,boba,boba fett,boca,body,boi,bold,bomber,bonus,boobs,book,booked,books,boom,boost,boots,borderlands,borderlands box,borderlands box art,bored,boring,born,boss,boston,bot,bothered,bots,bottle,bought,bought new,bought set,bounty,bounty hunter,bounty hunters,bowl,box,boxes,boy,boyfriend,boys,brain,brand,brand new,brazil,bread,break,breakfast,breaking,breaking bad,brick,brickfilm,brickhead,brickheadz,bricklink,bricks,bricks lego,brickvault,bridge,bring,brings,british,bro,...,think ve,thinking,thinks,tho,thomas,thor,thorin,thought,thought cool,thought guys,thought like,thought post,thought share,thoughts,thoughts lego,thousand,thousands,thrawn,thread,threw,thrift,thrift store,thriller,throne,throne room,throw,throwing,tie,tie fighter,tie fighters,tie interceptor,ties,tifu,til,tiles,till,tim,tim goddard,time,time lapse,time lapse build,time picture,time posting,timelapse,timelapse build,times,tiny,tip,tips,tired,titanfall,titanic,title,tlj,today,today haul,todays,toilet,tokyo,told,tom,tomorrow,ton,tonight,tony,took,took ago,took days,took lot,took lot time,took phone,took photo,took picture,tool,tooth,torn,toronto,torso,total,totally,totally worth,touch,tough,tour,tournament,tower,towers,town,toy,toy store,toys,track,tractor,trade,traded,trading,traditional,traffic,trailer,train,training,trandish,trans,transformation,transformers,transforming,transport,trap,trapped,trash,travel,treat,treated,tree,tree planting,trees,trench,trench run,trend,tribute,tried,tried make,tries,trigger,trilogy,trip,triple,tron,troop,troop transport,trooper,trooper battle,trooper battle pack,troopers,troopers meet,troops,trouble,troy,tru,truck,true,true love,true story,truly,trump,trunk,trust,truth,try,try make,trying,tsm,tuesday,turkey,turn,turned,turning,turns,turret,turtle,tv,tweet,twitter,type,types,ucs,ucs falcon,ucs millenium,ucs millenium falcon,ucs millennium,ucs millennium falcon,ucs set,ucs sets,ucs slave,ucs wing,ugly,uk,ult,ultimate,ultimate collector,ultimate collector series,ultra,ultron,unboxing,uncle,understand,underwater,unfinished,unikitty,unique,unit,united,united states,universe,university,unknown,unless,unlimited,unlock,unlocked,unopened,unpopular,unpopular opinion,unsure,upcoming,update,updated,upgrade,upgraded,upper,upvote,upvotes,usa,usd,use,used,useless,user,users,uses,using,usual,usually,utah,v2,vacation,vader,vader bust,vader castle,vaders,valentine,valentine day,valley,valuable,value,valve,van,vancouver,ve,ve bought,ve got,ve heard,ve just,ve seen,ve taken,ve working,vegas,vehicle,vehicles,venator,venator moc,venom,version,vet,vhs,video,video game,video lego,videos,vietnam,view,views,village,vintage,violence,vip,vip points,virginia,virginia living,virginia living museum,virus,vision,visit,visited,visiting,voice,vol,voltron,volume,vonreg,vote,voted,voter,votes,voting,vs,vs new,vs tsm,vulture,wa,wait,wait build,waiting,waiting line,wake,wal,wal mart,walk,walked,walker,walkers,walking,walks,wall,wall trees,wallet,wallpaper,walls,walmart,wan,wan kenobi,wanna,want,want build,want make,wanted,wanted know,wanted share,wanting,wants,wants photo,war,war stream,war stream mega,ward,warfare,warning,warrior,wars,wars 2017,wars 20th,wars 20th anniversary,wars advent,wars advent calendar,wars best,wars best war,wars collection,wars complete,wars complete saga,wars darth,wars echo,wars echo squadron,wars episode,wars fans,wars force,wars force awakens,wars game,wars imperial,wars jedi,wars lego,wars lego collection,wars legos,wars rebels,wars resistance,wars set,wars sets,wars story,wars ucs,wars video,wash,washing,washington,wasn,watch,watched,watching,water,wave,waves,way,way home,wayne,wayne manor,ways,weapon,weapons,wear,wearing,weather,web,website,websites,wedding,wednesday,week,weekend,weeks,weeks ago,weight,weird,welcome,welp,went,werewolf,west,wet,whale,whales,wheel,wheelchair,wheels,whisky,whisky distillery,whisky distillery bardstown,white,white house,white whale,wide,wife,wife got,wifi,wild,willie,wily,win,wind,window,windows,wing,wing moc,wing starfighter,wings,wins,winter,wip,wish,wolves,woman,women,women nasa,won,wonder,wonderful,wondering,wood,wooden,word,words,work,work progress,worked,worker,workers,working,working lego,works,world,world war,worlds,worse,worst,worth,wouldn,wow,write,writing,written,wrong,wrote,wtf,ww2,wwii,www,xbox,xd,xi,xmas,xpost,ya,yard,yard sale,yavin,yeah,year,year ago,year old,year old boy,year old son,years,years ago,years later,years old,yellow,yes,yesterday,yo,yoda,yoda hut,york,young,younger,youtube,youtube channel,yr,yr old,zelda,zero,zombie,zone,zoo
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [178]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=5000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB()],
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs.best_params_}')
print(f'The best score from the grid seacrh: {gs.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.7min finished


The training score: 0.8656345565749235
The testing score: 0.8395108903324418
The best parameters from the grid search: {'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'learning_rate': 1.0, 'n_estimators': 5}
The best score from the grid seacrh: 0.7799120795107034
Generating predictions
time: 264.489 seconds
time: 4 minutes, 24.489 seconds
Accuracy score 0.8395108903324418  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1089                 220
Actually_Positive                 200    

In [1]:
import praw   # Python Reddit API Wrapper
import pandas as pd
import datetime as dt
import time

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 3000

In [2]:
# thank you for your function, Heather

def metrics(y_test, y_predict):
    print('Accuracy score %s ' % accuracy_score(y_test, y_predict), '\n')
    print('----------------------------------------------------------------')
    print(pd.DataFrame(confusion_matrix(y_test, y_predict), 
                            index=['Actually_Negative', 'Actually_Positive'], 
                            columns=['Predicted_Negative', 'Predicted_Positive']), '\n')
    print('-----------------------------------------------------------------')
    print(classification_report(y_test, y_predict))
    print('-----------------------------------------------------------------')

In [4]:
positive_df = pd.read_csv('data_pos_class.csv')
# positive_df.head()

negative_df = pd.read_csv('data_neg_class.csv')
negative_df['class'] = negative_df['class'].map(lambda x:0)
# negative_df.head()

df = pd.concat([positive_df, negative_df], axis=0)
df.shape

X = df['title']
y = df['class']

<font color = blue size = 4.5>
    We are attempting to tighten the variance.  We are of the opinion that we are only playing with the different parameters to see what happens.  This is really good practice for shaping me mind, and helping me learn how the models behave differently in certain circumstances.  In this model, the bag returned a model that is predicting slightly more false negatives, but also less false positives.  But, the overall accuracy is about the same as above.  The biggest takeaway difference thought is that the GridSearch decided that a better score was possible (a more accurate model) when it took bootstrapped over the features, taking only half of them at each construction.
</font>

In [5]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=5000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB(), None, LogisticRegression()],
    'n_estimators'  : [10],
    'max_features'  : [0.5, 1.0]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 22.3min finished


The training score: 0.8767201834862385
The testing score: 0.8467711119602599


NameError: name 'gs' is not defined

In [6]:
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)


The best parameters from the grid search: {'base_estimator': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 'max_features': 0.5, 'n_estimators': 10}
The best score from the grid seacrh: 0.8297018348623854
Generating predictions


In [7]:
metrics(y_test, preds)


Accuracy score 0.8467711119602599  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1162                 147
Actually_Positive                 254                1054 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.82      0.89      0.85      1309
           1       0.88      0.81      0.84      1308

   micro avg       0.85      0.85      0.85      2617
   macro avg       0.85      0.85      0.85      2617
weighted avg       0.85      0.85      0.85      2617

-----------------------------------------------------------------


<font color = blue size = 4.5>
    In the following we decided to increase the number of models being constructed at each step, and try bootstrapping over the features with a few other proportions.
</font>


In [8]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=5000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB(), None, LogisticRegression()],
    'n_estimators'  : [10],
    'max_features'  : [0.4, 0.5, 0.6]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 21.4min finished


The training score: 0.8814984709480123
The testing score: 0.8414214749713412
The best parameters from the grid search: {'base_estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False), 'max_features': 0.5, 'n_estimators': 10}
The best score from the grid seacrh: 0.8249235474006116
Generating predictions
time: 1302.789 seconds
time: 21 minutes, 42.789 seconds
Accuracy score 0.8414214749713412  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1221                  88
Actually_Positive                 327                 981 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.7

In [9]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=5000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB(), None, LogisticRegression()],
    'n_estimators'  : [20],
    'max_features'  : [0.5, 1]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 17.0min finished


The training score: 0.8891437308868502
The testing score: 0.8471532288880398
The best parameters from the grid search: {'base_estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False), 'max_features': 0.5, 'n_estimators': 20}
The best score from the grid seacrh: 0.829224006116208
Generating predictions
time: 1064.001 seconds
time: 17 minutes, 44.001 seconds
Accuracy score 0.8471532288880398  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1221                  88
Actually_Positive                 312                 996 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.80

In [11]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=5000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB(), None, LogisticRegression()],
    'n_estimators'  : [30],
    'max_features'  : [1.0]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 41.1min finished


The training score: 0.9011850152905199
The testing score: 0.8418035918991211
The best parameters from the grid search: {'base_estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False), 'max_features': 1.0, 'n_estimators': 30}
The best score from the grid seacrh: 0.8276949541284404
Generating predictions
time: 2590.725 seconds
time: 43 minutes, 10.725 seconds
Accuracy score 0.8418035918991211  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1221                  88
Actually_Positive                 326                 982 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.7

<font color = blue size = 4.5>
    Next, we test the 4 models against each other to see which returns the best score.
</font>

In [12]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=5000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB(), None, LogisticRegression()],
    'n_estimators'  : [1],
    'max_features'  : [1.0]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  1.5min finished


The training score: 0.8563646788990825
The testing score: 0.8234619793656859
The best parameters from the grid search: {'base_estimator': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 'max_features': 1.0, 'n_estimators': 1}
The best score from the grid seacrh: 0.8156536697247706
Generating predictions
time: 95.754 seconds
time: 1 minutes, 35.754 seconds
Accuracy score 0.8234619793656859  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1060                 249
Actually_Positive                 213                1095 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      1309
           1       0.81      0.84      0.83      1308

   micro avg       0.82      0.82      0.82      2617
   macro avg       0.82      0.82      0.82      2617
weighted avg  

<font color = blue size = 4.5>
    Here we increase the number for max_features in count vectorizer (the number of tokens) to 10,000, just to see if we can get our accuracy up a bit.
</font>

In [13]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=10_000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB(), None, LogisticRegression()],
    'n_estimators'  : [1],
    'max_features'  : [1.0]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  2.8min finished


The training score: 0.8757645259938838
The testing score: 0.8288116163546045
The best parameters from the grid search: {'base_estimator': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 'max_features': 1.0, 'n_estimators': 1}
The best score from the grid seacrh: 0.8220565749235474
Generating predictions
time: 174.949 seconds
time: 2 minutes, 54.949 seconds
Accuracy score 0.8288116163546045  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1051                 258
Actually_Positive                 190                1118 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.85      0.80      0.82      1309
           1       0.81      0.85      0.83      1308

   micro avg       0.83      0.83      0.83      2617
   macro avg       0.83      0.83      0.83      2617
weighted avg 

<font color = blue size = 4.5>
    Here, we increase the number of features again, expecting MultinomialNB to win out again, but with higher variance.
</font>

In [14]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=15_000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB(), None, LogisticRegression()],
    'n_estimators'  : [1],
    'max_features'  : [1.0]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  3.8min finished


The training score: 0.891724006116208
The testing score: 0.8418035918991211
The best parameters from the grid search: {'base_estimator': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 'max_features': 1.0, 'n_estimators': 1}
The best score from the grid seacrh: 0.8243501529051988
Generating predictions
time: 244.026 seconds
time: 4 minutes, 4.026 seconds
Accuracy score 0.8418035918991211  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1118                 191
Actually_Positive                 223                1085 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      1309
           1       0.85      0.83      0.84      1308

   micro avg       0.84      0.84      0.84      2617
   macro avg       0.84      0.84      0.84      2617
weighted avg   

<font color = blue size = 4.5>
    Here, we increase the max_features in the tokenizer (CountVectorizer) to see if we can continue to increase our accuracy.
</font>

In [15]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=20_000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB(), None, LogisticRegression()],
    'n_estimators'  : [1],
    'max_features'  : [1.0]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  5.0min finished


The training score: 0.8941131498470948
The testing score: 0.8391287734046619
The best parameters from the grid search: {'base_estimator': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 'max_features': 1.0, 'n_estimators': 1}
The best score from the grid seacrh: 0.8237767584097859
Generating predictions
time: 321.214 seconds
time: 5 minutes, 21.214 seconds
Accuracy score 0.8391287734046619  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1108                 201
Actually_Positive                 220                1088 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      1309
           1       0.84      0.83      0.84      1308

   micro avg       0.84      0.84      0.84      2617
   macro avg       0.84      0.84      0.84      2617
weighted avg 

<font color = blue size = 4.5>
    Sadly, our accuracy didn't increase any, and in fact, the score on the testing set decreased, causing our variance to increase.  Now, we will attempt to increase the number of ngrams, to see what effect this has on the model.
</font>

<font color = blue size = 4.5>
    Realizing that many phraises are common amoung lego builders, we increase the number of ngrams in an attempt to tune our model and increase accuracy even more.
</font>

In [16]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=20_000, ngram_range=(1,4), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier(), MultinomialNB(), None, LogisticRegression()],
    'n_estimators'  : [1],
    'max_features'  : [1.0]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  5.2min finished


The training score: 0.8846521406727829
The testing score: 0.8261367978601452
The best parameters from the grid search: {'base_estimator': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True), 'max_features': 1.0, 'n_estimators': 1}
The best score from the grid seacrh: 0.8136467889908257
Generating predictions
time: 332.999 seconds
time: 5 minutes, 32.999 seconds
Accuracy score 0.8261367978601452  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1026                 283
Actually_Positive                 172                1136 

-----------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.86      0.78      0.82      1309
           1       0.80      0.87      0.83      1308

   micro avg       0.83      0.83      0.83      2617
   macro avg       0.83      0.83      0.83      2617
weighted avg 

<font color = blue size = 4.5>
    We choose to scale back the amount of data being trained on to 15_000 features, with at most 3-grams.  Here we will try and tighten the variance by actually ensembling.  We will set n_estimators = 30, and let the bag run again, understanding and expecting this to take a while.  Again, we are looking for a tighter fitting model, one that has less variance.
</font>

In [18]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=15_000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [RandomForestClassifier()],#, MultinomialNB(), None, LogisticRegression()],
    'n_estimators'  : [30],
    'max_features'  : [1.0]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 16.7min finished


The training score: 0.9624426605504587
The testing score: 0.8162017577378677
The best parameters from the grid search: {'base_estimator': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'max_features': 1.0, 'n_estimators': 30}
The best score from the grid seacrh: 0.8044724770642202
Generating predictions
time: 1566.657 seconds
time: 26 minutes, 6.657 seconds
Accuracy score 0.8162017577378677  

----------------------------------------------------------------
                   Predicted_Negative  Predicted_Positive
Actually_Negative                1125                 184
Actually_Positive                 297   

<font color = blue size = 4.5>
    Let's try the previous calculation using the Naive Bayes.
</font>

In [19]:
start_time = time.time()

import warnings
warnings.filterwarnings('ignore')

# We are using a bagging algorithm because our model is high variance.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 28,
                                                    stratify = y,
                                                    test_size = 0.20)

# Let's instantiate a CountVectorizor, and build a model to see what we get.
cvec = CountVectorizer(max_features=15_000, ngram_range=(1,3), stop_words='english')

# training dataframe
df_train = pd.DataFrame(cvec.fit_transform(X_train).toarray(),
                        columns=cvec.get_feature_names())

# testing dataframe
df_test = pd.DataFrame(cvec.transform(X_test).toarray(),
                      columns=cvec.get_feature_names())

print('Outlining the bagging parameters.')
bag_params = {
    'base_estimator': [LogisticRegression()],
    'n_estimators'  : [50],
    'max_features'  : [1.0]
}

print(f'Instantiate a baggin model')
bag = BaggingClassifier()

print('Instantiating a grid search')
gs_bag = GridSearchCV(bag,
                      param_grid=bag_params,
                      verbose = 1)

gs_bag.fit(df_train, y_train)
print(f'The training score: {gs_bag.score(df_train, y_train)}')
print(f'The testing score: {gs_bag.score(df_test, y_test)}')
print(f'The best parameters from the grid search: {gs_bag.best_params_}')
print(f'The best score from the grid seacrh: {gs_bag.best_score_}')

print('Generating predictions')
preds = gs_bag.predict(df_test)

end_time = round(time.time() - start_time, 3)
print(f'time: {end_time} seconds')
end_time_minutes = int(end_time/ 60)
end_time_seconds = round(end_time % 60, 3)

print(f'time: {end_time_minutes} minutes, {end_time_seconds} seconds')
metrics(y_test, preds)


Outlining the bagging parameters.
Instantiate a baggin model
Instantiating a grid search
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 