In [32]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
return false;
}

<IPython.core.display.Javascript object>

In [125]:
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB       # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

In [126]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv('./data/data.csv')

In [77]:
df = pd.read_csv('./data/data.csv')

In [5]:
X = data['post']     # Setting the X and y
y = data['subreddit']

In [6]:
y.value_counts(normalize=True)   # Cheking base line 

0.0    0.505814
1.0    0.494186
Name: subreddit, dtype: float64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

### Using a pipeline with Multinomial NB and Count Vectorizer

In [56]:
pipeCVnb = Pipeline([           # Using pipeline with CountVectorizer and MultinomialNB
    ('cvect', CountVectorizer()),
    ('Mnb', MultinomialNB())
])

pipeCVnb_params = {         # Setting the parameters 
               'cvect__max_features': [150_000, 200_000], 
               'cvect__stop_words': [None, 'english'],
               'cvect__ngram_range': [(1,3),(1,2)] 
              }

# Using the GridSearchCV to find best parameters
gsCVnb = GridSearchCV(pipeCVnb, # What is the model we want to fit?
                  pipeCVnb_params, # What is the dictionary of hyperparameters?
                  cv=3, verbose=1)

gsCVnb.fit(X_train, y_train)   # Fitting the train data

print(f'Best Score for cv: {gsCVnb.best_score_}')   # Printing best score for CV

print(f'Best Parameters: {gsCVnb.best_params_}') # Checking best parameters for CV

gsCVnb_model = gsCVnb.best_estimator_     # Setting our best model

print(f'Train score: {gsCVnb_model.score(X_train, y_train)}')    # Scoring train data on our best model 

print(f'Test score: {gsCVnb_model.score(X_test, y_test)}')    # Scoring test data on our best model 

os.system("printf '\a'");

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 14.5min finished


Best Score for cv: 0.9256774511414025
Best Parameters: {'cvect__max_features': 200000, 'cvect__ngram_range': (1, 2), 'cvect__stop_words': 'english'}
Train score: 0.9415257020857284
Test score: 0.9259496477311918


In [58]:
# Get report on grid search results
best_model_params = pd.DataFrame(gsCVnb.cv_results_)

In [60]:
best_model_params.to_csv('best_model.csv', index=False)

In [75]:
best_model_params.drop(columns=['mean_fit_time','std_fit_time','mean_score_time','std_score_time',
                               'params', 'split0_test_score','split1_test_score', 'split2_test_score',
                               'std_test_score']).sort_values('rank_test_score')

Unnamed: 0,param_cvect__max_features,param_cvect__ngram_range,param_cvect__stop_words,mean_test_score,rank_test_score
7,200000,"(1, 2)",english,0.925677,1
3,150000,"(1, 2)",english,0.925579,2
1,150000,"(1, 3)",english,0.925497,3
5,200000,"(1, 3)",english,0.925439,4
6,200000,"(1, 2)",,0.910076,5
2,150000,"(1, 2)",,0.909739,6
4,200000,"(1, 3)",,0.906339,7
0,150000,"(1, 3)",,0.905494,8


### Using a pipeline with Multinomial NB and TfidfVectorizer

In [11]:
pipeTIFInb = Pipeline([       # Using pipeline with Tifidifi and MultinomialNB
    ('tfi', TfidfVectorizer(lowercase=False)),
    ('Mnb', MultinomialNB())
])    

pipeTIFInb_params = {       # Setting the parameters
               'tfi__max_features': [75_000],
               'tfi__stop_words': ['english'],
               'tfi__ngram_range': [(1,3),(1,2)]
              }

gsTIFInb = GridSearchCV(pipeTIFInb, # What is the model we want to fit?
                  pipeTIFInb_params, # What is the dictionary of hyperparameters?
                  cv=3, n_jobs=-1, verbose=2)

gsTIFInb.fit(X_train, y_train)   # Fitting the train data

print(f'Best Score for cv: {gsTIFInb.best_score_}')   # Printing best score for CV

print(f'Best Parameters: {gsTIFInb.best_params_}') # Checking best parameters for CV

gsTIFInb_model = gsTIFInb.best_estimator_     # Setting our best model

print(f'Train score: {gsTIFInb_model.score(X_train, y_train)}')    # Scoring train data on our best model 

print(f'Test score: {gsTIFInb_model.score(X_test, y_test)}')    # Scoring test data on our best model 

os.system("printf '\a'");


Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   43.6s remaining:   43.6s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  1.3min finished


Best Score for cv: 0.9183199211693217
Best Parameters: {'tfi__max_features': 75000, 'tfi__ngram_range': (1, 2), 'tfi__stop_words': 'english'}
Train score: 0.9311299063885695
Test score: 0.9179435384539587


0

In [20]:
# Get report on grid search results
pd.DataFrame(gsTIFInb.cv_results_)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tfi__max_features,param_tfi__ngram_range,param_tfi__stop_words,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,67.051101,0.046291,5.947995,0.059553,75000,"(1, 3)",english,"{'tfi__max_features': 75000, 'tfi__ngram_range...",0.916465,0.917081,0.918752,0.917433,0.000966,2
1,33.522923,0.21978,6.591595,0.10166,75000,"(1, 2)",english,"{'tfi__max_features': 75000, 'tfi__ngram_range...",0.917402,0.917993,0.919565,0.91832,0.000913,1


### Using a pipeline with Random Forest and Count Vectorizer

In [216]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [21]:
pipeCVrf = Pipeline([           # Using pipeline with CountVectorizer and MultinomialNB
    ('cvect', CountVectorizer()),
    ('rf', RandomForestClassifier())
])

pipeCVrf_params = {         # Setting the parameters 
               'cvect__max_features': [50_000], 
               'cvect__stop_words': ['english'],
               'cvect__ngram_range': [(1,1)], 
              }

# Using the GridSearchCV to find best parameters
gsCVrf = GridSearchCV(pipeCVrf, # What is the model we want to fit?
                  pipeCVrf_params, # What is the dictionary of hyperparameters?
                  cv=3, n_jobs=-1)

gsCVrf.fit(X_train, y_train)   # Fitting the train data

print(f'Best Score for cv: {gsCVrf.best_score_}')   # Printing best score for CV

print(f'Best Parameters: {gsCVrf.best_params_}') # Checking best parameters for CV

gsCVrf_model = gsCVrf.best_estimator_     # Setting our best model

print(f'Train score: {gsCVrf_model.score(X_train, y_train)}')    # Scoring train data on our best model 

print(f'Test score: {gsCVrf_model.score(X_test, y_test)}')    # Scoring test data on our best model 



Best Score for cv: 0.9009361143044835
Best Parameters: {'cvect__max_features': 50000, 'cvect__ngram_range': (1, 1), 'cvect__stop_words': 'english'}
Train score: 0.9944818525209393
Test score: 0.9043701039562497


In [22]:
os.system('say -v Samantha Your code is done running');

### Using a pipeline with Random Forest and Tfid Vectorizer

In [78]:
pipeTIFIrf = Pipeline([           # Using pipeline with CountVectorizer and MultinomialNB
    ('tfi', TfidfVectorizer(lowercase=False)),
    ('rf', RandomForestClassifier())
])

pipeTIFIrf_params = {         # Setting the parameters 
               'tfi__max_features': [50_000], 
               'tfi__stop_words': ['english'],
               'tfi__ngram_range': [(1,1)], 
              }

# Using the GridSearchCV to find best parameters
gsTIFIrf = GridSearchCV(pipeTIFIrf, # What is the model we want to fit?
                  pipeTIFIrf_params, # What is the dictionary of hyperparameters?
                  cv=3, n_jobs=-1)

gsTIFIrf.fit(X_train, y_train)   # Fitting the train data

print(f'Best Score for cv: {gsTIFInb.best_score_}')   # Printing best score for CV

print(f'Best Parameters: {gsTIFInb.best_params_}') # Checking best parameters for CV

gsTIFInb_model = gsTIFInb.best_estimator_     # Setting our best model

print(f'Train score: {gsTIFInb_model.score(X_train, y_train)}')    # Scoring train data on our best model 

print(f'Test score: {gsTIFInb_model.score(X_test, y_test)}')    # Scoring test data on our best model 



Best Score for cv: 0.9170471341763836
Best Parameters: {'tfi__max_features': 50000, 'tfi__ngram_range': (1, 2), 'tfi__stop_words': 'english'}
Train score: 0.9280916406634916
Test score: 0.9170567078878652


### Using a pipeline with Extra Trees and Count Vectorizer

In [80]:
pipeCVet = Pipeline([           # Using pipeline with CountVectorizer and MultinomialNB
    ('cvect', CountVectorizer()),
    ('et', ExtraTreesClassifier())
])

pipeCVet_params = {         # Setting the parameters 
               'cvect__max_features': [50_000], 
               'cvect__stop_words': ['english'],
               'cvect__ngram_range': [(1,1)], 
              }

# Using the GridSearchCV to find best parameters
gsCVet = GridSearchCV(pipeCVet, # What is the model we want to fit?
                  pipeCVet_params, # What is the dictionary of hyperparameters?
                  cv=3, n_jobs=-1)

gsCVet.fit(X_train, y_train)   # Fitting the train data

print(f'Best Score for cv: {gsCVet.best_score_}')   # Printing best score for CV

print(f'Best Parameters: {gsCVet.best_params_}') # Checking best parameters for CV

gsCVet_model = gsCVet.best_estimator_     # Setting our best model

print(f'Train score: {gsCVet_model.score(X_train, y_train)}')    # Scoring train data on our best model 

print(f'Test score: {gsCVet_model.score(X_test, y_test)}')    # Scoring test data on our best model 



Best Score for cv: 0.8975201182460174
Best Parameters: {'cvect__max_features': 50000, 'cvect__ngram_range': (1, 1), 'cvect__stop_words': 'english'}
Train score: 0.998127771391033
Test score: 0.8979405823520717


### Using a pipeline with Extra Trees and Tfid Vectorizer

In [84]:
pipeTIFIet = Pipeline([           # Using pipeline with CountVectorizer and MultinomialNB
    ('tfi', TfidfVectorizer(lowercase=False)),
    ('et', ExtraTreesClassifier())
])

pipeTIFIet_params = {         # Setting the parameters 
               'tfi__max_features': [50_000], 
               'tfi__stop_words': ['english'],
               'tfi__ngram_range': [(1,1)], 
              }

# Using the GridSearchCV to find best parameters
gsTIFIet = GridSearchCV(pipeTIFIet, # What is the model we want to fit?
                  pipeTIFIet_params, # What is the dictionary of hyperparameters?
                  cv=3, n_jobs=-1)

gsTIFIet.fit(X_train, y_train)   # Fitting the train data

print(f'Best Score for cv: {gsTIFIet.best_score_}')   # Printing best score for CV

print(f'Best Parameters: {gsTIFIet.best_params_}') # Checking best parameters for CV

gsTIFIet_model = gsTIFIet.best_estimator_     # Setting our best model

print(f'Train score: {gsTIFIet_model.score(X_train, y_train)}')    # Scoring train data on our best model 

print(f'Test score: {gsTIFIet_model.score(X_test, y_test)}')    # Scoring test data on our best model 



Best Score for cv: 0.9027754967975037
Best Parameters: {'tfi__max_features': 50000, 'tfi__ngram_range': (1, 1), 'tfi__stop_words': 'english'}
Train score: 0.9984233864345541
Test score: 0.9058481548997389


### Using a pipeline with Logistic Regression and Tfid Vectorizer

In [110]:
pipeTIFIlr = Pipeline([           # Using pipeline with TfidVectorizer and Logistic Regression
    ('tfi', TfidfVectorizer(lowercase=False)),
    ('lr', LogisticRegression())
])

pipeTIFIlr_params = {         # Setting the parameters 
               'tfi__max_features': [50_000, 75_000, 100_000], 
               'tfi__stop_words': [None, 'english'],
               'tfi__ngram_range': [(1,1),(1,2)]
              }

# Using the GridSearchCV to find best parameters
gsTIFIlr = GridSearchCV(pipeTIFIlr, # What is the model we want to fit?
                  pipeTIFIlr_params, # What is the dictionary of hyperparameters?
                  cv=3, n_jobs=-1)

gsTIFIlr.fit(X_train, y_train)   # Fitting the train data

print(f'Best Score for cv: {gsTIFIlr.best_score_}')   # Printing best score for CV

print(f'Best Parameters: {gsTIFIlr.best_params_}') # Checking best parameters for CV

gsTIFIlr_model = gsTIFIlr.best_estimator_     # Setting our best model

print(f'Train score: {gsTIFIlr_model.score(X_train, y_train)}')    # Scoring train data on our best model 

print(f'Test score: {gsTIFIlr_model.score(X_test, y_test)}')    # Scoring test data on our best model 



Best Score for cv: 0.9366069962226967
Best Parameters: {'tfi__max_features': 75000, 'tfi__ngram_range': (1, 2), 'tfi__stop_words': 'english'}
Train score: 0.9564214156675973
Test score: 0.9388333251219392


In [111]:
# Get report on grid search results
best_model_params = pd.DataFrame(gsTIFIlr.cv_results_)

In [123]:
best_model_params

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tfi__max_features,param_tfi__ngram_range,param_tfi__stop_words,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,20.139083,0.176619,8.409898,0.060415,50000,"(1, 1)",,"{'tfi__max_features': 50000, 'tfi__ngram_range...",0.935508,0.934646,0.934864,0.935006,0.000366,7
1,18.161496,0.197442,7.322226,0.156035,50000,"(1, 1)",english,"{'tfi__max_features': 50000, 'tfi__ngram_range...",0.936542,0.936567,0.935603,0.936237,0.000449,4
2,63.464119,0.886825,15.866807,0.6962,50000,"(1, 2)",,"{'tfi__max_features': 50000, 'tfi__ngram_range...",0.933537,0.933389,0.933337,0.933421,8.5e-05,12
3,52.885771,0.686028,11.923641,0.403163,50000,"(1, 2)",english,"{'tfi__max_features': 50000, 'tfi__ngram_range...",0.93701,0.936912,0.935332,0.936418,0.000769,2
4,21.804713,2.162045,8.673323,0.246077,75000,"(1, 1)",,"{'tfi__max_features': 75000, 'tfi__ngram_range...",0.935385,0.934596,0.934864,0.934948,0.000327,8
5,15.715034,0.189433,6.660531,0.952548,75000,"(1, 1)",english,"{'tfi__max_features': 75000, 'tfi__ngram_range...",0.936419,0.936099,0.935431,0.935983,0.000412,5
6,60.404871,0.494016,15.625234,0.290877,75000,"(1, 2)",,"{'tfi__max_features': 75000, 'tfi__ngram_range...",0.93334,0.933562,0.933386,0.933429,9.6e-05,11
7,50.222171,0.657336,11.366511,0.574766,75000,"(1, 2)",english,"{'tfi__max_features': 75000, 'tfi__ngram_range...",0.936739,0.937454,0.935628,0.936607,0.000751,1
8,19.597433,0.947566,7.556967,0.044107,100000,"(1, 1)",,"{'tfi__max_features': 100000, 'tfi__ngram_rang...",0.935212,0.934522,0.934741,0.934825,0.000288,9
9,16.093993,1.641855,6.742985,0.191995,100000,"(1, 1)",english,"{'tfi__max_features': 100000, 'tfi__ngram_rang...",0.936468,0.936173,0.93516,0.935934,0.00056,6


In [112]:
best_model_params.to_csv('best_model.csv', index=False)

#### Trying out my best model with 115 rows of new data

In [115]:
math_for_predictions = pd.read_csv('math_for_predictions.csv')

In [116]:
A = math_for_predictions['post']

In [122]:
A.shape

(115,)

In [118]:
gsTIFIlr_model.predict(A)

array([0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [120]:
A[0]

'Who’s in Full Burn Out Mode? The burn out is real.'

### Using a pipeline with Logistic Regression and Count Vectorizer


In [103]:
pipeCVlr = Pipeline([           # Using pipeline with CountVectorizer and Logistic Regression
    ('cvect', CountVectorizer()),
    ('lr', LogisticRegression())
])

pipeCVlr_params = {         # Setting the parameters 
               'cvect__max_features': [100_000, 200_000], 
               'cvect__stop_words': ['english'],
               'cvect__ngram_range': [(1,2),(1,3)], 
              }

# Using the GridSearchCV to find best parameters
gsCVlr = GridSearchCV(pipeCVlr, # What is the model we want to fit?
                  pipeCVlr_params, # What is the dictionary of hyperparameters?
                  cv=3, verbose=1)

gsCVlr.fit(X_train, y_train)   # Fitting the train data

print(f'Best Score for cv: {gsCVlr.best_score_}')   # Printing best score for CV

print(f'Best Parameters: {gsCVlr.best_params_}') # Checking best parameters for CV

gsCVlr_model = gsCVlr.best_estimator_     # Setting our best model

print(f'Train score: {gsCVlr_model.score(X_train, y_train)}')    # Scoring train data on our best model 

print(f'Test score: {gsCVlr_model.score(X_test, y_test)}')    # Scoring test data on our best model 



Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  9.4min finished


Best Score for cv: 0.9355887666283462
Best Parameters: {'cvect__max_features': 200000, 'cvect__ngram_range': (1, 2), 'cvect__stop_words': 'english'}
Train score: 0.984455575628182
Test score: 0.9389564960338966


### Using a pipeline with Vote Classifier and Count Vectorizer


In [128]:
pipeTIFIet = Pipeline([           # Using pipeline with Vote Classifier
    ('tfi', TfidfVectorizer(lowercase=False)),
    ('vote', VotingClassifier([
    ('lr', LogisticRegression()),
    ('MNB', MultinomialNB()),
    ('et', ExtraTreesClassifier()),
    ('rf', RandomForestClassifier())]))
])

pipeTIFIet_params = {         # Setting the parameters 
               'tfi__max_features': [75_000, 100_000], 
               'tfi__stop_words': ['english'],
               'tfi__ngram_range': [(1,1),(1,2)], 
              }

# Using the GridSearchCV to find best parameters
gsTIFIet = GridSearchCV(pipeTIFIet, # What is the model we want to fit?
                  pipeTIFIet_params, # What is the dictionary of hyperparameters?
                  cv=3, verbose=1)

gsTIFIet.fit(X_train, y_train)   # Fitting the train data

print(f'Best Score for cv: {gsTIFIet.best_score_}')   # Printing best score for CV

print(f'Best Parameters: {gsTIFIet.best_params_}') # Checking best parameters for CV

gsTIFIet_model = gsTIFIet.best_estimator_     # Setting our best model

print(f'Train score: {gsTIFIet_model.score(X_train, y_train)}')    # Scoring train data on our best model 

print(f'Test score: {gsTIFIet_model.score(X_test, y_test)}')    # Scoring test data on our best model 


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 19.3min finished


Best Score for cv: 0.9302841189029397
Best Parameters: {'tfi__max_features': 100000, 'tfi__ngram_range': (1, 2), 'tfi__stop_words': 'english'}
Train score: 0.9854327475775989
Test score: 0.930211361284919
