In [39]:
import pandas as pd
import numpy as np
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.metrics import confusion_matrix,precision_score,recall_score
import pickle

In [2]:
%store -r X_train_1
%store -r X_test_1
%store -r y_train_1
%store -r y_test_1

In [3]:
%store -r X_train_2
%store -r X_test_2
%store -r y_train_2
%store -r y_test_2

no stored variable X_train_2
no stored variable X_test_2
no stored variable y_train_2
no stored variable y_test_2


### 1st model, logistic regression

In [18]:
pipe_1 = Pipeline([('tvec', TfidfVectorizer()),
                  ('logreg', LogisticRegression())])

In [19]:
# pipe_1.get_params()

In [20]:
pipe_1_params = {
    'tvec__min_df':[2,3],
    'tvec__max_features':[6000,7000,8000],
    'tvec__stop_words':['english',None],
    'logreg__C':[1.3,1.35]
}

In [21]:
pipe_1_gs = GridSearchCV(pipe_1,pipe_1_params,cv=5,verbose=1)

In [22]:
pipe_1_gs.fit(X_train_1,y_train_1)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('logreg', LogisticRegression())]),
             param_grid={'logreg__C': [1.3, 1.35],
                         'tvec__max_features': [6000, 7000, 8000],
                         'tvec__min_df': [2, 3],
                         'tvec__stop_words': ['english', None]},
             verbose=1)

In [31]:
# pickle.dump(pipe_1_gs, open('project3.pkl', 'wb'))

In [23]:
pipe_1_gs.best_estimator_

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_features=6000, min_df=2,
                                 stop_words='english')),
                ('logreg', LogisticRegression(C=1.3))])

In [494]:
pipe_1_gs.best_score_

0.8765333333333334

In [735]:
pipe_1_gs.score(X_train_1,y_train_1),pipe_1_gs.score(X_test_1,y_test_1)

(0.9326666666666666, 0.8824)

### 2nd Model: Random Forest

In [648]:
pipe_2 = Pipeline([('tvec',TfidfVectorizer()),
                   ('rf',RandomForestClassifier(n_jobs=-1))])

In [649]:
# pipe_2.get_params()

In [651]:
pipe_2_params = {
#     'tvec__min_df':[1,2,3],
    'tvec__max_features':[3000,4000],
    'tvec__stop_words':['english'],
#     'tvec__ngram_range':[(1,1),(2,2)],
    'rf__n_estimators':[800,900]
#     'rf__max_depth':[None,3,5],
#     'rf__min_samples_leaf':[1,2]
}

In [652]:
pipe_2_gs = GridSearchCV(pipe_2,pipe_2_params,cv=5,verbose=1)

In [653]:
pipe_2_gs.fit(X_train_1,y_train_1)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('rf',
                                        RandomForestClassifier(n_jobs=-1))]),
             param_grid={'rf__n_estimators': [800, 900],
                         'tvec__max_features': [3000, 4000],
                         'tvec__stop_words': ['english']},
             verbose=1)

In [654]:
pipe_2_gs.score(X_train_1,y_train_1)

0.9837333333333333

In [655]:
pipe_2_gs.score(X_test_1,y_test_1)

0.8652

In [883]:
# pipe_2_gs.best_estimator_

### 2.1 AdaBoost

In [37]:
pipe_2_1 = Pipeline([('tvec',TfidfVectorizer()),
                   ('ad',AdaBoostClassifier())])

NameError: name 'AdaBoostClassifier' is not defined

In [563]:
pipe_2_1_params = {
    'tvec__max_features':[3000,5000,7000],
    'tvec__stop_words':['english'],
    'ad__base_estimator':[DecisionTreeClassifier(max_depth=2)],
    'ad__learning_rate':[0.05,0.1,0.2],
    'ad__n_estimators':[500]
                   }

In [564]:
pipe_2_1_gs = GridSearchCV(pipe_2_1,pipe_2_1_params,cv=5,verbose=1,n_jobs=-1)

In [565]:
pipe_2_1_gs.fit(X_train_1,y_train_1)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('ad', AdaBoostClassifier())]),
             n_jobs=-1,
             param_grid={'ad__base_estimator': [DecisionTreeClassifier(max_depth=2)],
                         'ad__learning_rate': [0.1], 'ad__n_estimators': [500],
                         'tvec__max_features': [5000], 'tvec__min_df': [1, 2],
                         'tvec__stop_words': ['english']},
             verbose=1)

In [629]:
# score for training data
pipe_2_1_gs.score(X_train_1,y_train_1)

0.9142666666666667

In [676]:
# score for testing data
pipe_2_1_gs.score(X_test_1,y_test_1)

0.8432

In [569]:
pipe_2_1_gs.best_estimator_

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_features=5000, stop_words='english')),
                ('ad',
                 AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                                    learning_rate=0.1, n_estimators=500))])

### Model 3.2. Predict Removed Posts

In [624]:
pipe_3_2 = Pipeline([('tvec', TfidfVectorizer()),
                  ('logreg', LogisticRegression(n_jobs=-1))])

pipe_3_2_params = {
#     'tvec__min_df':[1,2],
    'tvec__max_features':[3000,5000,6000,7000],
    'tvec__stop_words':['english'],
    'logreg__C':[1,2,3]
}

pipe_3_2_gs = GridSearchCV(pipe_3_2,pipe_3_2_params,cv=5,verbose=1)

In [625]:
pipe_3_2_gs.fit(X_train_2,y_train_2)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('logreg',
                                        LogisticRegression(n_jobs=-1))]),
             param_grid={'logreg__C': [1, 2, 3],
                         'tvec__max_features': [3000, 5000, 6000, 7000],
                         'tvec__stop_words': ['english']},
             verbose=1)

In [626]:
pipe_3_2_gs.best_estimator_

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_features=5000, stop_words='english')),
                ('logreg', LogisticRegression(C=2, n_jobs=-1))])

In [627]:
pipe_3_2_gs.score(X_train_2,y_train_2)

0.8509333333333333

In [628]:
pipe_3_2_gs.score(X_test_2,y_test_2)

0.7568

### 3rd Model: SVM

In [669]:
pipe_3 = Pipeline([('tvec',TfidfVectorizer()),
                   ('svm',SVC())])

In [670]:
pipe_3_params = {
    'tvec__min_df':[1,2,3],
    'tvec__max_features':[None,4000,5000],
    'tvec__ngram_range':[(1,1),(2,2)],
    'svm__kernel':['rbf','poly'],
    'svm__C':[0.1,1,10]
}

In [671]:
pipe_3_gs = GridSearchCV(pipe_3,pipe_3_params,cv=5,verbose=2)

In [673]:
# pipe_3_gs.fit(X_train_1,y_train_1)

In [529]:
pipe_3_gs.best_estimator_

Pipeline(steps=[('tvec', TfidfVectorizer()), ('svm', SVC(C=1))])

In [532]:
pipe_3_gs.score(X_train_1,y_train_1)

0.98

In [533]:
pipe_3_gs.score(X_test_1,y_test_1)

0.8844

### Model 4 Naive Bayes

In [29]:
pipe_4 = Pipeline([('tvec', TfidfVectorizer()),
                  ('naive', MultinomialNB())])

In [30]:
pipe_4_params = {
    'tvec__min_df':[1,2,3],
    'tvec__max_features':[None,4000,5000],
    'tvec__ngram_range':[(1,1),(2,2)]
}

In [33]:
pipe_4_gs = GridSearchCV(pipe_4,pipe_4_params,cv=5,verbose=1)

In [34]:
pipe_4_gs.fit(X_train_1, y_train_1)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('naive', MultinomialNB())]),
             param_grid={'tvec__max_features': [None, 4000, 5000],
                         'tvec__min_df': [1, 2, 3],
                         'tvec__ngram_range': [(1, 1), (2, 2)]},
             verbose=1)

In [36]:
pipe_4_gs.score(X_train_1,y_train_1),pipe_4_gs.score(X_test_1,y_test_1)

(0.9242666666666667, 0.87)

## Evaluating models

In [837]:
# calculate the precision and recall for 3 models
def precision_recall(X_test, y_true):
    preds_1 = pipe_1_gs.predict(X_test)
    preds_2 = pipe_2_gs.predict(X_test)
    preds_2_1 = pipe_2_1_gs.predict(X_test)
#     preds_3 = pipe_3_gs.predict(X_test_1)
    
    model_1 = ['Logistic Regression', precision_score(y_true,preds_1,pos_label='math'), recall_score(y_true,preds_1,pos_label='math')]
    model_2 = ['Random Forest', precision_score(y_true,preds_2,pos_label='math'), recall_score(y_true,preds_2,pos_label='math')]
    model_2_1 = ['Adaboost', precision_score(y_true,preds_2_1,pos_label='math'), recall_score(y_true,preds_2_1,pos_label='math')]

    result = pd.DataFrame([model_1,model_2,model_2_1],columns=['Model','Precision','Recall']).set_index('Model')
    return result

In [838]:
precision_recall(X_test_1,y_test_1)

Unnamed: 0_level_0,Precision,Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Logistic Regression,0.866564,0.904
Random Forest,0.832969,0.9136
Adaboost,0.793033,0.9288


In [879]:
# random pick some 100 data from each subreddit at a given time and return the precision and recall of each model
# in the example bettwen, choosed 2 time periods

def evaluate_models():
    random.seed(42)
    # 11/24/2020 - 3/29/2021
    random_time_1 = random.choice(range(1606228102,1617024502))
    
    math_1 = get_submissions('math',1,random_time_1)
    physics_1 = get_submissions('Physics',1,random_time_1)
    
    math_1 = prepare(math_1)
    physics_1 = prepare(physics_1)
    
    data_1 =  pd.concat([math_1[['title_selftext_new','subreddit']],physics_1[['title_selftext_new','subreddit']]],axis=0,ignore_index=True)
    X_1 = data_1['title_selftext_new']
    y_1 = data_1['subreddit']
    
    # 3/1/2015 - 9/1/2015
    random_time_2 = random.choice(range(1425220102,1441114102))
    
    math_2 = get_submissions('math',1,random_time_2)
    physics_2 = get_submissions('Physics',1,random_time_2)
    
    math_2 = prepare(math_2)
    physics_2 = prepare(physics_2)
    
    data_2 =  pd.concat([math_2[['title_selftext_new','subreddit']],physics_2[['title_selftext_new','subreddit']]],axis=0,ignore_index=True)
    X_2 = data_2['title_selftext_new']
    y_2 = data_2['subreddit']

    df_1 = precision_recall(X_1,y_1)
    df_2 = precision_recall(X_2,y_2)
    
    return df_1,df_2


In [880]:
period_1 ,period_2 = evaluate_models()

In [881]:
period_1

Unnamed: 0_level_0,Precision,Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Logistic Regression,0.888889,0.96
Random Forest,0.934579,1.0
Adaboost,0.843478,0.97


In [882]:
period_2

Unnamed: 0_level_0,Precision,Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Logistic Regression,0.923077,0.96
Random Forest,0.834783,0.96
Adaboost,0.770492,0.94
