In [2]:
import pandas as pd
import numpy as np
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.metrics import confusion_matrix,precision_score,recall_score
import pickle

In [3]:
%store -r X_train_1
%store -r X_test_1
%store -r y_train_1
%store -r y_test_1

In [4]:
%store -r X_train_2
%store -r X_test_2
%store -r y_train_2
%store -r y_test_2

no stored variable X_train_2
no stored variable X_test_2
no stored variable y_train_2
no stored variable y_test_2


### 1st model, logistic regression

In [48]:
pipe_1 = Pipeline([('tvec', TfidfVectorizer()),
                  ('logreg', LogisticRegression())])

In [49]:
# pipe_1.get_params()

In [50]:
pipe_1_params = {
    'tvec__min_df':[2,3],
    'tvec__max_features':[6000,7000,8000],
    'tvec__stop_words':['english',None],
    'logreg__C':[1.3,1.35]
}

In [51]:
pipe_1_gs = GridSearchCV(pipe_1,pipe_1_params,cv=5,verbose=1)
pipe_1_gs.fit(X_train_1,y_train_1)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('logreg', LogisticRegression())]),
             param_grid={'logreg__C': [1.3, 1.35],
                         'tvec__max_features': [6000, 7000, 8000],
                         'tvec__min_df': [2, 3],
                         'tvec__stop_words': ['english', None]},
             verbose=1)

In [31]:
# pickle.dump(pipe_1_gs, open('project3.pkl', 'wb'))

In [64]:
# pipe_1_gs.best_estimator_

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_features=6000, min_df=2,
                                 stop_words='english')),
                ('logreg', LogisticRegression(C=1.3))])

In [52]:
# score for training and testing data
pipe_1_gs.score(X_train_1,y_train_1),pipe_1_gs.score(X_test_1,y_test_1)

(0.9326666666666666, 0.8824)

### Pre 2nd Model, Ordinary Decision Tree

In [5]:
pipe_pre2 = Pipeline([('tvec',TfidfVectorizer()),
                      ('dc',DecisionTreeClassifier())])

In [9]:
# pipe_pre2.get_params()

In [8]:
pipe_pre2_params = {
    'tvec__min_df':[1,2,3],
    'tvec__max_features':[3000,4000],
    'tvec__stop_words':['english'],
    'dc__min_impurity_decrease':[0.0,0.3,0.5],
    'dc__min_samples_split':[2,3],
    'dc__min_samples_leaf':[1,2,5]
}

In [10]:
pipe_pre2_gs = GridSearchCV(pipe_pre2,pipe_pre2_params,cv=5,verbose=1)
pipe_pre2_gs.fit(X_train_1,y_train_1)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('dc', DecisionTreeClassifier())]),
             param_grid={'dc__min_impurity_decrease': [0.0, 0.3, 0.5],
                         'dc__min_samples_leaf': [1, 2, 5],
                         'dc__min_samples_split': [2, 3],
                         'tvec__max_features': [3000, 4000],
                         'tvec__min_df': [1, 2, 3],
                         'tvec__stop_words': ['english']},
             verbose=1)

In [13]:
pipe_pre2_gs.best_estimator_

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_features=4000, min_df=2,
                                 stop_words='english')),
                ('dc', DecisionTreeClassifier())])

In [11]:
# score for training and testing data
pipe_pre2_gs.score(X_train_1,y_train_1),pipe_pre2_gs.score(X_test_1,y_test_1)

(0.9837333333333333, 0.8096)

### Overfitting
Definitely need try some ensemble methods.
Since overfitting is the main problem now, first ensemble method I'm going to try is a **Bagged Decision Tree**. 
More specifically, **Random Forest** is the first one comes in my mind, because when using Random Forest, at each splitting within each boostrapping sample, a **random subset of the features** are select. If one or few features (words in this case) are very strong preictors for the response variable, these words will be used in many/all of the bagged decision tree, causing them to become correlated and result in high variance and overfitting, by selecting a random subset of features at each split, it will counter this correlation between base trees.

### 2nd Model: Random Forest

In [16]:
pipe_2 = Pipeline([('tvec',TfidfVectorizer()),
                   ('rf',RandomForestClassifier(n_jobs=-1))])

In [18]:
# pipe_2.get_params()

In [30]:
pipe_2_params = {
#     'tvec__min_df':[1,2,3],
#     'tvec__max_features':[2000,3000,4000],
    'tvec__max_features':[4000],
    'tvec__stop_words':['english'],
#     'rf__n_estimators':[200,500]
    'rf__n_estimators':[700,1000]
#     'rf__max_depth':[None,3,5],
#     'rf__min_samples_leaf':[1,2,3]
}

In [31]:
pipe_2_gs = GridSearchCV(pipe_2,pipe_2_params,cv=5,verbose=1)
pipe_2_gs.fit(X_train_1,y_train_1)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('rf',
                                        RandomForestClassifier(n_jobs=-1))]),
             param_grid={'rf__n_estimators': [700, 1000],
                         'tvec__max_features': [4000],
                         'tvec__stop_words': ['english']},
             verbose=1)

In [33]:
pipe_2_gs.best_estimator_

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_features=4000, stop_words='english')),
                ('rf', RandomForestClassifier(n_estimators=700, n_jobs=-1))])

In [32]:
pipe_2_gs.score(X_train_1,y_train_1),pipe_2_gs.score(X_test_1,y_test_1)

(0.9837333333333333, 0.862)

The overfitting problem is reduced

### 2.1 AdaBoost
Baaged Decision Tree(including Random Forest), words are treated independly, however, words ( written by human) couldn't be independly, so boosting decision tree might has better performance, since boosting decision tree is fitted sequentially instead of in parallel.
AdaBoost is the Boosting Decision Tree I'll try, however, the disadvantages of AdaBoost is it fits slow and is more prone to overfitting

In [35]:
pipe_2_1 = Pipeline([('tvec',TfidfVectorizer()),
                   ('ad',AdaBoostClassifier())])

In [37]:
pipe_2_1_params = {
#     'tvec__max_features':[3000,5000,7000],
    'tvec__max_features':[5000],
    'tvec__stop_words':['english'],
    'ad__base_estimator':[DecisionTreeClassifier(max_depth=2)],
#     'ad__learning_rate':[0.05,0.1,0.2],
    'ad__learning_rate':[0.1],
    'ad__n_estimators':[500,700]
                   }

In [39]:
pipe_2_1_gs = GridSearchCV(pipe_2_1,pipe_2_1_params,cv=5,verbose=1,n_jobs=-1)
pipe_2_1_gs.fit(X_train_1,y_train_1)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('ad', AdaBoostClassifier())]),
             n_jobs=-1,
             param_grid={'ad__base_estimator': [DecisionTreeClassifier(max_depth=2)],
                         'ad__learning_rate': [0.1],
                         'ad__n_estimators': [500, 700],
                         'tvec__max_features': [5000],
                         'tvec__stop_words': ['english']},
             verbose=1)

In [569]:
pipe_2_1_gs.best_estimator_

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_features=5000, stop_words='english')),
                ('ad',
                 AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                                    learning_rate=0.1, n_estimators=500))])

In [40]:
# score for training and testing data
pipe_2_1_gs.score(X_train_1,y_train_1),pipe_2_1_gs.score(X_test_1,y_test_1)

(0.9350666666666667, 0.85)

### Model 3.2. Predict Removed Posts
Trying to figure out why some post are deleted by the website. Is there any patterns?
Still working on

In [624]:
pipe_3_2 = Pipeline([('tvec', TfidfVectorizer()),
                  ('logreg', LogisticRegression(n_jobs=-1))])

pipe_3_2_params = {
#     'tvec__min_df':[1,2],
    'tvec__max_features':[3000,5000,6000,7000],
    'tvec__stop_words':['english'],
    'logreg__C':[1,2,3]
}

pipe_3_2_gs = GridSearchCV(pipe_3_2,pipe_3_2_params,cv=5,verbose=1)

In [625]:
pipe_3_2_gs.fit(X_train_2,y_train_2)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('logreg',
                                        LogisticRegression(n_jobs=-1))]),
             param_grid={'logreg__C': [1, 2, 3],
                         'tvec__max_features': [3000, 5000, 6000, 7000],
                         'tvec__stop_words': ['english']},
             verbose=1)

In [626]:
pipe_3_2_gs.best_estimator_

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_features=5000, stop_words='english')),
                ('logreg', LogisticRegression(C=2, n_jobs=-1))])

In [627]:
pipe_3_2_gs.score(X_train_2,y_train_2)

0.8509333333333333

In [628]:
pipe_3_2_gs.score(X_test_2,y_test_2)

0.7568

### 3rd Model: SVM

In [41]:
pipe_3 = Pipeline([('tvec',TfidfVectorizer()),
                   ('svm',SVC())])

In [42]:
pipe_3_params = {
    'tvec__min_df':[1,2,3],
    'tvec__max_features':[None,4000,5000],
#     'tvec__ngram_range':[(1,1),(2,2)],
    'svm__kernel':['rbf','poly'],
#     'svm__C':[0.1,1,10]
    'svm__C':[1]
}

In [45]:
# pipe_3_gs = GridSearchCV(pipe_3,pipe_3_params,cv=5,verbose=2)
# pipe_3_gs.fit(X_train_1,y_train_1)

In [46]:
pipe_3_gs.best_estimator_

Pipeline(steps=[('tvec', TfidfVectorizer()), ('svm', SVC(C=1))])

In [53]:
# score for training and testing data
pipe_3_gs.score(X_train_1,y_train_1),pipe_3_gs.score(X_test_1,y_test_1)

(0.98, 0.8844)

### Model 4 Naive Bayes

In [54]:
pipe_4 = Pipeline([('tvec', TfidfVectorizer()),
                  ('naive', MultinomialNB())])

In [55]:
pipe_4_params = {
    'tvec__min_df':[1,2,3],
    'tvec__max_features':[None,4000,5000],
    'tvec__ngram_range':[(1,1),(2,2)]
}

In [57]:
pipe_4_gs = GridSearchCV(pipe_4,pipe_4_params,cv=5,verbose=1)
pipe_4_gs.fit(X_train_1, y_train_1)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('naive', MultinomialNB())]),
             param_grid={'tvec__max_features': [None, 4000, 5000],
                         'tvec__min_df': [1, 2, 3],
                         'tvec__ngram_range': [(1, 1), (2, 2)]},
             verbose=1)

In [58]:
pipe_4_gs.best_estimator_

Pipeline(steps=[('tvec', TfidfVectorizer()), ('naive', MultinomialNB())])

In [59]:
pipe_4_gs.score(X_train_1,y_train_1),pipe_4_gs.score(X_test_1,y_test_1)

(0.9242666666666667, 0.87)

## Evaluating models

In [62]:
# calculate the precision and recall for 3 models
def precision_recall(X_test, y_true):
    preds_1 = pipe_1_gs.predict(X_test) # Logistic Regression
    preds_pre2 = pipe_pre2_gs.predict(X_test) # ordinary Decision Tree
    preds_2 = pipe_2_gs.predict(X_test) # Random Forest
    preds_2_1 = pipe_2_1_gs.predict(X_test) # Adaboost
    preds_3 = pipe_3_gs.predict(X_test) # SVM
    preds_4 = pipe_4_gs.predict(X_test) # Navie Bayes
    
    model_1 = ['Logistic Regression', precision_score(y_true,preds_1,pos_label='math'), recall_score(y_true,preds_1,pos_label='math')]
    model_pre2 = ['Decision Tree', precision_score(y_true,preds_pre2,pos_label='math'), recall_score(y_true,preds_pre2,pos_label='math')]
    model_2 = ['Random Forest', precision_score(y_true,preds_2,pos_label='math'), recall_score(y_true,preds_2,pos_label='math')]
    model_2_1 = ['Adaboost', precision_score(y_true,preds_2_1,pos_label='math'), recall_score(y_true,preds_2_1,pos_label='math')]
    model_3 = ['SVM', precision_score(y_true,preds_3,pos_label='math'), recall_score(y_true,preds_3,pos_label='math')]
    model_4 = ['Naive Bayes', precision_score(y_true,preds_4,pos_label='math'), recall_score(y_true,preds_4,pos_label='math')]

    result = pd.DataFrame([model_1,model_pre2,model_2,model_2_1,model_3,model_4],columns=['Model','Precision','Recall']).set_index('Model')
    return result

In [63]:
precision_recall(X_test_1,y_test_1)

Unnamed: 0_level_0,Precision,Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Logistic Regression,0.866564,0.904
Decision Tree,0.779221,0.864
Random Forest,0.830051,0.9104
Adaboost,0.804878,0.924
SVM,0.858315,0.9208
Naive Bayes,0.834418,0.9232


In [879]:
# random pick some 100 data from each subreddit at a given time and return the precision and recall of each model
# in the example bettwen, choosed 2 time periods

def evaluate_models():
    random.seed(42)
    # 11/24/2020 - 3/29/2021
    random_time_1 = random.choice(range(1606228102,1617024502))
    
    math_1 = get_submissions('math',1,random_time_1)
    physics_1 = get_submissions('Physics',1,random_time_1)
    
    math_1 = prepare(math_1)
    physics_1 = prepare(physics_1)
    
    data_1 =  pd.concat([math_1[['title_selftext_new','subreddit']],physics_1[['title_selftext_new','subreddit']]],axis=0,ignore_index=True)
    X_1 = data_1['title_selftext_new']
    y_1 = data_1['subreddit']
    
    # 3/1/2015 - 9/1/2015
    random_time_2 = random.choice(range(1425220102,1441114102))
    
    math_2 = get_submissions('math',1,random_time_2)
    physics_2 = get_submissions('Physics',1,random_time_2)
    
    math_2 = prepare(math_2)
    physics_2 = prepare(physics_2)
    
    data_2 =  pd.concat([math_2[['title_selftext_new','subreddit']],physics_2[['title_selftext_new','subreddit']]],axis=0,ignore_index=True)
    X_2 = data_2['title_selftext_new']
    y_2 = data_2['subreddit']

    df_1 = precision_recall(X_1,y_1)
    df_2 = precision_recall(X_2,y_2)
    
    return df_1,df_2


In [880]:
period_1 ,period_2 = evaluate_models()

In [881]:
period_1

Unnamed: 0_level_0,Precision,Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Logistic Regression,0.888889,0.96
Random Forest,0.934579,1.0
Adaboost,0.843478,0.97


In [882]:
period_2

Unnamed: 0_level_0,Precision,Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Logistic Regression,0.923077,0.96
Random Forest,0.834783,0.96
Adaboost,0.770492,0.94
