# Import libraries

In [1]:
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import recall_score, precision_score

# Load data

In [2]:
df = pd.read_csv('../data/final.csv')

df['sewing'] = df['subreddit'].apply(lambda x: 1 if x == 'sewing' else 0)

X = df['combined_text']
y = df['sewing']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Instantiate models and compare the scores

In [4]:
# load model function
def load_model(filename):
    with open(filename, 'rb') as f:
        model = pickle.load(f)
        
    return model

In [5]:
# caluculate scores
def cal_scores(model_filename, test_X, test_y):
    scores = {}
    
    model = load_model(model_filename)
    
    scores['best_score'] = model.best_score_
    
    scores['test_score'] = model.best_estimator_.score(test_X, test_y)
    
    scores['recall'] = recall_score(model.best_estimator_.predict(test_X), test_y)
    
    scores['precision'] = precision_score(model.best_estimator_.predict(test_X), test_y)
    
    return scores

### Logistic regression with cvec

In [6]:
lr_cvec_scores = cal_scores('../model/classification_models/logreg_cvec_gs.pickle', X_test, y_test)

In [7]:
lr_cvec_scores

{'best_score': 0.9261016949152543,
 'test_score': 0.9024390243902439,
 'recall': 0.9402654867256637,
 'precision': 0.8603238866396761}

### KNN with cvec

In [8]:
knn_cvec_scores = cal_scores('../model/classification_models/knn_cvec_gs.pickle', X_test, y_test)

In [9]:
knn_cvec_scores

{'best_score': 0.8410169491525423,
 'test_score': 0.8150406504065041,
 'recall': 0.8861386138613861,
 'precision': 0.7246963562753036}

### Multinomial naive bayes with cvec

In [10]:
mnb_cvec_scores = cal_scores('../model/classification_models/mnb_cvec_gs.pickle', X_test, y_test)

In [11]:
mnb_cvec_scores

{'best_score': 0.9006779661016949,
 'test_score': 0.8953252032520326,
 'recall': 0.8654205607476636,
 'precision': 0.937246963562753}

### Random forest with cvec

In [12]:
rf_cvec_scores = cal_scores('../model/classification_models/rf_cvec_gs.pickle', X_test, y_test)

In [13]:
rf_cvec_scores

{'best_score': 0.9193220338983051,
 'test_score': 0.8922764227642277,
 'recall': 0.9641148325358851,
 'precision': 0.8157894736842105}

### Support vector machine with cvec

In [14]:
svc_cvec_scores = cal_scores('../model/classification_models/svc_cvec_gs.pickle', X_test, y_test)

In [15]:
svc_cvec_scores

{'best_score': 0.892542372881356,
 'test_score': 0.8699186991869918,
 'recall': 0.9399038461538461,
 'precision': 0.791497975708502}

### Logistic regression with tvec

In [16]:
lr_tvec_scores = cal_scores('../model/classification_models/logreg_tvec_gs.pickle', X_test, y_test)

In [17]:
lr_tvec_scores

{'best_score': 0.9169491525423729,
 'test_score': 0.9034552845528455,
 'recall': 0.9290322580645162,
 'precision': 0.8744939271255061}

### KNN with tvec

In [18]:
knn_tvec_scores = cal_scores('../model/classification_models/knn_tvec_gs.pickle', X_test, y_test)

In [19]:
knn_tvec_scores

{'best_score': 0.8525423728813559,
 'test_score': 0.8323170731707317,
 'recall': 0.853763440860215,
 'precision': 0.8036437246963563}

### Multinomial naive bayes with tvec

In [20]:
mnb_tvec_scores = cal_scores('../model/classification_models/mnb_tvec_gs.pickle', X_test, y_test)

In [21]:
mnb_tvec_scores

{'best_score': 0.9172881355932203,
 'test_score': 0.9024390243902439,
 'recall': 0.898,
 'precision': 0.9089068825910931}

### Gaussian naive bayes with tvec

I didn't use GridSearch for Gaussian naive bayes, but the best cross validation score was about 0.929 in '03_Modeling.ipynb'. This pickled model doesn't have best score in the object, so those scores are calculated without the function defined above.

In [23]:
gnb_tvec = load_model('../model/classification_models/gnb_tvec.pickle')

In [24]:
tvec_500 = TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(1,2))

tvec_500_train = tvec_500.fit_transform(X_train)
tvec_500_test = tvec_500.transform(X_test)

In [25]:
gnb_tvec_scores = {
    'best_score': 0.929,
    'test_score': gnb_tvec.score(tvec_500_test.todense(), y_test),
    'recall': recall_score(gnb_tvec.predict(tvec_500_test.todense()), y_test),
    'precision': precision_score(gnb_tvec.predict(tvec_500_test.todense()), y_test)
}

In [26]:
gnb_tvec_scores

{'best_score': 0.929,
 'test_score': 0.9217479674796748,
 'recall': 0.9503239740820735,
 'precision': 0.8906882591093117}

##### Save the vectorizer for an app

In [27]:
with open('../model/transformers/gnb_vectorizer.pickle', 'wb') as model:
        pickle.dump(tvec_500, model)

### Random forest with tvec

In [28]:
rf_tvec_scores = cal_scores('../model/classification_models/rf_tvec_gs.pickle', X_test, y_test)

In [29]:
rf_tvec_scores

{'best_score': 0.9183050847457627,
 'test_score': 0.8800813008130082,
 'recall': 0.9541062801932367,
 'precision': 0.7995951417004049}

### Support vector machine with tvec

In [31]:
svc_tvec_scores = cal_scores('../model/classification_models/svc_tvec_gs.pickle', X_test, y_test)

In [32]:
svc_tvec_scores

{'best_score': 0.9074576271186441,
 'test_score': 0.8973577235772358,
 'recall': 0.9225806451612903,
 'precision': 0.868421052631579}

### Make a score table

|  | Best score | Test score | Recall | Precision |
|:---------|:---------|:---------|:---------| |
|**Logistic regression with cvec** | 0.926 | 0.902 | 0.940 | 0.860 |
|**KNN with cvec** | 0.841 | 0.815 | 0.886 | 0.725 |
|**Mulitinoimal naive bayes with cvec** | 0.901 | 0.895 | 0.865 | 0.937 |
|**Random forest with cvec** | 0.919 | 0.892 | 0.964 | 0.816 |
|**SVC with cvec** | 0.893 | 0.870 | 0.940 | 0.791 |
|**Logistic regression with tvec** | 0.917 | 0.903 | 0.929 | 0.874 |
|**KNN with tvec**| 0.853 | 0.832 | 0.854 | 0.804 |
|**Multinomial naive bayes with tvec** | 0.917 | 0.902 | 0.898 | 0.909 |
|**Gaussian naive bayes with tvec**| 0.929 | 0.922 | 0.950 | 0.891 |
|**Random forest with tvec**| 0.918 | 0.880 | 0.954 | 0.800 |
|**SVC with tvec**| 0.907 | 0.897 | 0.923 | 0.868 |

# Summary

I chose the gaussian naive bayes as the best model, because it is the highest score for the test score and looks like it is not overfitting compared to other models.

The parameters of the model is TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(1,2) and GaussianNB(var_smoothing=4.45)

This projet is impartial for both subreddits since there is no weight on each subreddit unlike a spam filter. However, if this model for the spam filter, we want to prevent that a ham would go to a spam mail folder. In that case, we increase the precision score changing the model threshold and reducing the false positives. But the recall and precision is a trade-off relationship, so we have to care about both scores.

*I made an app to judge which subreddit a post is from. The code is in app folder.