# Import libraries

In [1]:
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Load data

In [2]:
df = pd.read_csv('../data/final.csv')

# Create a column for classification

In [3]:
df['sewing'] = df['subreddit'].apply(lambda x: 1 if x == 'sewing' else 0)

In [4]:
df['sewing'].value_counts(normalize=True)

1    0.502288
0    0.497712
Name: sewing, dtype: float64

In [5]:
df.head()

Unnamed: 0,subreddit,selftext,title,combined_text,sewing
0,sewing,none text,just finish thi late nowher near accur s inspi...,none text just finish thi late nowher near acc...,1
1,sewing,none text,too excit to not share almost finish with the ...,none text too excit to not share almost finish...,1
2,sewing,none text,a differ view of the thi piec set i made to we...,none text a differ view of the thi piec set i ...,1
3,sewing,none text,i made thi piec set for th of juli ruch detail...,none text i made thi piec set for th of juli r...,1
4,sewing,none text,first sew project sometim you gotta leap befor...,none text first sew project sometim you gotta ...,1


# Create X, y values

In [6]:
X = df['combined_text']
y = df['sewing']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

# Modeling with CountVectorizer

### Logistic regression

In [13]:
lr_pipe_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('lr', LogisticRegression())
])

lr_pipe_cvec_params = {
    'cvec__max_features': [100, 500],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': ['english'],
    'lr__C': [i for i in range(1, 101, 5)],
    'lr__penalty': ['l1', 'l2']
}

In [14]:
lr_cvec_gs = GridSearchCV(lr_pipe_cvec, param_grid=lr_pipe_cvec_params, cv=5)

In [15]:
lr_cvec_gs.fit(X_train, y_train)











































GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [16]:
lr_cvec_gs.best_score_

0.9261016949152543

In [17]:
lr_cvec_gs.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'lr__C': 1,
 'lr__penalty': 'l1'}

In [18]:
lr_cvec_gs.best_estimator_.score(X_test, y_test)

0.9024390243902439

##### Save the model

In [19]:
# A function to save models
def pickling(filename, model_obj):
    with open(filename, 'wb') as model:
        pickle.dump(model_obj , model)

In [20]:
pickling('../model/classification_models/logreg_cvec_gs.pickle', lr_cvec_gs)

#### K-neighbors

In [21]:
knn_pipe_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('knn', KNeighborsClassifier())
])

knn_pipe_cvec_params = {
    'cvec__max_features': [100, 500],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': ['english'],
    'knn__n_neighbors': [i for i in range(1, 101, 5)],
    'knn__weights': ['uniform', 'distance'],
}

In [22]:
knn_cvec_gs = GridSearchCV(knn_pipe_cvec, param_grid=knn_pipe_cvec_params, cv=5)

In [23]:
knn_cvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [24]:
knn_cvec_gs.best_score_

0.8410169491525423

In [25]:
knn_cvec_gs.best_params_

{'cvec__max_features': 100,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'knn__n_neighbors': 71,
 'knn__weights': 'distance'}

##### Save the model

In [26]:
pickling('../model/classification_models/knn_cvec_gs.pickle', knn_cvec_gs)

### Multinomial naive bayes

In [27]:
mnb_pipe_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('mnb', MultinomialNB())
])

mnb_pipe_cvec_params = {
    'cvec__max_features': [100, 500],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': ['english'],
    'mnb__alpha': [i * 0.01 for i in range(1, 101)]
}

In [28]:
mnb_cvec_gs = GridSearchCV(mnb_pipe_cvec, param_grid=mnb_pipe_cvec_params, cv=5)

In [29]:
mnb_cvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [30]:
mnb_cvec_gs.best_score_

0.9006779661016949

In [31]:
mnb_cvec_gs.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english',
 'mnb__alpha': 0.01}

##### Save the model

In [32]:
pickling('../model/classification_models/mnb_cvec_gs.pickle', mnb_cvec_gs)

### Random forest

In [33]:
rf_pipe_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())
])

rf_pipe_cvec_params = {
    'cvec__max_features': [100, 500],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': ['english'],
    'rf__n_estimators': [10, 30, 50, 70, 100, 130, 150],
    'rf__max_depth': [2, 3, 4, 5, 6, 7],
    'rf__min_samples_leaf': [1, 2, 3, 5, 7, 10]
}

In [34]:
rf_cvec_gs = GridSearchCV(rf_pipe_cvec, param_grid=rf_pipe_cvec_params, cv=5)

In [35]:
rf_cvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [36]:
rf_cvec_gs.best_score_

0.9193220338983051

In [37]:
rf_cvec_gs.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'rf__max_depth': 7,
 'rf__min_samples_leaf': 2,
 'rf__n_estimators': 130}

In [38]:
rf_cvec_gs.best_estimator_.score(X_test, y_test)

0.8922764227642277

##### Save the model

In [39]:
pickling('../model/classification_models/rf_cvec_gs.pickle', rf_cvec_gs)

### Support vector machine

In [77]:
svc_pipe_cvec = Pipeline([
    ('cvec', CountVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('svc', SVC())
])

svc_pipe_cvec_params = {
    'cvec__max_features': [100, 500],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': ['english'],
    'svc__gamma': ['scale'],
    'svc__C': [c**0.01 for c in range(1, 1001, 10)]
}

In [78]:
svc_cvec_gs = GridSearchCV(svc_pipe_cvec, param_grid=svc_pipe_cvec_params, cv=5)

In [79]:
svc_cvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [80]:
svc_cvec_gs.best_score_

0.892542372881356

##### Save the model

In [86]:
pickling('../model/classification_models/svc_cvec_gs.pickle', svc_cvec_gs)

# Modeling with TfidfVectorizer

### Logistic regression

In [40]:
lr_pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('lr', LogisticRegression())
])

lr_pipe_tvec_params = {
    'tvec__max_features': [100, 500],
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': ['english'],
    'lr__C': [i for i in range(1, 101, 5)],
    'lr__penalty': ['l1', 'l2']
}

In [41]:
lr_tvec_gs = GridSearchCV(lr_pipe_tvec, param_grid=lr_pipe_tvec_params, cv=5)

In [42]:
lr_tvec_gs.fit(X_train, y_train)









































GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [43]:
lr_tvec_gs.best_score_

0.9169491525423729

In [44]:
lr_tvec_gs.best_params_

{'lr__C': 1,
 'lr__penalty': 'l1',
 'tvec__max_features': 500,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

##### Save the model

In [45]:
pickling('../model/classification_models/logreg_tvec_gs.pickle', lr_tvec_gs)

### K-neighbors

In [46]:
knn_pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('knn', KNeighborsClassifier())
])

knn_pipe_tvec_params = {
    'tvec__max_features': [100, 500],
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': ['english'],
    'knn__n_neighbors': [i for i in range(1, 101, 5)],
    'knn__weights': ['uniform', 'distance'],
}

In [47]:
knn_tvec_gs = GridSearchCV(knn_pipe_tvec, param_grid=knn_pipe_tvec_params, cv=5)

In [48]:
knn_tvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [49]:
knn_tvec_gs.best_score_

0.8525423728813559

In [50]:
knn_tvec_gs.best_params_

{'knn__n_neighbors': 56,
 'knn__weights': 'distance',
 'tvec__max_features': 100,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

##### Save the model

In [51]:
pickling('../model/classification_models/knn_tvec_gs.pickle', knn_tvec_gs)

### Multinomial naive bayes

In [52]:
mnb_pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('mnb', MultinomialNB())
])

mnb_pipe_tvec_params = {
    'tvec__max_features': [100, 500],
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': ['english'],
    'mnb__alpha': [i * 0.01 for i in range(1, 101)]
}

In [53]:
mnb_tvec_gs = GridSearchCV(mnb_pipe_tvec, param_grid=mnb_pipe_tvec_params, cv=5)

In [54]:
mnb_tvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [55]:
mnb_tvec_gs.best_score_

0.9172881355932203

In [56]:
mnb_tvec_gs.best_params_

{'mnb__alpha': 0.01,
 'tvec__max_features': 500,
 'tvec__ngram_range': (1, 1),
 'tvec__stop_words': 'english'}

##### Save the model

In [57]:
pickling('../model/classification_models/mnb_tvec_gs.pickle', mnb_tvec_gs)

### Gaussian naive bayse

##### tvec max feature is 100

In [58]:
tvec_100 = TfidfVectorizer(max_features=100, stop_words='english', ngram_range=(1,2))

tvec_100_train = tvec_100.fit_transform(X_train)
tvec_100_test = tvec_100.transform(X_test)

In [59]:
gnb_tvec_100_s = None
cross_val_max_100 = 0
for s in range(1, 1001):
    gnb = GaussianNB(var_smoothing=(s*0.01))
    cross_val_temp = cross_val_score(gnb, tvec_100_train.todense(), y_train, cv=5).mean()
    if cross_val_temp >= cross_val_max_100:
        cross_val_max_100 = cross_val_temp
        gnb_tvec_100_s = s
    
print(gnb_tvec_100_s)
print(cross_val_max_100)

78
0.8837193500524227


##### tvec max feature is 500

In [8]:
tvec_500 = TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(1,2))

tvec_500_train = tvec_500.fit_transform(X_train)
tvec_500_test = tvec_500.transform(X_test)

In [61]:
gnb_tvec_500_s = None
cross_val_max_500 = 0
for s in range(1, 1001):
    gnb = GaussianNB(var_smoothing=(s*0.01))
    cross_val_temp = cross_val_score(gnb, tvec_500_train.todense(), y_train, cv=5).mean()
    if cross_val_temp >= cross_val_max_500:
        cross_val_max_500 = cross_val_temp
        gnb_tvec_500_s = s
    
print(gnb_tvec_500_s)
print(cross_val_max_500)

445
0.9291558241199744


##### Save tvec 500 model

In [62]:
gnb_tvec = GaussianNB(var_smoothing=4.45)
gnb_tvec.fit(tvec_500_train.todense(), y_train)

GaussianNB(priors=None, var_smoothing=4.45)

In [63]:
gnb_tvec.score(tvec_500_train.todense(), y_train)

0.9376271186440678

In [65]:
pickling('../model/classification_models/gnb_tvec.pickle', gnb_tvec)

### Random forest

In [66]:
rf_pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

rf_pipe_tvec_params = {
    'tvec__max_features': [100, 500],
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': ['english'],
    'rf__n_estimators': [10, 30, 50, 70, 100, 130, 150],
    'rf__max_depth': [2, 3, 4, 5, 6, 7],
    'rf__min_samples_leaf': [1, 2, 3, 5, 7, 10]
}

In [67]:
rf_tvec_gs = GridSearchCV(rf_pipe_tvec, param_grid=rf_pipe_tvec_params, cv=5)

In [68]:
rf_tvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [69]:
rf_tvec_gs.best_score_

0.9183050847457627

##### Save the model

In [70]:
pickling('../model/classification_models/rf_tvec_gs.pickle', rf_tvec_gs)

### Support vector machine

In [81]:
svc_pipe_tvec = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('ss', StandardScaler(with_mean=False)),
    ('svc', SVC())
])

svc_pipe_tvec_params = {
    'tvec__max_features': [100, 500],
    'tvec__ngram_range': [(1,1), (1,2)],
    'tvec__stop_words': ['english'],
    'svc__gamma': ['scale'],
    'svc__C': [c**0.01 for c in range(1, 1001, 10)]
}

In [82]:
svc_tvec_gs = GridSearchCV(svc_pipe_tvec, param_grid=svc_pipe_tvec_params, cv=5)

In [83]:
svc_tvec_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [84]:
svc_tvec_gs.best_score_

0.9074576271186441

##### Save the model

In [85]:
pickling('../model/classification_models/svc_tvec_gs.pickle', svc_tvec_gs)

##### Modeling is done. On next notebook, models are going to be evaluated