Import libraries

In [1]:
import pandas as pd
import numpy as np
import regex as re


from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

In [2]:
# Read in the data from the machinelearning and datascience
clean = pd.read_csv('./data/clean.csv')


Unnamed: 0,title,selftext,clean_title,selftext_urls,title_urls,clean_selftext,created_utc,num_comments,num_crossposts,score,subreddit,Subreddit_name,merged
0,[D] Hinton responds to Schmidhuber,,hinton responds schmidhuber,,,,1587609168,0,0,1,MachineLearning,0,hinton responds schmidhuber
1,Hinton responds to Schmidhuber,,hinton responds schmidhuber,,,,1587609111,1,0,1,MachineLearning,0,hinton responds schmidhuber


In [3]:
clean.isnull().sum()

title                 0
selftext          16002
clean_title          30
selftext_urls     31661
title_urls        39720
clean_selftext    16007
created_utc           0
num_comments          0
num_crossposts        0
score                 0
subreddit             0
Subreddit_name        0
merged               20
dtype: int64

In [4]:
# If there are NA's in clean_title. Remove them
if clean.clean_title.isna().sum() > 0:
    clean.drop(labels = clean[clean.clean_title.isna()].index, inplace = True)

Define a function to clean out our data

#### Begin the modeling process

- Here lets create our X and y variables
- Fit our transformers and create our subsequent models. 

In [5]:
# Create X variable

X = clean['merged']
y = clean['Subreddit_name']

X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y)

To create a custom stopwords list, lets export sklearn's english stopwords, and add in our own list of stop words. 

In [6]:
# ### EDA - sklearn's stopwords, extracted
sklearn_stopwords = list(CountVectorizer(stop_words = 'english').get_stop_words())

#Custom created list
custom_stopwords = ['good','time','python','tool','source','best',
                    'learn','science',
                    'data','learning','science'] # most common words

# Personalized stopwords
personal_stopwords = sklearn_stopwords + custom_stopwords

### Instantiate TFIDF and create transformation

In [7]:
# Instantiate TFIDF
vec = TfidfVectorizer(stop_words = personal_stopwords, 
                      max_features = 173,
                      max_df = 0.8,
                      min_df = 5
                     )
vec.fit(X_train,y_train)
# Extract features names for future use
feature_names = vec.get_feature_names()

# Transform our data: Train/Test
V_train = vec.transform(X_train)
V_test = vec.transform(X_test)


# convert sparse matrix to dataframe
transformed_train_df = pd.DataFrame(V_train.toarray(), 
                             columns = feature_names)

# convert sparse matrix to dataframe
transformed_test_df = pd.DataFrame(V_test.toarray(), 
                             columns = feature_names)

In [8]:
print(transformed_train_df.shape)
print(transformed_test_df.shape)

(29837, 173)
(9946, 173)


### Logistic Regression

In [9]:
%%time
# Instantiate Logistic Regression
lr = LogisticRegression(penalty = 'l2', 
                        C = 0.8,
                        random_state = 42)
# Fit
lr.fit(transformed_train_df, y_train)

# Export coeficients to series
lr_coef = pd.Series(np.exp(lr.coef_[0]), index = feature_names)

# Print Train Scores
print(cross_val_score(lr, transformed_train_df, y_train))
print(f"Train Accuracy Score: {lr.score(transformed_train_df, y_train)}")
print()

# Print Train Scores
print(cross_val_score(lr, transformed_test_df, y_test))
print(f"Test Accuracy Score: {lr.score(transformed_test_df, y_test)}")

[0.77345845 0.7622319  0.76755489 0.76487347 0.7717446 ]
Train Accuracy Score: 0.7713912256594162

[0.75577889 0.75213675 0.76772247 0.75615887 0.78079437]
Test Accuracy Score: 0.7690528855821436
CPU times: user 5.83 s, sys: 374 ms, total: 6.2 s
Wall time: 3.94 s


### Apply StandardScaler
We will apply StandardScaler to our vectorized train and test data to input into SVC

In [10]:
# Instantiate StandardScaler
ss = StandardScaler()

# fit the transformation to train
ss.fit(transformed_train_df, y_train)

# transform train/test
SS_train = ss.transform(transformed_train_df)
SS_test = ss.transform(transformed_test_df)

### Logistic Regression with StandardScaler

In [11]:
# reapplying a Logistic Regresion
tr = LogisticRegression()

tr.fit(SS_train, y_train)
len(tr.coef_[0])

print(cross_val_score(lr, SS_train, y_train))
print(f"Train Accuracy Score: {lr.score(SS_train, y_train)}")
print()

# Print Train Scores
print(cross_val_score(lr, SS_test, y_test))
print(f"Test Accuracy Score: {lr.score(SS_test, y_test)}")

[0.77479893 0.76290214 0.76621418 0.76604659 0.77140942]
Train Accuracy Score: 0.7614036263699434

[0.75829146 0.75515334 0.76721971 0.74912016 0.77677225]
Test Accuracy Score: 0.7598029358536095


### SVC model



.

In [12]:
#%%time
#svc = SVC()
#svc.fit(SS_train, y_train)

#print(cross_val_score(svc, SS_train, y_train))
#print(f"Train Accuracy Score: {svc.score(SS_train, y_train)}")
#print()

## Print Train Scores
#print(cross_val_score(svc, SS_test, y_test))
#print(f"Test Accuracy Score: {svc.score(SS_test, y_test)}")

In [13]:


#nu_mod = {}
#nu_mod['transformer'] = 'cvec'
#for key, value in gs.best_params_.items():
#    nu_mod[key] = value
#    nu_mod['train'] = round(gs.score(X_train,y_train),3)
#    nu_mod['test'] = round(gs.score(X_test,y_test),3)
#    
#    nu_df = pd.DataFrame(nu_mod)
#    nu_df
#    
#nu_df = pd.DataFrame(nu_mod)
#nu_df
#stored_models = pd.concat([stored_models,nu_df])
#stored_models