In [59]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.base import TransformerMixin
from nltk.corpus import stopwords
from scripts import remove_links, find_hashtags

In [2]:
df_train = pd.read_csv('./data/train_nolinks.csv')

In [3]:
df_train.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'text_nolinks',
       'text_nl_hashtag', 'text_nl_ht_keyword'],
      dtype='object')

In [4]:
X = df_train['text_nl_hashtag']
y = df_train['target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [6]:
# Taken from Noah C.'s Week 5 Lab Review
# originally adapted from:
# https://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required

class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a sparse matrix into a dense array.

    Intended to sit between a step that emits sparse output and an estimator
    that only accepts dense input.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense array.

        Objects exposing ``toarray()`` (e.g. scipy sparse matrices) are
        densified exactly as before; anything else goes through
        ``np.asarray`` so already-dense input no longer raises
        ``AttributeError``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)
    

In [7]:
# Hyper-parameter grid searched for the Naive Bayes pipeline.
# Keys use the "<step>__<parameter>" form: "cv" is the CountVectorizer step and
# "tf" the TfidfTransformer step of the pipeline this grid is paired with.
params_nb = dict(
    cv__lowercase=[True, False],
    cv__stop_words=[None, 'english'],
    cv__max_features=[None, 8000],
    cv__ngram_range=[(1, 1), (1, 2)],
    cv__min_df=[1, 3, 5],
    cv__max_df=[.8],
    tf__use_idf=[False],
)

In [8]:
# Naive Bayes pipeline: token counts -> TF(-IDF) weighting -> MultinomialNB.
# The step names ('cv', 'tf', 'nb') are the prefixes used by the parameter grid keys.
pipe_nb = Pipeline(
    steps=[
        ('cv', CountVectorizer()),
        ('tf', TfidfTransformer()),
        ('nb', MultinomialNB()),
    ]
)

In [9]:
# Hyper-parameter grid searched for the logistic-regression pipeline.
# Keys use the "<step>__<parameter>" form: "cv" is the CountVectorizer step and
# "tf" the TfidfTransformer step of the pipeline this grid is paired with.
params_lr = dict(
    cv__lowercase=[True],
    cv__stop_words=[None, 'english'],
    cv__max_features=[None],
    cv__ngram_range=[(1, 1), (1, 2)],
    cv__min_df=[1],
    cv__max_df=[.6, .80, .9],
    tf__use_idf=[True, False],
)

In [10]:
# Logistic-regression pipeline: token counts -> TF(-IDF) weighting -> LogisticRegression.
# The step names ('cv', 'tf', 'lr') are the prefixes used by the parameter grid keys.
pipe_lr = Pipeline(
    steps=[
        ('cv', CountVectorizer()),
        ('tf', TfidfTransformer()),
        ('lr', LogisticRegression()),
    ]
)

## What is the baseline score?

In [11]:
y_train.value_counts(normalize=True)

0    0.570328
1    0.429672
Name: target, dtype: float64

Our baseline score is **approximately 57%**

In [12]:
# Code adapted from Patrick Wales-Dinan's demonstration on saving GridSearches
class GridSearchContainer:
    """Run grid searches against one fixed train/test split and keep the results.

    Attributes:
        X_train, X_test, y_train, y_test: stratified split of the data given
            to the constructor.
        model_params: maps ``"<mod_name>_<n>"`` to that search's best
            parameters plus its best cross-validated score, the score being
            stored under the scoring name (e.g. ``'f1'``).
        best_models: ``(best_estimator, best_cv_score)`` tuples in the order
            the searches were run.
        model_df: ``model_params`` as a DataFrame, one row per search, sorted
            by the score column of the most recent search (best first).
            Empty until the first search.
        count: number of searches run so far; keeps row labels unique.
    """

    def __init__(self, X, y, random_state=42):
        """Split ``X``/``y`` once so every search sees the same data.

        Args:
            X: features to split.
            y: labels; also used to stratify the split.
            random_state: seed for the split (42 reproduces the original
                behavior).
        """
        (self.X_train, self.X_test,
         self.y_train, self.y_test) = train_test_split(X, y,
                                                       random_state=random_state,
                                                       stratify=y)
        self.model_params = {}
        self.best_models = []
        # An actual (empty) frame, not the DataFrame class, so the attribute
        # is usable before the first search.
        self.model_df = pd.DataFrame()
        self.count = 0

    def search(self, estimator, params, mod_name='model', evaluator='f1',
               cv=5, n_jobs=-1, verbose=2):
        """Grid-search ``estimator`` on the training split and record the winner.

        Prints the train and test score of the best model, then updates
        ``model_params``, ``model_df``, ``best_models`` and ``count``.

        Args:
            estimator: estimator or pipeline to tune.
            params: parameter grid passed to ``GridSearchCV``.
            mod_name: label prefix for this search's row in ``model_df``.
            evaluator: value passed as ``GridSearchCV(scoring=...)``; also the
                name of the score column.
            cv, n_jobs, verbose: forwarded to ``GridSearchCV``; the defaults
                are the values that were previously hard-coded.

        Returns:
            None.
        """
        gs = GridSearchCV(estimator,
                          param_grid=params,
                          verbose=verbose,
                          cv=cv,
                          n_jobs=n_jobs,
                          scoring=evaluator)

        gs.fit(self.X_train, self.y_train)
        print(f"Train {evaluator}: {gs.score(self.X_train, self.y_train)}")
        print(f"Test {evaluator}: {gs.score(self.X_test, self.y_test)}")

        # Work on a copy: the score belongs in our record, not inside
        # GridSearchCV's own best_params_ dict.
        record = dict(gs.best_params_)
        record[evaluator] = gs.best_score_

        self.model_params[f'{mod_name}_{self.count}'] = record
        self.model_df = (
            pd.DataFrame.from_dict(self.model_params, orient='index')
            .sort_values(by=evaluator, ascending=False)
        )
        self.best_models.append((gs.best_estimator_, gs.best_score_))
        self.count += 1
            
    

In [13]:
gsc = GridSearchContainer(X, y)

In [14]:
gsc.search(pipe_nb, params_nb, 'nb_f1', 'f1')
# gsc.search(pipe_lr, params_lr, 'lr_f1', 'f1')

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    6.1s finished


Train f1: 0.8159181858603823
Test f1: 0.7480211081794195


In [15]:
gsc.model_df

Unnamed: 0,cv__lowercase,cv__max_df,cv__max_features,cv__min_df,cv__ngram_range,cv__stop_words,tf__use_idf,f1
nb_f1_0,True,0.8,,3,"(1, 1)",english,False,0.730895


In [16]:
gsc.best_models[-1][0]

Pipeline(steps=[('cv',
                 CountVectorizer(max_df=0.8, min_df=3, stop_words='english')),
                ('tf', TfidfTransformer(use_idf=False)),
                ('nb', MultinomialNB())])

In [17]:
best_nb = gsc.best_models[-1][0]

In [18]:
# NOTE(review): this refits the selected pipeline on the held-out split (X_test, y_test),
# so anything computed from `best_nb` on X_test afterwards is scored on data the model was
# trained on. `best_nb` is GridSearchCV's best_estimator_, which GridSearchCV refits on the
# training split by default (refit=True), so it is already fitted before this line.
# Left as-is because the word-importance cell further down pairs this model's coefficients
# with a vectorizer that is fit on X_test -- confirm and rework both together.
best_nb.fit(X_test, y_test)

Pipeline(steps=[('cv',
                 CountVectorizer(max_df=0.8, min_df=3, stop_words='english')),
                ('tf', TfidfTransformer(use_idf=False)),
                ('nb', MultinomialNB())])

### Creating Probabilities Dataframe

Now I want to create a dataframe that holds, for each observation (post), the predicted probability of it belonging to each `target` class (`0` or `1`).

In [19]:
def proba_df(model, X=None, y=None):
    '''
    Build a per-observation table of predicted class probabilities.

    The model is used exactly as passed in: it must already be fitted and it is
    NOT refit here (fitting on the data being scored would leak the labels into
    the probabilities and overwrite the caller's model).

    Parameters:
    - model: fitted classifier/pipeline with predict_proba, predict and classes_
    - X: pandas Series of posts to score; defaults to the notebook-level X_test
    - y: true labels aligned with X; defaults to the notebook-level y_test

    Returns a dataframe indexed like X containing:
    - one column per entry of model.classes_ (here 0 and 1): the predicted
      probability of that class
    - orig_post: the original post
    - target: true value
    - pred: the predicted value
    '''
    # Fall back to the notebook's held-out split so existing one-argument calls keep working.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    probabilities = model.predict_proba(X)
    result = pd.DataFrame(probabilities,
                          columns=model.classes_,  # Getting class names
                          index=X.index  # Keep the original row labels of X
                          )

    result['orig_post'] = X
    result['target'] = y
    result['pred'] = model.predict(X)

    return result

In [20]:
# NOTE(review): this rebinds the name `proba_df` from the function to its returned DataFrame,
# so the cell cannot be run a second time without first re-running the cell that defines the
# function. Consider a distinct name for the result (later cells read it as `proba_df`).
proba_df = proba_df(best_nb)

In [21]:
proba_df

Unnamed: 0,0,1,orig_post,target,pred
6509,0.874051,0.125949,if i survive tonight. i wouldn't change one th...,0,0
3768,0.772953,0.227047,i wanna set some shit on fire.,1,0
5507,0.778543,0.221457,reddit's new content policy goes into effect m...,1,0
5116,0.498482,0.501518,check out this awesome profile on #ge's swimmi...,0,1
18,0.575074,0.424926,my car is so fast,0,0
...,...,...,...,...,...
1280,0.858455,0.141545,it hurts for me to eat cause i burned my toung...,0,0
7206,0.902538,0.097462,so yeah splatoon is still lots of fun and defa...,0,0
12,0.213216,0.786784,#raining #flooding #florida #tampabay #tampa 1...,1,1
4078,0.127511,0.872489,rnk issues severe thunderstorm warning [wind: ...,1,1


In [22]:
wrong_preds_1 = proba_df[(proba_df['target'] != proba_df['pred']) & (proba_df['target'] == 1)]

In [23]:
abs(wrong_preds_1[0] - wrong_preds_1[1]).describe()

count    205.000000
mean       0.284105
std        0.211113
min        0.003462
25%        0.103602
50%        0.240797
75%        0.446237
max        0.832493
dtype: float64

In [35]:
# Stand-alone vectorizer using the same settings as the best NB pipeline's 'cv' step,
# fit directly on the held-out posts to obtain their document-term counts.
cvec = CountVectorizer(
    min_df=3,
    max_df=.8,
    stop_words='english',
)
X_test_cvec = cvec.fit_transform(X_test)

In [36]:
len(cvec.get_feature_names())

1409

In [37]:
# Summing the columns in the X_test_cvec array, thanks John Vinyard from
# Stack Overflow: https://stackoverflow.com/questions/13567345/how-to-calculate-the-sum-of-all-columns-of-a-2d-numpy-array-efficiently
# Sum on the sparse matrix itself instead of densifying it with .toarray() first.
# The sparse column sum comes back 2-D (1 x n_features), so flatten it to the same
# 1-D array of per-word counts as before.
word_freq = np.asarray(X_test_cvec.sum(axis=0)).ravel()

In [38]:
# Creating a DataFrame for word importance
# Take the vocabulary and the per-word likelihoods from the SAME fitted pipeline, so the
# index cannot drift from the model. (A separately fitted vectorizer only lines up with the
# model when it was fit on identical data with identical settings.)
nb_vocab = best_nb.named_steps['cv'].get_feature_names()
# feature_log_prob_[1] is log P(word | classes_[1]); for a two-class model this is the row
# that the deprecated MultinomialNB.coef_ exposed.
word_importance = pd.DataFrame(
    {'coefficient': np.exp(best_nb.named_steps['nb'].feature_log_prob_[1])},
    index=nb_vocab,
)
# Align the test-set counts on the word itself rather than on position; a word the
# test-set vectorizer did not keep gets a count of 0.
word_importance['testing_word_freq'] = (
    pd.Series(word_freq, index=cvec.get_feature_names())
    .reindex(word_importance.index, fill_value=0)
)

# Let's sort this by the Coefficient
word_importance_sorted = word_importance.sort_values(by='coefficient', ascending=False)

In [40]:
# Saving this for later
word_importance_sorted.to_csv('./data/word_importance.csv')

### Creating a submission for Kaggle

In [57]:
# Training on the full training set
best_nb.fit(X, y)

Pipeline(steps=[('cv',
                 CountVectorizer(max_df=0.8, min_df=3, stop_words='english')),
                ('tf', TfidfTransformer(use_idf=False)),
                ('nb', MultinomialNB())])

In [58]:
df_test = pd.read_csv('./data/test.csv')

In [61]:
df_test['text_nolinks'] = df_test['text'].apply(remove_links)

In [62]:
# NOTE(review): assuming find_hashtags returns a string, `+ ' '` appends a space and `* 8`
# repeats the resulting text eight times per row; the reason for 8 is not visible here.
# The training column of the same name is read pre-computed from train_nolinks.csv --
# confirm it was produced with this exact transformation.
df_test['text_nl_hashtag'] = (df_test['text_nolinks'].apply(find_hashtags) + ' ') * 8

In [63]:
X_testing = df_test['text_nl_hashtag']

In [69]:
y_pred = best_nb.predict(X_testing)

In [66]:
submission = pd.DataFrame(df_test['id'])

In [70]:
submission['target'] = y_pred

In [72]:
submission.to_csv('./data/submission_1.csv', index=False)