In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from skopt.space import Integer, Real, Categorical
from skopt import BayesSearchCV
from scipy.stats import uniform, loguniform

In [2]:
def metrics(tn, fp, fn, tp, metric = ['accuracy']):
    answers = {}
    
    if 'accuracy' in metric or 'all' in metric:
        answers['accuracy'] = (tp + tn) / (tn + fn + fp + tp)
    if 'sensitivity' in metric  or 'all' in metric:
        answers['sensitivity'] = tp / (tp + fn)
    if 'specificity' in metric  or 'all' in metric:
        answers['specificity'] = tn / (tn + fp)
    if 'f1' in metric or 'all' in metric:
        answers['f1'] = tp / (tp + .5*(fp + fn))

    return answers

In [3]:
df = pd.read_csv('data/cleaned_cmv&unpop_data')

In [4]:
df.drop(columns = ['Unnamed: 0'], inplace = True)

***Data Engineering***

In [5]:
df['subreddit'].value_counts(normalize = True)

unpopularopinion    0.563229
changemyview        0.436771
Name: subreddit, dtype: float64

- Set changemyview to be the positive class.

In [6]:
df['subreddit'] = df['subreddit'].map(lambda x: 1 if x == 'changemyview' else 0)

- Add sentiment analysis columns.

In [7]:
corpus = list(df['selftext'])

In [8]:
sia = SentimentIntensityAnalyzer()

In [9]:
sentiment = []    

for tweet in corpus:
    scores = sia.polarity_scores(tweet)
    sentiment.append(scores)

sents = pd.DataFrame(sentiment)

In [10]:
df = pd.concat([df, sents], axis = 1)

- Change column names so that there won't be a collision with vectorized columns.

In [11]:
df.rename(columns = {'author': 'author_username', 'id': 'author_id', 'selftext': 'post_text', 'score': 'post_score', 'subreddit': 'post_subreddit', 'title': 'post_title', 'neg': 'neg_sentiment', 'pos': 'pos_sentiment', 'neu': 'neu_sentiment', 'compound': 'comp_sentiment'}, inplace = True)

- Add training column for fitting purposes.

In [12]:
df['training_set'] = 0

In [13]:
x_list = df.columns.tolist()
x_list.remove('post_subreddit')
X = df[x_list]
y = df['post_subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [14]:
X_train['training_set'] = X_train['training_set'].map(lambda x: 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
train_total = pd.concat([X_train, y_train], axis = 1)
test_total = pd.concat([X_test, y_test], axis = 1)

- Identify best parameters for vectorizer. Code credit to lesson 'advanced hyperparameter search'

In [18]:
pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words = 'english')),
    ('mnb', MultinomialNB())
])

params = {
    'cvec__max_features': Integer(5000, 10000),
    'cvec__min_df': Integer(1, 5),
    'cvec__max_df': Real(.75,1, prior='uniform')
    #'cvec__ngram_range': [(1,1), (1,2)]
}

cvec_bs = BayesSearchCV(estimator = pipe,
                     search_spaces = params,
                     scoring = 'f1_weighted',
                     n_iter = 50,
                     cv = 5,
                     random_state=42)

In [19]:
cvec_bs.fit(X_train['post_text'], y_train)

BayesSearchCV(cv=5,
              estimator=Pipeline(steps=[('cvec',
                                         CountVectorizer(stop_words='english')),
                                        ('mnb', MultinomialNB())]),
              random_state=42, scoring='f1_weighted',
              search_spaces={'cvec__max_df': Real(low=0.75, high=1, prior='uniform', transform='identity'),
                             'cvec__max_features': Integer(low=5000, high=10000, prior='uniform', transform='identity'),
                             'cvec__min_df': Integer(low=1, high=5, prior='uniform', transform='identity')})

In [20]:
preds = cvec_bs.predict(X_test['post_text'])

In [21]:
cvec_bs.best_estimator_

Pipeline(steps=[('cvec',
                 CountVectorizer(max_df=0.7591845371145763, max_features=9916,
                                 min_df=2, stop_words='english')),
                ('mnb', MultinomialNB())])

In [22]:
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

In [23]:
metrics(tn, fp, fn, tp, metric = 'all')

{'accuracy': 0.7783115991763898,
 'sensitivity': 0.6802827965435978,
 'specificity': 0.8543570993296771,
 'f1': 0.7283431455004206}

- Vectorize words and add to dataframe.

In [24]:
cvec = cvec_bs.best_estimator_['cvec']

In [26]:
Xcv_train = cvec.fit_transform(train_total['post_text'])
Xcv_test = cvec.transform(test_total['post_text'])

In [27]:
df_xcv_train = pd.DataFrame(Xcv_train.todense(), columns = cvec.get_feature_names())
df_xcv_test = pd.DataFrame(Xcv_test.todense(), columns = cvec.get_feature_names())

In [28]:
df_xcv_train.reset_index(inplace = True)
df_xcv_test.reset_index(inplace = True)
train_total.reset_index(inplace = True)
test_total.reset_index(inplace = True)

In [29]:
train_with_vecs = pd.concat([train_total, df_xcv_train], axis = 1)
test_with_vecs = pd.concat([test_total, df_xcv_test], axis = 1)

In [30]:
df = pd.concat([train_with_vecs, test_with_vecs], ignore_index = True)

In [31]:
df.reset_index(drop = True, inplace = True)

In [32]:
df.drop(columns = 'index', inplace = True)

- Save engineered file.

In [39]:
df.to_csv('data/engineered_cmv&unpop_data')

- On to part 4 ->