In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re

from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from nltk.tokenize import RegexpTokenizer

from skopt.space import Integer, Real, Categorical
from skopt import BayesSearchCV
from scipy.stats import uniform, loguniform

In [2]:
def metrics(tn, fp, fn, tp, metric = ['accuracy']):
    answers = {}
    
    if 'accuracy' in metric or 'all' in metric:
        answers['accuracy'] = (tp + tn) / (tn + fn + fp + tp)
    if 'sensitivity' in metric  or 'all' in metric:
        answers['sensitivity'] = tp / (tp + fn)
    if 'specificity' in metric  or 'all' in metric:
        answers['specificity'] = tn / (tn + fp)
    if 'f1' in metric or 'all' in metric:
        answers['f1'] = tp / (tp + .5*(fp + fn))

    return answers

In [3]:
df = pd.read_csv('data/cleaned_cmv&unpop_data')

In [4]:
df.drop(columns = ['Unnamed: 0'], inplace = True)

***Data Engineering***

In [5]:
df['subreddit'].value_counts(normalize = True)

unpopularopinion    0.563229
changemyview        0.436771
Name: subreddit, dtype: float64

- Set changemyview to be the positive class.

In [6]:
df['subreddit'] = df['subreddit'].map(lambda x: 1 if x == 'changemyview' else 0)

- Scrub URLs and CMVs.

In [7]:
def scrub_text(string):
    remove_cmv_url = r'^([Cc]{1}[Mm]{1}[Vv]{1}[:]{0,1})|([Ww]{3}[^\s]+)|([^\s]+[\.]{1}[Cc]{1}[Oo]{1}[Mm]{1})$'
    return re.sub(remove_cmv_url, '', string)

In [8]:
df['title'] = df['title'].map(scrub_text)
df['selftext'] = df['selftext'].map(scrub_text)

- Remove inserted moderator comments. Many change my view posts have "_____ gt hello user of cmv this is a footnote...[etc.]" or similar appended to the end of their selftext. 

In [9]:
def scrub_mod_comment(string):
    return string.split('_____ gt')[0].split('gt')[0]

In [10]:
df['selftext'] = df['selftext'].map(scrub_mod_comment)

- Lemmatize text fields.

In [11]:
lemmatizer = WordNetLemmatizer()

In [12]:
tokenizer = RegexpTokenizer('\w+')

In [13]:
def lemma_map(in_string):  
    to_lemma = tokenizer.tokenize(in_string)
    return " ".join([lemmatizer.lemmatize(token.lower()) for token in to_lemma])

In [14]:
df['title'] = df['title'].map(lemma_map)
df['selftext'] = df['selftext'].map(lemma_map)

- Add sentiment analysis columns.

In [15]:
corpus = zip(list(df['title']), list(df['selftext']))

In [16]:
sia = SentimentIntensityAnalyzer()

In [17]:
sentiment = []    

for title, tweet in corpus:
    scores = sia.polarity_scores(title + " " + tweet)
    sentiment.append(scores)

sents = pd.DataFrame(sentiment)

In [18]:
df = pd.concat([df, sents], axis = 1)

- Change column names so that there won't be a collision with vectorized columns.

In [19]:
df.rename(columns = {'author': 'author_username', 'id': 'author_id', 'selftext': 'post_text', 'score': 'post_score', 'subreddit': 'post_subreddit', 'title': 'post_title', 'neg': 'neg_sentiment', 'pos': 'pos_sentiment', 'neu': 'neu_sentiment', 'compound': 'comp_sentiment'}, inplace = True)

- Add training column for fitting purposes.

In [20]:
x_list = df.columns.tolist()
x_list.remove('post_subreddit')
X = df[x_list]
y = df['post_subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [21]:
train_total = pd.concat([X_train, y_train], axis = 1)
test_total = pd.concat([X_test, y_test], axis = 1)

- Identify best parameters for vectorizer. Code credit to lesson 'advanced hyperparameter search'
- Multinomial Naive Bayes is used as a fast, easily optimized model that works well on natural language data to pipe our vectorizer through.

In [22]:
pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words = 'english')),
    ('mnb', MultinomialNB())
])

params = {
    'cvec__max_features': Integer(100, 10000),
    'cvec__min_df': Integer(1, 5),
    'cvec__max_df': Real(.5,1, prior='uniform'),
    'mnb__alpha': Real(.001, 100, prior='log-uniform')
}

cvec_bs = BayesSearchCV(estimator = pipe,
                     search_spaces = params,
                     scoring = 'f1',
                     n_iter = 50,
                     n_jobs = 8,
                     cv = 5,
                     refit = True,
                     random_state=42)

In [23]:
train_text_post = X_train['post_title'] + " " + X_train['post_text']
test_text_post = X_test['post_title'] + " " + X_test['post_text']

In [24]:
cvec_bs.fit(train_text_post, y_train)

BayesSearchCV(cv=5,
              estimator=Pipeline(steps=[('cvec',
                                         CountVectorizer(stop_words='english')),
                                        ('mnb', MultinomialNB())]),
              n_jobs=8, random_state=42, scoring='f1',
              search_spaces={'cvec__max_df': Real(low=0.5, high=1, prior='uniform', transform='identity'),
                             'cvec__max_features': Integer(low=100, high=10000, prior='uniform', transform='identity'),
                             'cvec__min_df': Integer(low=1, high=5, prior='uniform', transform='identity'),
                             'mnb__alpha': Real(low=0.001, high=100, prior='log-uniform', transform='identity')})

In [25]:
cvec_bs.best_params_

OrderedDict([('cvec__max_df', 0.8468466496240046),
             ('cvec__max_features', 1063),
             ('cvec__min_df', 1),
             ('mnb__alpha', 0.001)])

- Train performance

In [26]:
preds = cvec_bs.predict(train_text_post)

In [27]:
tn, fp, fn, tp = confusion_matrix(y_train, preds).ravel()

In [28]:
metrics(tn, fp, fn, tp, metric = 'all')

{'accuracy': 0.7966140471288035,
 'sensitivity': 0.8017286537454165,
 'specificity': 0.7926482534524777,
 'f1': 0.7749367088607595}

- Test performance

In [27]:
preds = cvec_bs.predict(test_text_post)

In [29]:
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

In [30]:
metrics(tn, fp, fn, tp, metric = 'all')

{'accuracy': 0.7848318462594372,
 'sensitivity': 0.7894736842105263,
 'specificity': 0.7812309567336989,
 'f1': 0.7622298065984073}

- Vectorize words and add to dataframe.

In [31]:
cvec = cvec_bs.best_estimator_['cvec']

In [32]:
Xcv_train = cvec.fit_transform(train_total['post_title'] + " " + train_total['post_text'])
Xcv_test = cvec.transform(test_total['post_title'] + " " + test_total['post_text'])

In [33]:
df_xcv_train = pd.DataFrame(Xcv_train.todense(), columns = cvec.get_feature_names())
df_xcv_test = pd.DataFrame(Xcv_test.todense(), columns = cvec.get_feature_names())

In [34]:
df_xcv_train.reset_index(inplace = True)
df_xcv_test.reset_index(inplace = True)
train_total.reset_index(inplace = True)
test_total.reset_index(inplace = True)

In [35]:
train_with_vecs = pd.concat([train_total, df_xcv_train], axis = 1)
test_with_vecs = pd.concat([test_total, df_xcv_test], axis = 1)

In [36]:
df = pd.concat([train_with_vecs, test_with_vecs], ignore_index = True)

In [37]:
df.reset_index(drop = True, inplace = True)

In [38]:
df.drop(columns = 'index', inplace = True)

- Save engineered file.

In [39]:
df.to_csv('data/engineered_cmv&unpop_data')

- On to part 4 ->