# Data Preperation

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests
import time

#For Modeling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


%matplotlib inline

#### Maximize DataFrame Display Columns

In [4]:
#Remove the max column setting in pandas

pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Load .CSV

In [5]:
funny_df = pd.read_csv('./datasets/raw_funny.csv')
climate_df = pd.read_csv('./datasets/raw_climate.csv')

In [6]:
bernie_df = pd.read_csv('./datasets/raw_bernie.csv')
butti_df = pd.read_csv('./datasets/raw_butti.csv')
kamala_df = pd.read_csv('./datasets/raw_kamala.csv')
warren_df = pd.read_csv('./datasets/raw_warren.csv')

### Prepare for EDA

In [7]:
#Create modified DataFrames for the test Subreddits
new_bernie_df = bernie_df[['title', 'selftext']]
new_butti_df = butti_df[['title', 'selftext']]
new_kamala_df = kamala_df[['title', 'selftext']]
new_warren_df = warren_df[['title', 'selftext']]

#Create a new 'label' column that will be mapped to our target (y) variable
new_bernie_df['label'] = 'bernie'
new_butti_df['label'] = 'buttigieg'
new_kamala_df['label'] = 'kamala'
new_warren_df['label'] = 'warren'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.

In [8]:
#Combine dataframes into a single list for concatenation

list_of_dfs = [new_bernie_df, new_butti_df, new_kamala_df, new_warren_df]

In [26]:
for df in list_of_dfs:
    df = df['selftext'].fillna(' ')

In [27]:
#Combine the Bernie and Buttigieg DataFrames

df = pd.concat(list_of_dfs, ignore_index=True)

In [28]:
#Create a combined text column, 'all_text'
df['all_text'] = df['title'] + ' ' + df['selftext']

#Drop 'title' and 'body text' columns, leaving only the combined column
#df = df.drop(['selftext'], axis = 1)

#### Baseline Accuracy

In [31]:
df['label'].value_counts(normalize = True)

kamala       0.264316
bernie       0.262460
buttigieg    0.261930
warren       0.211294
Name: label, dtype: float64

In [33]:
df.head()

Unnamed: 0,title,selftext,label,all_text
0,Canvassing update! Where is everyone at now?,,bernie,Canvassing update! Where is everyone at now?
1,Colorized picture of Bernie Sanders being arre...,,bernie,Colorized picture of Bernie Sanders being arre...
2,Bernie Sanders had the best quote at the debate,,bernie,Bernie Sanders had the best quote at the debate
3,Bernie: DC is flooded. It was 90 degrees in Al...,,bernie,Bernie: DC is flooded. It was 90 degrees in Al...
4,Sanders and Ocasio-Cortez move to declare clim...,,bernie,Sanders and Ocasio-Cortez move to declare clim...


In [40]:
df.isnull().sum()

title       0
selftext    0
label       0
all_text    0
dtype: int64

In [41]:
df.to_csv('./datasets/reddit_politic.csv', index = False)

## Instantiate Features and Target Column

In [47]:
X = df['title']

In [57]:
y = df['label'].map({'bernie':0, 'buttigieg':1, 'kamala': 2, 'warren': 3})

## Import SpaCy and Instantiate

In [58]:
import spacy

In [59]:
nlp = spacy.load('en_core_web_sm')

### Import Spacy Stopwords and Custom Stopwords

In [60]:
#import SpaCy stopwords
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

print(f'There are: {len(spacy_stopwords)} built-in stop words')


There are: 326 built-in stop words


In [61]:
#Create custom list of stop words
reddit_stopwords = set()

reddit_stopwords.update(['Buttigieg','Pete','Bernie','Bern','Sanders','Mayor','Elizabeth', 'Warren', 'Kamala', 'Harris'])


print(f'There are: {len(reddit_stopwords)} built-in + custom stop words')

There are: 10 built-in + custom stop words


## Train Test Split

In [62]:
y.isnull().sum()

0

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, stratify = y)

In [71]:
type(X_train)

pandas.core.series.Series

### Back to Spacy

In [73]:
#Create a train and test dataframes that includes both X and Y

In [75]:
#Instantiate DataFrames
train_df = pd.DataFrame()
test_df = pd.DataFrame()

train_df['text'] = X_train
train_df['cat'] = y_train

test_df['text'] = X_test
test_df['cat'] = y_test

#### Set up train dataframe tuples

In [78]:
train_df['tuples'] = train_df.apply(
    lambda row: (row['text'],row['cat']), axis=1)
train = train_df['tuples'].tolist()
train[:1]

[('Can anyone help me find the interview where Pete describes his first date with Chasten?',
  1)]

#### Set up train dataframe tuples

In [80]:
test_df['tuples'] = test_df.apply(
    lambda row: (row['text'],row['cat']), axis=1)
test = test_df['tuples'].tolist()
test[:1]

[('Pete speaking after being denied entry to a child detention center in Florida this morning. "No more child prisons!"',
  1)]

### Spacy Documentation

In [81]:
#functions from spacy documentation
def load_data(limit=0, split=0.8):
    train_data = train
    np.random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

#("Number of texts to train from","t" , int)
n_texts=30000
#You can increase texts count if you have more computational power.

#("Number of training iterations", "n", int))
n_iter=10

In [82]:
# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe('textcat')

# add label to text classifier
textcat.add_label('POSITIVE')

# load the dataset
print("Loading data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
print("Using {} examples ({} training, {} evaluation)"
      .format(n_texts, len(train_texts), len(dev_texts)))
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

Loading food reviews data...
Using 30000 examples (2112 training, 528 evaluation)
