## Setup

In [35]:
import numpy as np
import pandas as pd
import torch
import nltk
import zipfile
import fasttext
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from pathlib import Path
from InferSent.models import InferSent
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.dummy import DummyClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from wordcloud import WordCloud

nltk.download('punkt')
nltk.download('stopwords')
set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daddy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Daddy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [2]:
# clone or pull InferSent
if not Path("InferSent").is_dir():
    !git clone https://github.com/facebookresearch/InferSent.git
else:
    %cd InferSent
    !git pull
    %cd ..

# download and preprocess SNLI/MultiNLI datasets for fastText
if not Path("InferSent/dataset/fastText").exists():
    %cd Infersent/dataset
    %mkdir fastText
    %cd ..
    %cd ..
    
if not Path("InferSent/dataset/fastText/crawl-300d-2M.vec.zip").is_file():
    !curl -Lo dataset/fastText/crawl-300d-2M.vec.zip https://s3-us-west-1.amazonaws.com/fasttext-vectors/crawl-300d-2M.vec.zip

if not Path("InferSent/encoder/infersent2.pkl").is_file():
    !curl -Lo InferSent/encoder/infersent2.pkl https://s3.amazonaws.com/senteval/infersent/infersent2.pkl

D:\System\Gamenomicron\AI\MLND\project\InferSent
Updating 8aaaf40..940c003
D:\System\Gamenomicron\AI\MLND\project


error: Your local changes to the following files would be overwritten by merge:
	models.py
Please commit your changes or stash them before you merge.
Aborting


In [3]:
if not Path("InferSent/dataset/fastText/crawl-300d-2M.vec/crawl-300d-2M.vec").is_file():
    zipfile.ZipFile("InferSent/dataset/fastText/crawl-300d-2M.vec.zip").extractall(".").close()

In [4]:
# Load pretrained InferSent model

infersent_version = 2
INFERSENT_PATH = "InferSent/encoder/infersent%s.pkl" % infersent_version
infersent_params = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': infersent_version
}
infersent = InferSent(infersent_params)
infersent.load_state_dict(torch.load(INFERSENT_PATH))
# infersent.cuda()

In [5]:
# setting the word embeddings path to fastText
infersent.set_w2v_path("InferSent/dataset/fastText/crawl-300d-2M.vec/crawl-300d-2M.vec")

In [6]:
# Load embeddings of K most frequent words

infersent.build_vocab_k_words(K=100000)

Vocab size : 100000


## Loading the data

In [7]:
raw_data = pd.read_csv('data/Combined_News_DJIA.csv')
raw_data.columns = raw_data.columns.str.lower()
raw_data.head()

Unnamed: 0,date,label,top1,top2,top3,top4,top5,top6,top7,top8,...,top16,top17,top18,top19,top20,top21,top22,top23,top24,top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,b'U.S. troops still in Georgia (did you know t...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man..."
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,b'Elephants extinct by 2020?',b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...
4,2008-08-14,1,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",...,b'Bank analyst forecast Georgian crisis 2 days...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...


In [8]:
raw_data_dropped = raw_data.dropna()

In [9]:
# stop word removal function
def generate_stop_word_free_dataset(dataframe):
    df = dataframe.copy()
    # iterate over every column
    for column in df.columns:
        # convert column values into an array where each row is an element in the array
        sentences = df[column].tolist()
        # remove stop words
        if isinstance(sentences[0], str): # TODO: doesn't seem to be working properly
            stop_wordless_sentences = []
            for sentence in sentences:
                split_sentence = sentence.split()
                new_words = [word for word in split_sentence if word not in stopwords.words('english')]
                # reconstruct the sentence
                new_sentence = " ".join(new_words)
                stop_wordless_sentences.append(new_sentence)
            # recolumnize into dataframe
            df[column] = stop_wordless_sentences
    return(df)

In [10]:
for column in raw_data_dropped.columns:
    if "top" in column:
        raw_data_dropped[column] = raw_data_dropped[column].str.lower()

# creating our bearish holdout set for later testing
raw_data_dropped = raw_data_dropped.set_index(['date']) # sets index to the date column so we can split by date ranges
bearish_raw_data = raw_data_dropped.loc['2008-08-08':'2009-03-31']
bullish_raw_data = raw_data_dropped.loc['2009-04-01':'2016-07-01']

# resets all the indexes just because I feel like it
raw_data_dropped = raw_data_dropped.reset_index()
bearish_raw_data = bearish_raw_data.reset_index()
bullish_raw_data = bullish_raw_data.reset_index()
bullish_raw_data_labels = bullish_raw_data.pop('label')
bearish_raw_data_labels = bearish_raw_data.pop('label')
# pull date out so as to not learn historical patterns based on sequence
bullish_raw_data_dates = bullish_raw_data.pop('date')
bearish_raw_data_dates = bearish_raw_data.pop('date')

# generate stop wordless datasets
bullish_stop_words_free_data = generate_stop_word_free_dataset(bullish_raw_data)
bearish_stop_words_free_data = generate_stop_word_free_dataset(bearish_raw_data)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



## Visualizations

### Bullish Set Word Frequencies

In [11]:
headline_list= []
for column in bullish_stop_words_free_data.columns:
    if "top" in column:
        sentences = bullish_stop_words_free_data[column].tolist()
        headline_list += sentences
        
headline_list = pd.DataFrame(headline_list)
bullish_headlines_no_stopwords = headline_list[0].str.split(expand=True).unstack().value_counts()
data = [go.Bar(
            x = bullish_headlines_no_stopwords.index.values[2:25],
            y = bullish_headlines_no_stopwords.values[2:25],
            marker= dict(colorscale='Jet',color=bullish_headlines_no_stopwords.values[2:100]),
            text='Word counts')]

layout = go.Layout(title='Top 25 Bullish')
figure = go.Figure(data=data, layout=layout)
py.iplot(figure, filename='bullish-freq')

### Bearish Set Word Frequencies

In [12]:
headline_list= []
for column in bearish_stop_words_free_data.columns:
    if "top" in column:
        sentences = bearish_stop_words_free_data[column].tolist()
        headline_list += sentences
        
headline_list = pd.DataFrame(headline_list)
bearish_headlines_no_stopwords = headline_list[0].str.split(expand=True).unstack().value_counts()
data = [go.Bar(
            x = bearish_headlines_no_stopwords.index.values[2:25],
            y = bearish_headlines_no_stopwords.values[2:25],
            marker= dict(colorscale='Jet',color=bearish_headlines_no_stopwords.values[2:100]),
            text='Word counts')]

layout = go.Layout(title='Top 25 Bearish')
figure = go.Figure(data=data, layout=layout)
py.iplot(figure, filename='bearish-freq')

## Preprocessing

In [13]:
# def numpy_fillna(data):
#     # Get lengths of each row of data
#     lens = np.array([len(i) for i in data])

#     # Mask of valid places in each row
#     mask = np.arange(lens.max()) < lens[:,None]

#     # Setup output array and put elements from data into masked positions
#     out = np.zeros(mask.shape, dtype=data.dtype)
#     out[mask] = np.concatenate(data)
#     return out

In [14]:
def split_data(x_data, y_data, test_percent_size):
    return train_test_split(x_data, y_data, test_size=test_percent_size, random_state=42)

In [15]:
def convert_sentences_to_vector_mean(dataframe):
    df = dataframe.copy()
    # iterate over every column
    for column in dataframe.columns:
    #     with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #         print(df[column])
        # convert column values into a list where each row is an element in the list
        sentences = dataframe[column].tolist()
        sentence_vectors = infersent.encode(sentences)
    #     sentence_vectors_filled = numpy_fillna(sentence_vectors) # fill jagged array, just in case
        sentence_vector_means = np.mean(sentence_vectors, axis=1)
        df[column] = sentence_vector_means
    return df

# Bullish Dataset

### Splitting

In [16]:
# to avoid introducing unseen information from the test set into the training
# splitting data before preprocessing
bullish_x_train, bullish_x_test, bullish_y_train, bullish_y_test = split_data(bullish_raw_data, bullish_raw_data_labels, 0.2)
bullish_sw_free_x_train, bullish_sw_free_x_test, bullish_sw_free_y_train, bullish_sw_free_y_test = split_data(bullish_stop_words_free_data, bullish_raw_data_labels, 0.2)

### Vectorizing

In [17]:
bullish_x_train_encoded = convert_sentences_to_vector_mean(bullish_x_train)
bullish_x_test_encoded = convert_sentences_to_vector_mean(bullish_x_test)

bullish_sw_free_x_train_encoded = convert_sentences_to_vector_mean(bullish_sw_free_x_train)
bullish_sw_free_x_test_encoded = convert_sentences_to_vector_mean(bullish_sw_free_x_test)

### Scaling

In [42]:
with_stopwords_scaler = StandardScaler()
bullish_x_train_scaled = with_stopwords_scaler.fit_transform(bullish_x_train_encoded)
bullish_x_test_scaled = with_stopwords_scaler.transform(bullish_x_test_encoded)

without_stopwords_scaler = StandardScaler()
bullish_sw_free_x_train_scaled = without_stopwords_scaler.fit_transform(bullish_sw_free_x_train_encoded)
bullish_sw_free_x_test_scaled = without_stopwords_scaler.transform(bullish_sw_free_x_test_encoded)

## Training

In [50]:
bullish_model = SVC(gamma='auto')
bullish_model.fit(bullish_x_train_scaled, bullish_y_train)

bullish_sw_free_model = SVC(gamma='auto')
bullish_sw_free_model.fit(bullish_sw_free_x_train_scaled, bullish_sw_free_y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [51]:
bullish_y_pred = bullish_model.predict(bullish_x_test_scaled)
bullish_sw_free_y_pred = bullish_sw_free_model.predict(bullish_sw_free_x_test_scaled)

## Metrics

In [52]:
def calculate_metrics(y_true, y_pred):
    # accuracy
    accuracy = accuracy_score(y_true, y_pred)

    # precision
    precision = precision_score(y_true, y_pred)

    # recall
    recall = recall_score(y_true, y_pred)

    # f1
    f1 = f1_score(y_true, y_pred)

    # confusion matrix
    # confusion_matrix = confusion_matrix(y_true, y_pred)

    trace = go.Table(cells=dict(values=[["Accuracy", "Precision", "Recall", "F1"],
                                        [accuracy, precision, recall, f1]]))

    data = [trace] 
    return py.iplot(data, filename = 'metrics_table')

## NLP Model Results

### With Stopwords

In [53]:
calculate_metrics(bullish_y_test, bullish_y_pred)

### Without Stopwords

In [54]:
calculate_metrics(bullish_sw_free_y_test, bullish_sw_free_y_pred)

## Benchmark Models

In [24]:
benchmark_model_stratified = DummyClassifier(strategy='stratified', random_state=42)
benchmark_model_stratified.fit(bullish_x_train_scaled, bullish_y_train)

benchmark_model_constant = DummyClassifier(strategy='constant', random_state=42, constant=1)
benchmark_model_constant.fit(bullish_x_train_scaled, bullish_y_train)

benchmark_model_stratified = DummyClassifier(strategy='stratified', random_state=42)
benchmark_model_stratified.fit(bullish_sw_free_x_train_scaled, bullish_sw_free_y_train)

benchmark_model_constant = DummyClassifier(strategy='constant', random_state=42, constant=1)
benchmark_model_constant.fit(bullish_sw_free_x_train_scaled, bullish_sw_free_y_train)

DummyClassifier(constant=1, random_state=42, strategy='constant')

In [25]:
stratified_y_pred = benchmark_model_stratified.predict(bullish_x_test_scaled)
stratified_sw_free_y_pred = benchmark_model_stratified.predict(bullish_sw_free_x_test_scaled)

In [26]:
constant_y_pred = benchmark_model_constant.predict(bullish_x_test_scaled)
constant_sw_free_y_pred = benchmark_model_constant.predict(bullish_sw_free_x_test_scaled)

### Benchmark Results With Stopwords

In [27]:
calculate_metrics(bullish_y_test, stratified_y_pred)

In [28]:
calculate_metrics(bullish_y_test, constant_y_pred)

### Benchmark Results Without Stopwords

In [29]:
calculate_metrics(bullish_sw_free_y_test, stratified_sw_free_y_pred)

In [30]:
calculate_metrics(bullish_sw_free_y_test, constant_sw_free_y_pred)

### Class Distribution Visualization

In [31]:
trace = go.Bar(x=["Positive or Neutral", "Negative"],
               y=[np.count_nonzero(bullish_y_train==1),
                  len(bullish_y_train)-np.count_nonzero(bullish_y_train==1)])
data = [trace] 
py.iplot(data, filename='distribution_chart')

### Hyper-parameter Tuning

In [49]:
parameter_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 
                   'C': [1, 10, 100, 1000]},
                  {'kernel': ['linear'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                  {'kernel': ['poly'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                 {'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]

grid_search = GridSearchCV(bullish_sw_free_model,
                   parameter_grid, cv=5, scoring='precision', return_train_score=True)
    #roc_auc, 'recall', precision
grid_search.fit(bullish_sw_free_x_test_scaled, bullish_sw_free_y_test)
print(grid_search.best_estimator_)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


# Generalization to Bearish Holdout Set

## Preprocessing

In [61]:
# bearish_x_train, bearish_x_test, bearish_y_train, bearish_y_test =  split_data(bearish_stop_words_free_data,
#                                                                                bearish_raw_data_labels,
#                                                                                0.2)
bearish_encoded = convert_sentences_to_vector_mean(bearish_stop_words_free_data)

In [62]:
bearish_scaled = without_stopwords_scaler.transform(bearish_encoded)

In [63]:
bearish_y_pred = bullish_sw_free_model.predict(bearish_scaled)

In [64]:
calculate_metrics(bearish_raw_data_labels, bearish_y_pred)