# Predicting Tesla's stock price with different prediction algorithms

This exercise is inspired by the challenge presented in https://www.youtube.com/watch?v=JuLCL3wCEAk by Siraj Raval
and by "Hands-On Machine Learning with Scikit-Learn and TensorFlow" by Aurélien Géron.

The main idea is to train different machine learning models on Tesla's historical stock data as well as on the sentiment of news headlines and other data sources.



In [5]:
import numpy as np
import pandas as pd
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata
from sklearn.metrics import accuracy_score

In [None]:
# Sentiment analysis of New York Times headlines about Tesla.
# Load the pickled DataFrame; later cells read its 'headlines' and 'close'
# columns and iterate over its index one entry per day.
df = pd.read_pickle('./data/TSLA_stock_information.pkl')
df

In [None]:
# Reserve one column per VADER score ('compound', 'neg', 'neu', 'pos').
# The columns are assigned directly on df: the previous pd.concat call built a
# new frame and threw it away, so df never received them.
for column in ['compound', 'neg', 'neu', 'pos']:
    # Skip columns that already exist so re-running the cell keeps computed scores.
    if column not in df.columns:
        df[column] = np.nan
df

In [None]:
# VADER's SentimentIntensityAnalyzer is the class that provides
# polarity_scores(); the generic SentimentAnalyzer does not.
sentiment_analyzer = SentimentIntensityAnalyzer()

for day in df.index:
    # All headlines of one day are stored in a single string joined by '/_/'.
    headlines = df.loc[day, 'headlines'].split('/_/')
    totals = {}
    for headline in headlines:
        # Drop accents / non-ASCII characters, then decode back to text so the
        # analyzer receives a string rather than bytes.
        headline = (unicodedata.normalize('NFKD', headline)
                    .encode('ascii', 'ignore')
                    .decode('ascii'))
        scores = sentiment_analyzer.polarity_scores(headline)
        # Accumulate every score key across the day's headlines.
        for key, value in scores.items():
            totals[key] = totals.get(key, 0.0) + value
    # str.split always yields at least one element, so qty is never zero.
    qty = len(headlines)
    for key, total in totals.items():
        # Store the per-day average of each score.
        df.loc[day, key] = total / qty
        

In [None]:
# Remove the 'headlines' column from df in place.
del df['headlines']
# Disabled integer cast of the closing price:
#df['close'] = df['close'].apply(np.int64)

In [None]:
# Write the current state of df to a pickle file.
df.to_pickle('./data/TSLA_sentiment_information_.pkl')

In [None]:
# Split the data into sets

def add_lastday_label(X, y):
    """Append the previous row's label to each feature row.

    Row ``i`` of the result holds the features ``X[i + 1]`` followed by the
    label ``y[i]``, i.e. every sample gains the label of the sample before it.
    The first sample has no predecessor and is dropped, so the result has one
    row fewer than ``X``; callers must drop the first label as well
    (``y[1:]``) to keep features and targets aligned.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        y: array-like with n_samples labels, either flat or a single column.

    Returns:
        A float ndarray of shape (max(n_samples - 1, 0), n_features + 1).

    Raises:
        ValueError: if ``X`` is not 2-D or ``y`` does not have one label per
            row of ``X``.
    """
    X = np.asarray(X)
    # Accept both (n,) and (n, 1) label arrays.
    y = np.asarray(y).reshape(-1)
    if X.ndim != 2:
        raise ValueError('X must be a 2-D array, got %d dimension(s)' % X.ndim)
    n_rows, n_cols = X.shape
    if y.shape[0] != n_rows:
        raise ValueError('X has %d rows but y has %d labels' % (n_rows, y.shape[0]))

    new_set = np.zeros((n_rows, n_cols + 1))
    new_set[:, :-1] = X
    # Shift the labels down by one row so each row sees the previous label.
    new_set[1:, -1] = y[:-1]
    return new_set[1:, :]

def split(dates, df, X_attr, y_attr):
    """Cut ``df`` into feature/target arrays, one pair per date range.

    Args:
        dates: iterable of ``(start_date, end_date)`` pairs used as label
            slices on the index of ``df``; both ends are inclusive.
        df: DataFrame to slice.
        X_attr: list of column names used as features.
        y_attr: list of column names used as targets.

    Returns:
        A list with one ``[X_set, y_set]`` entry per date range, where both
        elements are 2-D NumPy arrays holding the selected columns.
    """
    splits = []
    for start_date, end_date in dates:
        # .loc replaces the removed .ix indexer for label-based slicing.
        parcial_df = df.loc[start_date:end_date]
        # [cols].values replaces the removed DataFrame.as_matrix(columns=...).
        X_set = parcial_df[X_attr].values
        y_set = parcial_df[y_attr].values
        splits.append([X_set, y_set])
    return splits

In [None]:



# NOTE(review): everything below is wrapped in a string literal, so none of it
# executes. It looks like an earlier manual train/val/test split; it calls
# add_lastday_price, which is not defined in the visible part of this file.
'''
train = df.ix[train_start_date : train_end_date]
val = df.ix[val_start_date : val_end_date]
test = df.ix[test_start_date:test_end_date]

X_train = train.as_matrix(columns=['neg', 'neu','pos'])
y_train = train.as_matrix(columns=['close'])

X_val = val.as_matrix(columns=['neg', 'neu','pos'])
y_val = val.as_matrix(columns=['close'])

X_test = test.as_matrix(columns=['neg', 'neu','pos'])
y_test = test.as_matrix(columns=['close'])

#adding last day close
X_train = add_lastday_price(X_train)
X_test = add_lastday_price(X_test)
'''


In [None]:
## First approach: test different models with default hyperparameters in order to identify the best one

In [None]:
## First model: Random Forest

from sklearn.ensemble import RandomForestRegressor
# Regression metrics: accuracy_score is a classification metric and rejects
# continuous targets such as closing prices.
from sklearn.metrics import mean_squared_error, r2_score

train_start_date = '20100629'
train_end_date = '20150629'

#val_start_date = '20150629'
#val_end_date = '20160629'

test_start_date = '20160630'
test_end_date = '20170911'

split_set = split([[train_start_date, train_end_date], [test_start_date, test_end_date]],
                  df, ['neg', 'neu', 'pos'], ['close'])

# Index the list returned by split(), not the split function itself.
X_train, y_train = split_set[0]
X_test, y_test = split_set[1]

# Add the previous day's close as an extra feature. add_lastday_label drops
# the first row (it has no previous day), so the first target is dropped too
# to keep features and targets aligned. ravel() gives the 1-D target that
# scikit-learn estimators expect.
X_train = add_lastday_label(X_train, y_train)
y_train = y_train[1:].ravel()
X_test = add_lastday_label(X_test, y_test)
y_test = y_test[1:].ravel()

# Random forest with default hyperparameters -> risk of overfitting

random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print('MSE: ', mean_squared_error(y_test, y_pred))
print('R^2: ', r2_score(y_test, y_pred))

# Gradient boosting regression with early stopping -> TODO

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

df_pred = pd.DataFrame(data=y_pred[0:], index = pd.date_range(test_start_date, test_end_date), columns=['predicted_close_price'])
df_y_test = df.ix[test_start_date:test_end_date]['close']
df_y_test.rename(columns={'close': 'actual_close_price'})


def plot_stock_dataframes(predictions_df, df_y_test):
    df_pred_plot = predictions_df.plot()
    df_pred_plot.set_xlabel('Dates')
    df_pred_plot.set_ylabel('Close Prices')
    fig = df_y_test.plot(ax = df_pred_plot).get_figure()
    
plot_stock_dataframes(predictions_df, df_y_test)


In [3]:
## Second Model: Regressor NN

In [4]:
## Third Model: Recurrent NN

In [None]:
## Fourth Model: DeepMind's WaveNet