### Advanced ML prediction trades on internet company shares
- data importation
- data processing
- simple regressor model
- advanced optimization

In [None]:
import pandas as pd
from pandas_datareader import data
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

### Data importation

In [None]:
# Every company is downloaded over the same date window.
start_date = '2012-08-18'
end_date = '2019-12-31'


def fetch_prices(ticker):
    """Return the Yahoo price history of `ticker` between start_date and end_date."""
    return data.get_data_yahoo(ticker, start_date, end_date)


google_data = fetch_prices('GOOGL')
facebook_data = fetch_prices('FB')
alibaba_data = fetch_prices('BABA')
baidu_data = fetch_prices('BIDU')
sap_data = fetch_prices('SAP')
saleforce_data = fetch_prices('CRM')
vmware_data = fetch_prices('VMW')
adobe_data = fetch_prices('ADBE')
intuit_data = fetch_prices('INTU')
twitter_data = fetch_prices('TWTR')
paypal_data = fetch_prices('PYPL')



### Simple data processing 

In [None]:
# Build one "<company>_returns" column per company holding the day-over-day
# change of the closing price (today's close minus the previous close).
# The previous version computed previous-minus-current, which flipped the
# sign of every value and mirrored the cumulative plot of these columns.
close_by_company = {
    "google": google_data["Close"],
    "facebook": facebook_data["Close"],
    "alibaba": alibaba_data["Close"],
    "baidu": baidu_data["Close"],
    "sap": sap_data["Close"],
    "saleforce": saleforce_data["Close"],
    "vmware": vmware_data["Close"],
    "adobe": adobe_data["Close"],
    "intuit": intuit_data["Close"],
    "twitter": twitter_data["Close"],
    "paypal": paypal_data["Close"],
}
data_df = pd.DataFrame(
    {company + "_returns": close.diff() for company, close in close_by_company.items()}
)
# Keep only the rows where every company has a value; the first row of each
# series is NaN after diff(), and any row missing for one company is dropped too.
data_df = data_df.dropna(axis=0)
data_df.head()

In [None]:
# Plot the running sum of every column with matplotlib (via pandas)
cumulative_df = data_df.cumsum()
cumulative_df.plot(figsize=(20, 20))

In [None]:
# Visualise the pairwise correlation between columns as a seaborn heatmap
import seaborn as sns

correlation_matrix = data_df.corr()
sns.heatmap(correlation_matrix)

In [None]:
# Target y is the Facebook column; features X are all the remaining columns
target_column = "facebook_returns"
y = data_df[target_column]
X = data_df.drop(columns=[target_column])


In [None]:
# Chronological split: rows dated 2016 form the training set and rows dated
# 2017 form the test set.
# Rows are selected with .loc and a partial date string. Plain X["2016"] on a
# DataFrame is treated as a column lookup by current pandas and raises KeyError.
# Assumes X and y are indexed by a DatetimeIndex, as the year-string selection implies.
X_train, X_test = X.loc["2016"], X.loc["2017"]
y_train, y_test = y.loc["2016"], y.loc["2017"]

### Simple regressor model

In [None]:
# Baseline: an ElasticNet regressor left at its default hyper-parameters
from sklearn.linear_model import ElasticNet

model = ElasticNet()
model.fit(X_train, y_train)

# Display the model's score on the training set and on the test set
train_score_baseline = model.score(X_train, y_train)
test_score_baseline = model.score(X_test, y_test)
train_score_baseline, test_score_baseline

In [None]:
# Put the true test target next to the model's prediction for the test features
predicted_test = model.predict(X_test)
comparison = pd.DataFrame({"y_test": y_test, "prediction": predicted_test})

In [None]:
# Draw the true test target and the model's prediction on the same figure
comparison.plot(figsize=(20, 20))

In [None]:
# Check whether the baseline model over- or under-fits with a learning curve:
# the model is scored with 5-fold cross-validation on five training-set sizes
# ranging from 20% to 100% of the training data.
from sklearn.model_selection import learning_curve

N, train_score, val_score = learning_curve(model, X_train, y_train, train_sizes=np.linspace(0.2, 1.0, 5), cv=5)

# figsize belongs to the figure, not to the plotted line: passing it to
# plt.plot() raises, so the figure is created with the wanted size first.
fig, ax = plt.subplots(figsize=(20, 20))
ax.plot(N, train_score.mean(axis=1), label="train score")
ax.plot(N, val_score.mean(axis=1), label="validation score")
ax.set_xlabel("training set size")
ax.set_ylabel("mean score over the folds")
ax.legend();


### Advanced model optimization

In [None]:
# GridSearchCV cross-validates every combination of the candidate values below
from sklearn.model_selection import GridSearchCV

# Candidate values per ElasticNet hyper-parameter. np.arange excludes the
# stop value (up to floating-point rounding of the non-integer step).
alpha_candidates = np.arange(0.1, 0.9, 0.1)
l1_ratio_candidates = np.arange(0.1, 1., 0.1)
tol_candidates = np.arange(0.00005, 0.0005, 0.00005)

param_grid = dict(
    alpha=alpha_candidates,
    l1_ratio=l1_ratio_candidates,
    tol=tol_candidates,
)

grid = GridSearchCV(estimator=ElasticNet(), param_grid=param_grid, cv=5)

In [None]:
# Run the grid search configured above on the training set
grid.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search
grid.best_params_

In [None]:
# Display the fitted grid search's score on the training set and on the test set
train_score_tuned = grid.score(X_train, y_train)
test_score_tuned = grid.score(X_test, y_test)
train_score_tuned, test_score_tuned

In [None]:
# Learning curve of the tuned model, to check again for over- or under-fitting.
# The estimator selected by the grid search is used rather than the
# GridSearchCV object itself: learning_curve refits the estimator it is given
# for every fold and training size, so passing `grid` would rerun the whole
# grid search each time.
N, train_score, val_score = learning_curve(grid.best_estimator_, X_train, y_train, train_sizes=np.linspace(0.2, 1.0, 5), cv=5)

# figsize belongs to the figure, not to the plotted line: passing it to
# plt.plot() raises, so the figure is created with the wanted size first.
fig, ax = plt.subplots(figsize=(20, 20))
ax.plot(N, train_score.mean(axis=1), label="train score")
ax.plot(N, val_score.mean(axis=1), label="validation score")
ax.set_xlabel("training set size")
ax.set_ylabel("mean score over the folds")
ax.legend();