# Stock Visualisation

In [1]:
import pandas as pd 
import numpy as np 
#import sqlite3 as sql
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm_notebook

# VISUALIZATIONS
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.express as px

# PCA & CLUSTERING
from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
RANDOM_STATE = 17 # Fixed random state for reproducibility

#### Retrieve data from database
- Pickle preserves the multidimensional components of the data, unlike SQLite

In [2]:
" Open pickle file "
# Load the pickled stock DataFrame. The context manager guarantees the file
# handle is closed even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("stock_index.pkl", 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [3]:
# Preview the first three rows of the loaded frame
df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,gsector,X1_REVGH,X2_EPS,X3_ROA,X4_ROE,X5_PE,X6_PS,X7_NPM,X8_GPM,X9_OM,...,X13_EM,X14_EVCFO,X15_LTDTA,X16_WCR,X17_DE,X18_QR,X19_DSI,X20_DPO,y_return,Date
Ticker,YQuarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,1999Q3,35,0.038308,0.36,0.026733,0.041925,119.270833,7.81804,0.064686,0.558218,0.117873,...,-32.860525,127.695785,0.0,2.219496,0.568323,1.130879,632.218004,168.644252,0.932748,1999-07-31
A,1999Q4,35,0.172976,0.39,0.026819,0.04317,279.807692,16.939338,0.059641,0.517157,0.083742,...,-32.860525,127.695785,0.0,2.1047,0.609698,1.212968,462.889171,157.48731,-0.402035,1999-10-31
A,2000Q1,35,-0.243873,0.2,0.018433,0.029202,365.0,17.82604,0.070773,0.141457,0.134454,...,-32.860525,127.695785,0.0,2.410256,0.584262,1.652153,321.930397,101.025014,-0.168378,2000-01-31


#### Scale Data 

In [5]:
# Standardize every numeric column, keeping the identifier columns untouched.
# NOTE(review): everything except the four identifier columns is scaled, which
# includes y_return as well as the X* features — confirm scaling the target is intended.
id_cols = ['Date', 'gsector', 'Ticker', 'YQuarter']

# Flatten the (Ticker, YQuarter) multi-index once and reuse the result below.
df_flat = df.reset_index()
numeric_part = df_flat.drop(columns=id_cols)

scaler = StandardScaler()
scaled_data = pd.DataFrame(
    scaler.fit_transform(numeric_part),
    columns=numeric_part.columns,
)
print(scaled_data.shape)

# Identifier columns, in the order they should appear in the final frame
df_ids = df_flat[id_cols]
print(df_ids.shape)

# Join identifiers with the scaled values (both share the same default row
# index after reset_index) and re-create the multi-index structure
df_scaled = pd.concat([df_ids, scaled_data], axis=1).set_index(['Ticker', 'YQuarter'])

print(df_scaled.shape)
df_scaled.head(3)

(79231, 21)
(79231, 4)
(79231, 23)


Unnamed: 0_level_0,Unnamed: 1_level_0,Date,gsector,X1_REVGH,X2_EPS,X3_ROA,X4_ROE,X5_PE,X6_PS,X7_NPM,X8_GPM,...,X12_CR,X13_EM,X14_EVCFO,X15_LTDTA,X16_WCR,X17_DE,X18_QR,X19_DSI,X20_DPO,y_return
Ticker,YQuarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,1999Q3,1999-07-31,35,-0.022931,-0.023659,0.347816,0.004063,0.109792,-0.004181,0.005723,0.044621,...,-0.092836,0.001886,0.010982,-1.204901,0.087653,-0.051386,-0.215562,0.077065,-0.071047,4.254025
A,1999Q4,1999-10-31,35,-0.022841,-0.022619,0.349842,0.004127,0.564879,-0.003706,0.005663,0.037921,...,-0.092836,0.001886,0.010982,-1.204901,0.031215,-0.050471,-0.174395,0.013787,-0.071265,-1.950252
A,2000Q1,2000-01-31,35,-0.02312,-0.029203,0.151998,0.003404,0.806381,-0.00366,0.005797,-0.023384,...,-0.011322,0.001886,0.010982,-1.204901,0.181438,-0.051034,0.045849,-0.038888,-0.072367,-0.864181


#### PCA 

In [10]:
# Build the PCA input once: every column of df_scaled except 'Date' and 'gsector'.
# NOTE(review): df_scaled still carries the scaled y_return column at this point,
# so it is part of the PCA input — confirm that is intended.
pca_input = df_scaled.drop(columns=['Date', 'gsector'])

# A float n_components asks PCA to keep enough components to explain that
# fraction of the variance (see the scikit-learn PCA documentation).
pca = PCA(n_components=0.9)
pca.fit(pca_input)
df_pca = pd.DataFrame(pca.transform(pca_input))

# Report the dimensions of the projected data
print("shape of df_pca", df_pca.shape)

shape of df_pca (79231, 14)
