### Finding the Distribution of Stock Returns

#### Libraries

In [143]:
import numpy as np
import pandas as pd
import math
import random
from yahoo_finance_api2 import share
from yahoo_finance_api2.exceptions import YahooFinanceError
from datetime import date

#### Getting Stock Data

In [145]:
# Fetches recent hourly price bars for a ticker symbol from Yahoo Finance
def get_stock_prices(company_symbol, start_date, end_date):
    """Download the last 5 days of 60-minute price bars for ``company_symbol``.

    Parameters
    ----------
    company_symbol : str
        Ticker symbol handed to ``share.Share`` (e.g. ``'SPY'``).
    start_date, end_date
        Kept for interface compatibility but NOT used: the request below is
        hard-wired to a 5-day period at 60-minute frequency, so the returned
        window does not depend on these arguments.

    Returns
    -------
    pandas.DataFrame
        A frame built from whatever ``get_historical`` returns.

    Notes
    -----
    On ``YahooFinanceError`` the error message is printed and ``sys.exit(1)``
    is called, i.e. ``SystemExit`` is raised.
    """
    # Local import: ``sys`` is not among the imports at the top of the
    # notebook, so without this the error path would die with a NameError
    # and hide the real Yahoo Finance failure.
    import sys

    my_share = share.Share(company_symbol)

    try:
        symbol_data = my_share.get_historical(share.PERIOD_TYPE_DAY,
                                              5,
                                              share.FREQUENCY_TYPE_MINUTE,
                                              60)
    except YahooFinanceError as e:
        print(e.message)
        sys.exit(1)

    return pd.DataFrame(symbol_data)

# Per-row price change, computed as opening price minus closing price
def get_price_movements(stock_prices):
    """Return ``stock_prices['open'] - stock_prices['close']`` element-wise.

    Note the sign convention: a positive value means the close was BELOW the
    open for that row, a negative value means the price rose.
    """
    opening = stock_prices['open']
    closing = stock_prices['close']
    return opening - closing


In [150]:

# Pull SPY price bars and attach the per-row open-minus-close change as 'Return'.
# The two dates are passed through, but get_stock_prices currently ignores them.
# (Offline alternative: load a saved copy with pd.read_csv('Data.csv').)
df = get_stock_prices('SPY', date(2019, 1, 1), date(2019, 1, 10))
df['Return'] = get_price_movements(df)
df.tail(5)

Unnamed: 0,timestamp,open,high,low,close,volume,Return
64,1576832400000,319.7,319.7,319.38,319.41,0,0.29
65,1576836000000,319.41,319.41,319.33,319.35,0,0.06
66,1576839600000,319.35,319.7,319.35,319.58,0,-0.23
67,1576843200000,319.57,319.61,319.46,319.56,0,0.01
68,1576846800000,319.58,319.83,319.38,319.79,0,-0.21


In [152]:
# Number of mixture components (clusters)
k = 3

# Starting parameters for the k Gaussians, one row per component:
#   means   - k values drawn at random (with replacement) from df['Return']
#   std     - k uniform random draws from [0, 1)
#   weights - the same mixing weight 1/k for every component
# No random seed is set here, so each run starts from different values.
initial_params = pd.DataFrame({
    'means': np.random.choice(df['Return'], k),
    'std': np.random.random_sample(size=k),
    'weights': np.full(k, 1/k),
})
initial_params


Unnamed: 0,means,std,weights
0,0.0095,0.743585,0.333333
1,-0.01,0.609162,0.333333
2,0.01,0.267929,0.333333


In [153]:

from scipy.stats import norm


def Guassian_prob(x,mean,std):
    """Density of a normal distribution N(mean, std) evaluated at ``x``.

    ``std`` is the standard deviation (scipy's ``scale``), not the variance.
    """
    density = norm.pdf(x, loc=mean, scale=std)
    return density

def Expectation_Maximization(df, initial_params,k, max_iterations):
    """Fit a k-component 1-D Gaussian mixture to ``df['Return']`` with EM.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``'Return'`` column. NaN entries are ignored.
    initial_params : pandas.DataFrame
        Starting values with columns ``'means'``, ``'std'`` and ``'weights'``,
        one row per component. Only the first ``k`` rows are used. This frame
        is NOT modified; the fit runs on a copy.
    k : int
        Number of mixture components.
    max_iterations : int
        Number of EM iterations to run (there is no early stopping).

    Returns
    -------
    pandas.DataFrame
        A copy of ``initial_params`` whose ``'weights'``, ``'means'`` and
        ``'std'`` columns hold the fitted values for the first ``k`` rows.

    Raises
    ------
    ValueError
        If ``initial_params`` has fewer than ``k`` rows.

    Notes
    -----
    Every 10th iteration (starting at 0) the current parameters are printed
    and shown with ``display``, which is expected to be available as it is in
    a Jupyter/IPython session.
    """
    # Work on a private copy so the caller's frame keeps its starting values;
    # otherwise re-running the calling cell would start from fitted parameters.
    params = initial_params.copy()
    if len(params) < k:
        raise ValueError(
            "initial_params has %d rows but k=%d components were requested"
            % (len(params), k)
        )
    component_index = params.index[:k]

    # Observations as a flat float array. NaNs are dropped because a single
    # NaN would otherwise propagate into every mean and std.
    returns = df['Return'].to_numpy(dtype=float)
    returns = returns[~np.isnan(returns)]
    n_obs = returns.size

    # Positional (not label-based) extraction, so any index on the frame works
    weights = params['weights'].to_numpy(dtype=float)[:k]
    means = params['means'].to_numpy(dtype=float)[:k]
    stds = params['std'].to_numpy(dtype=float)[:k]

    for t in range(max_iterations):

        if(t%10==0):
            print("this is ",t," iteration")
            display(params)

        # Expectation step: responsibilities gamma[n, j] = P(component j | x_n).
        # Broadcasting (n_obs, 1) against (k,) gives an (n_obs, k) matrix.
        likelihoods = weights * Guassian_prob(returns[:, None], means, stds)
        gammas = likelihoods / likelihoods.sum(axis=1, keepdims=True)

        # Maximization step
        # Effective number of observations assigned to each component
        resp_totals = gammas.sum(axis=0)
        # pi_j = N_j / N
        weights = resp_totals / n_obs
        # mu_j = sum_n gamma[n, j] * x_n / N_j
        means = (gammas * returns[:, None]).sum(axis=0) / resp_totals
        # sigma_j^2 = sum_n gamma[n, j] * (x_n - mu_j)^2 / N_j, using the NEW means
        variances = (gammas * (returns[:, None] - means) ** 2).sum(axis=0) / resp_totals
        stds = np.sqrt(variances)

        # Write back aligned on the first k row labels (any extra rows get NaN)
        params['weights'] = pd.Series(weights, index=component_index)
        params['means'] = pd.Series(means, index=component_index)
        params['std'] = pd.Series(stds, index=component_index)

    return params
    
    
# Run 100 EM iterations from the starting parameters and show the fitted result
l = Expectation_Maximization(df, initial_params, k, max_iterations=100)
l

this is  0  iteration


Unnamed: 0,means,std,weights
0,0.0095,0.743585,0.333333
1,-0.01,0.609162,0.333333
2,0.01,0.267929,0.333333


this is  10  iteration


Unnamed: 0,means,std,weights
0,-0.071475,0.261427,0.229527
1,-0.048086,0.246246,0.272043
2,-0.002592,0.180184,0.49843


this is  20  iteration


Unnamed: 0,means,std,weights
0,-0.102128,0.270821,0.223348
1,-0.031898,0.229892,0.265217
2,0.00096,0.183238,0.511436


this is  30  iteration


Unnamed: 0,means,std,weights
0,-0.132497,0.274008,0.215076
1,-0.009639,0.20965,0.264342
2,0.000511,0.189035,0.520582


this is  40  iteration


Unnamed: 0,means,std,weights
0,-0.148524,0.272202,0.204365
1,0.000746,0.201368,0.26759
2,-0.001184,0.192994,0.528045


this is  50  iteration


Unnamed: 0,means,std,weights
0,-0.158223,0.27173,0.193268
1,0.004911,0.199604,0.271354
2,-0.002862,0.19443,0.535378


this is  60  iteration


Unnamed: 0,means,std,weights
0,-0.166126,0.272119,0.182606
1,0.007618,0.199999,0.274998
2,-0.00468,0.194793,0.542397


this is  70  iteration


Unnamed: 0,means,std,weights
0,-0.173636,0.27273,0.172624
1,0.010504,0.20109,0.278447
2,-0.006795,0.194703,0.548929


this is  80  iteration


Unnamed: 0,means,std,weights
0,-0.181174,0.273307,0.163403
1,0.014116,0.202396,0.281718
2,-0.009284,0.194338,0.554879


this is  90  iteration


Unnamed: 0,means,std,weights
0,-0.188874,0.273749,0.154971
1,0.018667,0.203707,0.28487
2,-0.012187,0.193735,0.560159


Unnamed: 0,means,std,weights
0,-0.196786,0.274001,0.14734
1,0.024276,0.204876,0.288005
2,-0.015542,0.192883,0.564654
