# Importing Libraries 

In [1]:
import pandas as pd
import numpy as np
import sys

### Importing Data as pandas dataframe and preprocessing it

In [2]:
df = pd.read_csv("AMZN 2010-2020.csv",error_bad_lines=False)

In [3]:
df = df.drop('High',axis=1)
df = df.drop('Low',axis=1)
df = df.drop('Adj Close',axis=1)
df = df.drop('Volume',axis=1)

In [4]:
df.describe()

Unnamed: 0,Open,Close
count,2517.0,2517.0
mean,755.984549,755.999992
std,633.737848,633.711304
min,105.93,108.610001
25%,253.899994,253.369995
50%,430.070007,429.920013
75%,1141.0,1152.349976
max,2500.0,2497.939941


In [5]:
df.head()

Unnamed: 0,Date,Open,Close
0,2010-06-01,124.970001,123.239998
1,2010-06-02,124.019997,126.309998
2,2010-06-03,126.25,128.759995
3,2010-06-04,126.330002,122.769997
4,2010-06-07,125.839996,122.010002


### Setting up the parameters for the model 

In [6]:
df['Difference'] = df['Close'] - df['Open']

In [7]:
df['Change Mod'] = df['Difference'].diff(-1).abs() / df['Open']

In [8]:
df.head()

Unnamed: 0,Date,Open,Close,Difference,Change Mod
0,2010-06-01,124.970001,123.239998,-1.730003,0.032168
1,2010-06-02,124.019997,126.309998,2.290001,0.001774
2,2010-06-03,126.25,128.759995,2.509995,0.048079
3,2010-06-04,126.330002,122.769997,-3.560005,0.002137
4,2010-06-07,125.839996,122.010002,-3.829994,0.005324


In [9]:
df.describe()

Unnamed: 0,Open,Close,Difference,Change Mod
count,2517.0,2517.0,2517.0,2516.0
mean,755.984549,755.999992,0.015443,0.016134
std,633.737848,633.711304,15.161938,0.014638
min,105.93,108.610001,-121.119995,2.7e-05
25%,253.899994,253.369995,-3.679993,0.005592
50%,430.070007,429.920013,0.030029,0.012107
75%,1141.0,1152.349976,4.059997,0.022352
max,2500.0,2497.939941,128.870117,0.120835


### Setting up the range  

In [10]:
criteria = [df['Difference'].between(-122, -df['Change Mod']*df['Open']/2), df['Difference'].between( -df['Change Mod']*df['Open']/2, df['Change Mod']*df['Open']/2), df['Difference'].between(df['Change Mod']*df['Open']/2,128)]
values = ['Down', 'Stagnant', 'Up']

df['Movement'] = np.select(criteria, values, 'Stagnant')

In [11]:
df.head()

Unnamed: 0,Date,Open,Close,Difference,Change Mod,Movement
0,2010-06-01,124.970001,123.239998,-1.730003,0.032168,Stagnant
1,2010-06-02,124.019997,126.309998,2.290001,0.001774,Up
2,2010-06-03,126.25,128.759995,2.509995,0.048079,Stagnant
3,2010-06-04,126.330002,122.769997,-3.560005,0.002137,Down
4,2010-06-07,125.839996,122.010002,-3.829994,0.005324,Down


### Showing number of States  

In [12]:
df['Movement'].value_counts()

Stagnant    892
Up          847
Down        778
Name: Movement, dtype: int64

In [13]:
z = list(df['Movement'])

### Transition Probability Matrix 

In [14]:
pd.crosstab(pd.Series(z[1:],name='Tomorrow'),
            pd.Series(z[:-1],name='Today'),normalize=0)

Today,Down,Stagnant,Up
Tomorrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Down,0.352185,0.42545,0.222365
Stagnant,0.392817,0.205387,0.401796
Up,0.181818,0.4451,0.373081


In [15]:
from itertools import islice

def window(seq, n=2):
    "Sliding window width n from seq.  From old itertools recipes."""
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

In [16]:
import pandas as pd

pairs = pd.DataFrame(window(z), columns=['Today', 'Tomorrow'])
counts = pairs.groupby('Today')['Tomorrow'].value_counts()
probs = (counts).unstack()

### Transition Count Matrix

In [17]:
probs

Tomorrow,Down,Stagnant,Up
Today,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Down,274,350,154
Stagnant,331,183,377
Up,173,358,316


# End