# Importing Libraries 

In [1]:
import pandas as pd
import numpy as np
import sys

### Importing Data as pandas dataframe and preprocessing it

In [2]:
df = pd.read_csv("AMZN 2010-2020.csv",error_bad_lines=False)

In [3]:
df = df.drop('High',axis=1)
df = df.drop('Low',axis=1)
df = df.drop('Adj Close',axis=1)
df = df.drop('Volume',axis=1)

In [4]:
df.describe()

Unnamed: 0,Open,Close
count,2517.0,2517.0
mean,755.984549,755.999992
std,633.737848,633.711304
min,105.93,108.610001
25%,253.899994,253.369995
50%,430.070007,429.920013
75%,1141.0,1152.349976
max,2500.0,2497.939941


In [5]:
df.head()

Unnamed: 0,Date,Open,Close
0,2010-06-01,124.970001,123.239998
1,2010-06-02,124.019997,126.309998
2,2010-06-03,126.25,128.759995
3,2010-06-04,126.330002,122.769997
4,2010-06-07,125.839996,122.010002


### Setting up the parameters for the model 

In [6]:
df['Difference'] = df['Close'].diff(-1)

In [7]:
df['Change Mod'] = df['Difference'].diff(-1).abs() / df['Open']

In [8]:
df.head()

Unnamed: 0,Date,Open,Close,Difference,Change Mod
0,2010-06-01,124.970001,123.239998,-3.07,0.004961
1,2010-06-02,124.019997,126.309998,-2.449997,0.068054
2,2010-06-03,126.25,128.759995,5.989998,0.041426
3,2010-06-04,126.330002,122.769997,0.759995,0.019077
4,2010-06-07,125.839996,122.010002,3.170006,0.0178


In [9]:
df.describe()

Unnamed: 0,Open,Close,Difference,Change Mod
count,2517.0,2517.0,2516.0,2515.0
mean,755.984549,755.999992,-0.921753,0.01975634
std,633.737848,633.711304,19.249531,0.02011001
min,105.93,108.610001,-138.039917,7.060282e-08
25%,253.899994,253.369995,-5.349975,0.006755031
50%,430.070007,429.920013,-0.5,0.01416592
75%,1141.0,1152.349976,3.575003,0.02626529
max,2500.0,2497.939941,187.959961,0.1666971


### Setting up the range  

In [10]:
criteria = [df['Difference'].between(-139, -df['Change Mod']*df['Open']/2), df['Difference'].between( -df['Change Mod']*df['Open']/2, df['Change Mod']*df['Open']/2), df['Difference'].between(df['Change Mod']*df['Open']/2,188)]
values = ['Down', 'Stagnant', 'Up']

df['Movement'] = np.select(criteria, values, 'Stagnant')

In [11]:
df.head()

Unnamed: 0,Date,Open,Close,Difference,Change Mod,Movement
0,2010-06-01,124.970001,123.239998,-3.07,0.004961,Down
1,2010-06-02,124.019997,126.309998,-2.449997,0.068054,Stagnant
2,2010-06-03,126.25,128.759995,5.989998,0.041426,Up
3,2010-06-04,126.330002,122.769997,0.759995,0.019077,Stagnant
4,2010-06-07,125.839996,122.010002,3.170006,0.0178,Up


### Showing number of States  

In [12]:
df['Movement'].value_counts()

Stagnant    913
Down        893
Up          711
Name: Movement, dtype: int64

In [13]:
z = list(df['Movement'])

### Transition Probability Matrix 

In [14]:
pd.crosstab(pd.Series(z[1:],name='Tomorrow'),
            pd.Series(z[:-1],name='Today'),normalize=0)

Today,Down,Stagnant,Up
Tomorrow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Down,0.380045,0.455157,0.164798
Stagnant,0.43483,0.193866,0.371303
Up,0.220816,0.462729,0.316456


In [15]:
from itertools import islice

def window(seq, n=2):
    "Sliding window width n from seq.  From old itertools recipes."""
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

In [16]:
import pandas as pd

pairs = pd.DataFrame(window(z), columns=['Today', 'Tomorrow'])
counts = pairs.groupby('Today')['Tomorrow'].value_counts()
probs = (counts).unstack()

### Transition Count Matrix

In [17]:
probs

Tomorrow,Down,Stagnant,Up
Today,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Down,339,397,157
Stagnant,406,177,329
Up,147,339,225


# End