# Assignment 10 Analysis and forecast of Stock exchange data

## Loading and Inspecting data

In [3]:
import pandas as pd
import numpy as np
import random
from scipy import stats

In [4]:
df = pd.read_csv("INDO.csv")
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-12-19,11.000,11.960,10.50,10.700,10.700,89400
1,2019-12-20,10.808,12.490,9.25,9.650,9.650,503000
2,2019-12-23,8.790,8.790,7.25,7.810,7.810,117400
3,2019-12-24,7.500,7.640,6.00,6.410,6.410,102800
4,2019-12-26,6.420,7.720,6.42,7.410,7.410,78400
...,...,...,...,...,...,...,...
66,2020-03-26,4.160,4.340,3.01,3.530,3.530,9700
67,2020-03-27,3.530,3.640,3.53,3.640,3.640,4200
68,2020-03-30,3.990,3.990,3.53,3.530,3.530,10300
69,2020-03-31,3.623,3.623,3.53,3.601,3.601,3100


#### Inspecting first few rows of dataset

In [6]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-12-19,11.0,11.96,10.5,10.7,10.7,89400
1,2019-12-20,10.808,12.49,9.25,9.65,9.65,503000
2,2019-12-23,8.79,8.79,7.25,7.81,7.81,117400
3,2019-12-24,7.5,7.64,6.0,6.41,6.41,102800
4,2019-12-26,6.42,7.72,6.42,7.41,7.41,78400


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       71 non-null     object 
 1   Open       71 non-null     float64
 2   High       71 non-null     float64
 3   Low        71 non-null     float64
 4   Close      71 non-null     float64
 5   Adj Close  71 non-null     float64
 6   Volume     71 non-null     int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 4.0+ KB


#### Setting Date column as index and formatting it appropriately

In [9]:
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-12-19,11.000,11.960,10.50,10.700,10.700,89400
2019-12-20,10.808,12.490,9.25,9.650,9.650,503000
2019-12-23,8.790,8.790,7.25,7.810,7.810,117400
2019-12-24,7.500,7.640,6.00,6.410,6.410,102800
2019-12-26,6.420,7.720,6.42,7.410,7.410,78400
...,...,...,...,...,...,...
2020-03-26,4.160,4.340,3.01,3.530,3.530,9700
2020-03-27,3.530,3.640,3.53,3.640,3.640,4200
2020-03-30,3.990,3.990,3.53,3.530,3.530,10300
2020-03-31,3.623,3.623,3.53,3.601,3.601,3100


## Data Clenaing

#### Checking missing value and manageing it appropriately


In [12]:
df.isna()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-12-19,False,False,False,False,False,False
2019-12-20,False,False,False,False,False,False
2019-12-23,False,False,False,False,False,False
2019-12-24,False,False,False,False,False,False
2019-12-26,False,False,False,False,False,False
...,...,...,...,...,...,...
2020-03-26,False,False,False,False,False,False
2020-03-27,False,False,False,False,False,False
2020-03-30,False,False,False,False,False,False
2020-03-31,False,False,False,False,False,False


In [13]:
df.isna().sum()

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

#### Manageing the anamolies and outliars

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 71 entries, 2019-12-19 to 2020-04-01
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       71 non-null     float64
 1   High       71 non-null     float64
 2   Low        71 non-null     float64
 3   Close      71 non-null     float64
 4   Adj Close  71 non-null     float64
 5   Volume     71 non-null     int64  
dtypes: float64(5), int64(1)
memory usage: 3.9 KB


In [16]:
def col_clean(data_frame, col):
    q1 = data_frame[col].quantile(0.25)
    q3 = data_frame[col].quantile(0.75)
    iqr = q3 - q1
    lb, ub = (q1 - 1.5 * iqr), (q3 + 1.5 * iqr)
    data_frame = data_frame[(df[col] < ub) & (df[col] > lb)]
    return data_frame

for col in ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']:
    df = col_clean(df, col)

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 59 entries, 2019-12-27 to 2020-04-01
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       59 non-null     float64
 1   High       59 non-null     float64
 2   Low        59 non-null     float64
 3   Close      59 non-null     float64
 4   Adj Close  59 non-null     float64
 5   Volume     59 non-null     int64  
dtypes: float64(5), int64(1)
memory usage: 3.2 KB


## Analyze the data

#### Resample the data to weekly and montly frequencies to analyze stock price patterns.

In [47]:
weekly_resampled_df = df.resample('W').agg({
    'Open':'first',
    'High':'max',
    'Low':'min',
    'Close':'last',
    'Adj Close': 'last',
    'Volume':'sum'
})

weekly_resampled_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-12-29,7.65,7.75,7.08,7.56,7.56,30500
2020-01-05,7.25,7.42,7.0,7.09,7.09,48800
2020-01-12,7.1,7.99,6.81,7.95,7.95,50200
2020-01-19,7.65,8.69,7.56,8.3,8.3,57200
2020-01-26,8.29,8.46,7.7,7.8,7.8,25500
2020-02-02,7.78,8.3,6.52,6.9,6.9,49900
2020-02-09,6.87,7.25,6.5,6.5,6.5,10200
2020-02-16,6.5,7.5,6.25,7.34,7.34,15200
2020-02-23,7.35,7.35,6.5,6.638,6.638,14800
2020-03-01,6.638,6.77,5.3,5.3,5.3,12300


In [51]:
monthly_resampled_df = df.resample(pd.offsets.MonthEnd()).agg({
    'Open':'first',
    'High':'max',
    'Low':'min',
    'Close':'last',
    'Adj Close': 'last',
    'Volume':'sum'
})
monthly_resampled_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-12-31,7.65,7.75,7.08,7.11,7.11,43800
2020-01-31,7.03,8.69,6.52,6.9,6.9,218300
2020-02-29,6.87,7.5,5.3,5.3,5.3,52500
2020-03-31,5.25,5.4,1.55,3.601,3.601,104600
2020-04-30,3.53,3.546,3.53,3.546,3.546,400


## Splitting Data into training and testing set

In [58]:
data_frame_len = len(df)
divider = (data_frame_len * 4) // 5
training_set = df.iloc[: divider, :]
test_set = df.iloc[divider + 1: , :]

In [66]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11 entries, 2020-03-16 to 2020-04-01
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       11 non-null     float64
 1   High       11 non-null     float64
 2   Low        11 non-null     float64
 3   Close      11 non-null     float64
 4   Adj Close  11 non-null     float64
 5   Volume     11 non-null     int64  
dtypes: float64(5), int64(1)
memory usage: 616.0 bytes


In [68]:
training_set.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 47 entries, 2019-12-27 to 2020-03-12
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       47 non-null     float64
 1   High       47 non-null     float64
 2   Low        47 non-null     float64
 3   Close      47 non-null     float64
 4   Adj Close  47 non-null     float64
 5   Volume     47 non-null     int64  
dtypes: float64(5), int64(1)
memory usage: 2.6 KB
