<a href="https://colab.research.google.com/github/Anish-S-tech/my-ml-journey/blob/main/Time_Series_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Time series analysis - the study of data collected in sequence over time. It shows how values change at different points in time, such as daily stock prices or hourly temperatures.

In [None]:
# Step 1 - Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import adfuller

In [None]:
# Step 2: Loading the data

# Read the stock CSV, using the "Date" column as the index and asking pandas
# to parse that index as dates.
df = pd.read_csv(
    '/content/stock_data.csv',
    index_col="Date",
    parse_dates=True,
)
df.head()

In [None]:
# Step 3: Cleaning the data

# Drop the "Unnamed: 0" column (a column, not a row) - presumably a leftover
# positional index that was saved into the CSV; TODO confirm against the file.
# errors="ignore" keeps this cell re-runnable: running it a second time no
# longer raises KeyError once the column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df.head()

In [None]:
# Step 4: Plotting high stock prices

sns.set(style="whitegrid")

# Draw the daily 'High' column against the 'Date' index on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=df,
    x='Date',
    y='High',
    label='High Price',
    color='blue',
    ax=ax,
)

ax.set_xlabel('Date')
ax.set_ylabel('High')
ax.set_title('Share price over time')

plt.show()

In [None]:
# Step 5: Resampling the data

# 'ME' is the month-end frequency: rows are bucketed by calendar month and the
# mean of every numeric column is reported against the month's last day.
monthly_groups = df.resample('ME')
df_resampled = monthly_groups.mean(numeric_only=True)

sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=df_resampled,
    x="Date",
    y="High",
    label="Month wise Average high price",
    color='blue',
    ax=ax,
)

ax.set_xlabel('Date (Monthly)')
ax.set_ylabel('High')
ax.set_title('Monthly resampling highest price over time')

plt.show()


In [None]:
# Step 6: Detecting seasonality with autocorrelation

# Step 2 loads the CSV with "Date" as the index, so this guard only acts when
# the frame still carries "Date" as a regular column.
if 'Date' not in df.columns:
  print("Date is already the index or not present in the dataframe")
else:
  df.set_index("Date",inplace=True)

# plot_acf draws on a brand-new figure unless an Axes is passed in. Create the
# Axes here and hand it over; otherwise the 12x6 figure stays empty and the ACF
# is rendered on a separate default-sized figure.
fig, ax = plt.subplots(figsize=(12, 6))
plot_acf(df['High'], lags=40, ax=ax)

ax.set_xlabel('Lag')
ax.set_ylabel('Autocorrelation')
ax.set_title('Autocorrelation function (ACF) plot')
plt.show()

In [None]:
# Step 7: Detecting stationarity with ADF test (ADF = Augmented Dickey Fuller test)

# adfuller is already imported in Step 1, so it is not re-imported here.
# Null hypothesis of the test: the series has a unit root (is non-stationary).
result = adfuller(df['High'])

# Name the tuple positions that are reported below.
adf_statistic, p_value, critical_values = result[0], result[1], result[4]

print("ADF statistic:", adf_statistic)
print('p-value:', p_value)
print('Critical values:', critical_values)


In [None]:
# Step 8: Differencing to achieve stationarity

# First-order difference: each 'High' value minus the value in the previous row.
# The first row has no predecessor, so its difference is NaN.
high_series = df['High']
df['high_diff'] = high_series.diff()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(high_series, label='Original high', color='blue')
ax.plot(df['high_diff'], label='Differenced high', linestyle='--', color='green')
ax.legend()
ax.set_title("Original vs Differenced high")
plt.show()

In [None]:
# Step 9: Smoothing data with moving average

window_size = 120  # number of consecutive rows averaged for each point

# Store the rolling mean under its own name. It was previously written to
# 'High_diff', which is not a difference and differs from the real
# 'high_diff' column (Step 8) only by letter case.
# The first window_size - 1 rows are NaN because the window is not yet full.
df['high_ma'] = df['High'].rolling(window=window_size).mean()

plt.figure(figsize=(12,6))

plt.plot(df['High'],label="Original high",color='blue')
plt.plot(df['high_ma'],label=f"Moving average (Window size:{window_size})",linestyle='--',color='green')

plt.legend()
plt.title('Original vs Moving average of high')
plt.show()

In [None]:
# Step 10: Original vs Differenced data (combining)

# Side-by-side view of the raw and differenced series. The first 'high_diff'
# entry is NaN because there is no earlier value to subtract from it.
df_combined = df[['High', 'high_diff']].copy()

display(df_combined)

In [None]:
# Getting the differenced value alone

# Remove the rows whose 'high_diff' is NaN (the first row after differencing).
# Rebinding df instead of using inplace=True avoids mutating the frame object
# that earlier cells displayed; later cells still see the cleaned frame as df.
df = df.dropna(subset=['high_diff'])
df['high_diff'].head()

In [None]:
# Perform ADF test on differenced data i.e, high_diff

# adfuller is already imported in Step 1, so it is not re-imported here.
# Drop NaNs locally (differencing leaves the first value undefined) so this
# cell does not rely on an earlier cell having already removed that row.
high_diff_clean = df['high_diff'].dropna()

result = adfuller(high_diff_clean)
adf_statistic, p_value, critical_values = result[0], result[1], result[4]

print('ADF statistic:', adf_statistic)
print('p-value:', p_value)
print('Critical values:', critical_values)

In [None]:
# Since the p-value is below the chosen significance level (commonly 0.05), we reject the ADF null hypothesis
# that the series has a unit root, i.e. the differenced series can be treated as stationary.