In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load Dataset
# Reads the stock-price CSV (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv('data_YesBank_StockPrices.csv')

In [None]:
# Preview the first rows to sanity-check the load
df.head()

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Column dtypes and non-null counts
df.info()

### Data Cleaning

In [None]:
# Dataset Duplicate Value Count
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

In [None]:
# Missing Values/Null Values Count
# Per-column count of missing entries
df.isna().sum()

In [None]:
# Visualizing the missing values
# The boolean null-mask is drawn as a heatmap, so any missing entry shows up as a
# contrasting tile; a single uniform colour means nothing is missing.
fig, ax = plt.subplots(figsize=(6, 3))
sns.heatmap(df.isna(), cbar=False, cmap='viridis', yticklabels=False, ax=ax)
ax.set_title('Visualizing Missing Values Using Heatmap')
plt.show()

In [None]:
# Dataset Columns
# Column labels present in the frame
df.columns

In [None]:
# Dataset Describe
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Check Unique Values for each variable.
# Distinct-value counts are computed once for the whole frame, then reported in column order.
unique_counts = df.nunique()
for col, n_unique in unique_counts.items():
    print(f"{col} have {n_unique} unique values")

### Data Wrangling

In [None]:
# converting the type of col 'Date' to datetime64[ns] format
# The format '%b-%y' expects an abbreviated month name and a two-digit year (e.g. 'Jul-05');
# no day is given, so each parsed value carries only month-level information.
df['Date'] = pd.to_datetime(df['Date'], format='%b-%y')

In [None]:
# checking the updated datatype of col 'Date'
# (confirms the conversion done in the previous cell)
df['Date'].dtypes

In [None]:
# sorting the df chronologically (if not sorted earlier)
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the previous index
df = df.sort_values('Date').reset_index(drop=True)

In [None]:
# Report the earliest and latest dates covered by the data
first_date = df['Date'].min()
last_date = df['Date'].max()
print(f"Range of Date is : {first_date} , {last_date}")

In [None]:
# adding new col 'Year' 'Month' for better understanding
# Both come from the datetime accessor: the calendar year and the month number (1-12)
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month

In [None]:
# checking whether new columns are added successfully or not
# (shows the first two rows, which should now include 'Year' and 'Month')
df.head(2)

In [None]:
# 'Year' and 'Month' were derived from 'Date' above, so the original column is removed.
# The drop is done in place on the existing frame.
df.drop(columns='Date', inplace=True)

In [None]:
# checking that the 'Date' column is gone (first two rows)
df.head(2)

### Data Visualization

Scatter Plot

In [None]:
# Chart - 1 visualization code
# One scatter panel per predictor, each plotted against 'Close', on a 2x3 grid
# (five panels are drawn, so the sixth slot stays empty).
plt.figure(figsize=(15, 5))

predictors = ['Open', 'Low', 'High', 'Month', 'Year']
for position, feature in enumerate(predictors, start=1):
    plt.subplot(2, 3, position)
    sns.scatterplot(x=feature, y='Close', data=df)
    plt.title(f'{feature} vs Close')

plt.tight_layout()
plt.show()


Line Plot

In [None]:
# Calculating the yearly average Close and Open price
yearly_avg = df.groupby('Year')[['Close','Open']].mean().reset_index()

# Both yearly averages share one axis so the two series can be compared directly
plt.figure(figsize=(10, 5))
sns.lineplot(data=yearly_avg, x='Year', y='Close', marker='o', color='blue',label='closing price')
sns.lineplot(data=yearly_avg, x='Year', y='Open', marker='o', color='green',label='opening price')
plt.title('Yearly trend')
plt.xlabel('Year')
# The axis carries both the Open and the Close series, so the label must not name only one
plt.ylabel('Avg Price (Open / Close)')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


Box plot

In [None]:
# One box plot per column for the first four columns of df, on a 2x2 grid.
# NOTE(review): the columns are picked by position — presumably Open, High, Low, Close; confirm
# against the CSV column order.
for position, col in enumerate(df.columns[:4], start=1):
    ax = plt.subplot(2, 2, position)
    sns.boxplot(x=df[col], color='skyblue', ax=ax)
    ax.set_title(f'Box Plot of {col}')

plt.tight_layout()
plt.show()



Heatmap

In [None]:
# visualization code
# Pairwise correlations between the price columns and the derived date parts,
# annotated with the coefficient in every tile.
corr_columns = ['Open', 'High', 'Low', 'Month', 'Year', 'Close']
corr_matrix = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


Pair Plot

In [None]:
# visualization code
# sns.pairplot is a figure-level function: it builds its own figure and grid of axes.
# A plt.figure(...) call in front of it would not size the pair plot and would only
# leave an extra, empty figure in the cell output, so none is created here.
selected_features = ['Open','Low','High','Year','Month','Close']
sns.pairplot(df, vars=selected_features)
plt.show()

### Hypothesis Testing

Statement 1-

Null Hypothesis (H₀): The mean of Open prices = mean of Close prices

Alternative Hypothesis (H₁): The mean of Open prices ≠ mean of Close prices

In [None]:
# import libraries
from scipy.stats import ttest_rel

# Paired sample t-test: Open and Close are paired because both values come from the same row
t_stat, p_value = ttest_rel(df['Open'], df['Close'])

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

# Significance level
alpha = 0.05

# Conclusion
# A p-value below alpha is evidence AGAINST H0 (equal means), so rejecting H0 means the
# means differ; failing to reject means no significant difference was detected.
if p_value < alpha:
    print("Reject H0: There is a significant difference between Open and Close price means.")
else:
    print("Fail to reject H0: No significant difference between Open and Close price means.")


Statement 2-

Null Hypothesis (H0): There is no significant difference in the mean closing price before and after the 2018 fraud case.

Alternative Hypothesis (H1): There is a significant difference in the mean closing price before and after the 2018 fraud case.

In [None]:
from scipy.stats import ttest_ind

# Closing prices for the years before 2018 and for 2018 onward
before_2018 = df.loc[df['Year'] < 2018, 'Close']
after_2018 = df.loc[df['Year'] >= 2018, 'Close']

# Independent two-sample t-test; equal_var=False requests the unequal-variance (Welch) form
t_stat, p_value = ttest_ind(before_2018, after_2018, equal_var=False)

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

# significance level
alpha = 0.05

# Pick the conclusion text first, then print it once
verdict = (
    "Reject H0: Significant difference in mean closing prices before and after 2018."
    if p_value < alpha
    else "Fail to reject H0: No significant difference in mean closing prices before and after 2018."
)
print(verdict)

Statement 3 -

Null Hypothesis (H0): The average price range (High - Low) of a record is less than or equal to the threshold of 5.

Alternative Hypothesis (H1): The average price range (High - Low) of a record is greater than the threshold of 5.

In [None]:
from scipy.stats import ttest_1samp

# Per-row price range; stored on df because later cells use 'Price_Range' as a feature
df['Price_Range'] = df['High'] - df['Low']

# One-sample t-test of the mean range against the hypothesised value
hypothesized_mean = 5
t_stat, p_value = ttest_1samp(df['Price_Range'], hypothesized_mean)

print(f"T-statistic: {t_stat}")
print(f"P-value (two-tailed): {p_value}")

# ttest_1samp reports a two-tailed p-value; halving it gives the one-tailed value for
# H1: mean > 5, which is only meaningful when the t-statistic is positive.
p_value_one_tailed = p_value / 2

# Significance level
alpha = 0.05

# Conclusion: both the direction (t > 0) and the one-tailed significance must hold
range_exceeds_threshold = (t_stat > 0) and (p_value_one_tailed < alpha)
if range_exceeds_threshold:
    print("Reject H0: The average daily price range is significantly greater than 5.")
else:
    print("Fail to reject H0: No significant evidence that average range exceeds 5.")

### Feature Engineering & Data Pre-processing

Handling Outliers

In [None]:
# Making copy of dataset
# Snapshot of the frame as it is before outlier removal.
# NOTE(review): df1 is not referenced again in the visible notebook — confirm it is still needed.
df1 = df.copy()

# Define a function to remove outliers using IQR
def remove_outliers_iqr(df, columns):
    """Drop rows whose value in any listed column lies outside the 1.5*IQR fences.

    Columns are processed one after another, so the quartiles of a later column are
    computed on the rows that survived the filters of the earlier columns.

    Parameters
    ----------
    df : DataFrame to filter. The caller's object is not modified; only the local
        name is rebound to the filtered result.
    columns : iterable of column labels to check.

    Returns
    -------
    The filtered DataFrame, keeping the original index labels. When `columns` is
    empty the input object itself is returned.
    """
    for col in columns:
        q_low = df[col].quantile(0.25)
        q_high = df[col].quantile(0.75)
        spread = q_high - q_low

        # Keep values within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], both ends inclusive
        keep = df[col].between(q_low - 1.5 * spread, q_high + 1.5 * spread)
        df = df[keep]

    return df

# List of numeric columns to check for outliers
cols_to_check = ['Open', 'High', 'Low', 'Close']

# Apply the function
# NOTE(review): this rebinds `df` to the filtered rows, so the removed rows are unavailable to
# every later cell. The index is not reset, and any row-position based step that follows
# (e.g. shift) will treat the remaining rows as consecutive — confirm that is intended.
df = remove_outliers_iqr(df, cols_to_check)

# Rows/columns remaining after outlier removal
df.shape

Feature Manipulation

In [None]:
# Closing price from the previous row.
# NOTE(review): shift(1) moves values by row position. 'Date' was parsed with a month-year
# format and outlier rows were removed earlier, so "previous" means the preceding retained
# row — not necessarily the previous trading day or calendar month. Confirm this is intended.
df['Close_Lag_1'] = df['Close'].shift(1)

# Open-to-Close change within the same row (Close minus Open)
df['Daily_Open_Close_Diff'] = df['Close'] - df['Open']

df.head()

In [None]:
# Close_Lag_1 will have NaN in the first row since there's no previous row.
# dropna removes, in place, every row that contains a NaN in any column.
df.dropna(inplace=True)

Feature Selection

In [None]:
# Candidate predictors whose linear association with the target is inspected
features = ['Open', 'High', 'Low', 'Close_Lag_1', 'Price_Range', 'Daily_Open_Close_Diff']

# Correlation of every candidate with 'Close'; the target's correlation with itself is dropped
corr_with_target = df[features + ['Close']].corr()['Close']
correlations = corr_with_target.drop('Close')

print(correlations)

Data Transformation

In [None]:
# Check the skewness of the newly added features:
# one histogram with a KDE overlay for each column from position 6 onward, on a 2x3 grid
for position, col in enumerate(df.columns[6:], start=1):
    ax = plt.subplot(2, 3, position)
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'{col}')

plt.tight_layout()
plt.show()

In [None]:
# Log Transform - needed when data is skewed.
# log1p(x) = log(1 + x), so a value of 0 maps to 0 instead of -inf.
# Each source column gets a companion '<name>_log' column, added in the order listed.
log_source_columns = ['Open', 'High', 'Low', 'Close', 'Price_Range', 'Close_Lag_1']
for source_col in log_source_columns:
    df[f'{source_col}_log'] = np.log1p(df[source_col])

df.head()

Data Scaling

In [None]:
# Data Scaling
from sklearn.preprocessing import StandardScaler

# Use log-transformed features
features_to_scale = ['Open_log','High_log','Low_log','Close_log','Price_Range_log','Close_Lag_1_log']

# standardization of log-transformed features
# NOTE(review): the scaler is fitted on every row, before the train/test split performed
# later, and the list includes the target 'Close_log'. Test-set statistics therefore influence
# the training inputs, and anything predicted for 'Close_log' is on the standardized scale.
scaler = StandardScaler()
# Work on an explicit copy so the column assignment below targets an independent frame
df = df.copy()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# Check the result
df.head()

Data Splitting

In [None]:
# splitting data into 80:20 proportion

# Importing Library
from sklearn.model_selection import train_test_split

# Features for Linear regression ML Model (the standardized, log-transformed columns)
X = df[['Open_log', 'High_log', 'Low_log', 'Close_Lag_1_log']]
y = df['Close_log']

# split the dataset - Linear regression
# NOTE(review): train_test_split shuffles rows by default. With chronologically ordered data
# and a lagged-price feature, this mixes later periods into the training set — consider
# shuffle=False or a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42  )

# Features for Random Forest Regressor ML Model (raw, untransformed columns)
X_rf = df[['Open', 'High', 'Low', 'Close_Lag_1']]
y_rf = df['Close']

# split the dataset - Random forest
# Same test_size and random_state as above, applied to the same rows of df
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42  )


### ML Model

#### **Linear Regression**

*Input features:* Open Price, High Price, Low Price and the previous row's Close Price (Close_Lag_1), all after log transformation and scaling

*Target feature:* Close Price

In this model we predict Close_log (the log-transformed value of the Close price) and then apply the inverse transformation to recover the actual Close price.



In [None]:
# Required Libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the Linear Regression model
model_reg = LinearRegression()

# Fit the Algorithm
model_reg.fit(X_train, y_train)

# Predict on the model (output is on the standardized log1p scale of 'Close_log')
y_pred_log = model_reg.predict(X_test)

# Recover the Close price.
# The target was built as StandardScaler(log1p(Close)), so both steps must be undone in
# reverse order: first un-standardize using the fitted scaler's statistics for the
# 'Close_log' column, then invert log1p with expm1. np.exp on the raw prediction would
# skip the un-scaling and is not the inverse of log1p.
close_log_pos = features_to_scale.index('Close_log')
y_pred_log_unscaled = y_pred_log * scaler.scale_[close_log_pos] + scaler.mean_[close_log_pos]
predicted_close_price = np.expm1(y_pred_log_unscaled)

In [None]:
# Evaluation metrics for the Linear Regression model.
# y_test and y_pred_log are both on the standardized log scale of 'Close_log', so MSE and
# RMSE here are in those units, not in price units.
mse = mean_squared_error(y_test, y_pred_log)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_log)

print("R² Score : ",r2)
print("MSE : ", mse)
print("RMSE : ", rmse)

#### **Random Forest**

*Input features:* Open Price, High Price, Low Price and the previous row's Close Price (Close_Lag_1), without transformation

*Target feature:* Close Price

In this model we predict Close Price directly



In [None]:
# import libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the model: 100 trees, with a fixed seed so results are repeatable
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_rf_train, y_rf_train)

# Predict on test set (predictions are in the raw 'Close' units of y_rf)
y_pred_rf = rf_model.predict(X_rf_test)

In [None]:
# Evaluate performance
# y_rf_test is the untransformed 'Close' column, so MSE and RMSE are in raw price units
mse_rf = mean_squared_error(y_rf_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_rf_test, y_pred_rf)

# Values are printed rounded to two decimals
print(f"Random Forest - MSE: {mse_rf:.2f}")
print(f"Random Forest - RMSE: {rmse_rf:.2f}")
print(f"Random Forest - R² Score: {r2_rf:.2f}")

### Cross Validation

In [None]:
# import libraries
from sklearn.model_selection import cross_val_score

Linear regression

In [None]:
# 5-fold cross-validation of the linear model on the full X / y.
# The scores get their own name: assigning them to `r2_score` would shadow the
# sklearn.metrics.r2_score function imported earlier and make the metric cells fail
# ("'numpy.ndarray' object is not callable") when they are re-run.
cv_r2_scores = cross_val_score(model_reg, X, y, cv=5, scoring='r2')

# display
print("Cross-Validation R2 Scores:", cv_r2_scores)
print("Average R2 Score:", cv_r2_scores.mean())

Random Forest

In [None]:
# 5-fold cross-validation of the random forest on the untransformed X_rf / y_rf.
# Each fold refits a clone of rf_model; the returned array holds one R² value per fold.
r2_score_rf = cross_val_score(rf_model, X_rf, y_rf, cv=5, scoring='r2')

# displaying scores
print("Cross-Validation R2 Scores:", r2_score_rf)
print("Average R2 Score:", r2_score_rf.mean())

### **Chosen Model**





The Linear Regression model with log transformation is chosen as the final prediction model, because:

* It achieved a higher average R² Score (0.9744) during cross-validation than Random Forest (0.9350).
* The data showed a strong linear relationship between input features (Open, High, Low, etc.) and Close, which suits linear models well.
* Model interpretability is crucial for business reporting, and Linear Regression provides clear coefficient explanations, which Random Forest lacks.