# Import Necessary Libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.utils import shuffle
import matplotlib.pyplot as pyplot
import pickle
from matplotlib import style
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [2]:
# Source of the NSE-TATAGLOBAL price history (public CSV hosted on GitHub).
DATA_URL = 'https://raw.githubusercontent.com/mwitiderrick/stockprice/master/NSE-TATAGLOBAL.csv'

# Read the CSV over HTTP into the working frame used by every later cell.
df = pd.read_csv(DATA_URL)

# Bare last expression: shows the (truncated) rich preview of the frame.
df

Unnamed: 0,Date,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
0,2018-09-28,234.05,235.95,230.20,233.50,233.75,3069914,7162.35
1,2018-09-27,234.55,236.80,231.10,233.80,233.25,5082859,11859.95
2,2018-09-26,240.00,240.00,232.50,235.00,234.25,2240909,5248.60
3,2018-09-25,233.30,236.75,232.00,236.25,236.10,2349368,5503.90
4,2018-09-24,233.55,239.20,230.75,234.00,233.30,3423509,7999.55
...,...,...,...,...,...,...,...,...
2030,2010-07-27,117.60,119.50,112.00,118.80,118.65,586100,694.98
2031,2010-07-26,120.10,121.00,117.10,117.10,117.60,658440,780.01
2032,2010-07-23,121.80,121.95,120.25,120.35,120.65,281312,340.31
2033,2010-07-22,120.30,122.00,120.25,120.75,120.90,293312,355.17


# Exploratory Data Analysis & Preprocessing

In [3]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2035 entries, 0 to 2034
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Date                  2035 non-null   object 
 1   Open                  2035 non-null   float64
 2   High                  2035 non-null   float64
 3   Low                   2035 non-null   float64
 4   Last                  2035 non-null   float64
 5   Close                 2035 non-null   float64
 6   Total Trade Quantity  2035 non-null   int64  
 7   Turnover (Lacs)       2035 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 127.3+ KB


In [4]:
# Count missing values per column.
df.isnull().sum()

Date                    0
Open                    0
High                    0
Low                     0
Last                    0
Close                   0
Total Trade Quantity    0
Turnover (Lacs)         0
dtype: int64

In [5]:
# Count missing values per column.
# NOTE(review): pandas documents isnull() as an alias of isna(), so this repeats
# the isnull() check in the previous cell and produces the same table.
df.isna().sum()

Date                    0
Open                    0
High                    0
Low                     0
Last                    0
Close                   0
Total Trade Quantity    0
Turnover (Lacs)         0
dtype: int64

In [6]:
# Number of rows that exactly duplicate an earlier row.
df.duplicated().sum()

0

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
count,2035.0,2035.0,2035.0,2035.0,2035.0,2035.0,2035.0
mean,149.713735,151.992826,147.293931,149.474251,149.45027,2335681.0,3899.980565
std,48.664509,49.413109,47.931958,48.73257,48.71204,2091778.0,4570.767877
min,81.1,82.8,80.0,81.0,80.95,39610.0,37.04
25%,120.025,122.1,118.3,120.075,120.05,1146444.0,1427.46
50%,141.5,143.4,139.6,141.1,141.25,1783456.0,2512.03
75%,157.175,159.4,155.15,156.925,156.9,2813594.0,4539.015
max,327.7,328.75,321.65,325.95,325.75,29191020.0,55755.08


In [8]:
# (rows, columns) of the loaded frame.
df.shape

(2035, 8)

In [9]:
# Replace the string 'Date' column with numeric year / month / day columns.
#
# The work is guarded and done without in-place mutation so the cell can be
# re-run safely: previously a second execution raised KeyError: 'Date' because
# the column had already been dropped by the first run.
if 'Date' in df.columns:
    parsed_dates = pd.to_datetime(df['Date'])
    df = df.assign(
        year=parsed_dates.dt.year,
        month=parsed_dates.dt.month,
        day=parsed_dates.dt.day,
    ).drop(columns='Date')

In [10]:
# Preview the frame after 'Date' was replaced by the year / month / day columns.
df

Unnamed: 0,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs),year,month,day
0,234.05,235.95,230.20,233.50,233.75,3069914,7162.35,2018,9,28
1,234.55,236.80,231.10,233.80,233.25,5082859,11859.95,2018,9,27
2,240.00,240.00,232.50,235.00,234.25,2240909,5248.60,2018,9,26
3,233.30,236.75,232.00,236.25,236.10,2349368,5503.90,2018,9,25
4,233.55,239.20,230.75,234.00,233.30,3423509,7999.55,2018,9,24
...,...,...,...,...,...,...,...,...,...,...
2030,117.60,119.50,112.00,118.80,118.65,586100,694.98,2010,7,27
2031,120.10,121.00,117.10,117.10,117.60,658440,780.01,2010,7,26
2032,121.80,121.95,120.25,120.35,120.65,281312,340.31,2010,7,23
2033,120.30,122.00,120.25,120.75,120.90,293312,355.17,2010,7,22


# Data Visualization

In [11]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# The 'Date' column was dropped during preprocessing and the rows are stored
# newest-first, so plotting against df.index drew the series backwards against a
# bare row number. Rebuild real dates from year / month / day and sort them so
# the x-axis is chronological, as the title promises.
close_over_time = (
    df.assign(Date=pd.to_datetime(df[['year', 'month', 'day']]))
      .sort_values('Date')
)
fig = px.line(close_over_time, x='Date', y='Close', title='Stock Closing Price Over Time')
fig.show()

In [13]:
import plotly.graph_objects as go

# The x-axis is labelled 'Date', but 'Date' was dropped during preprocessing and
# df.index is only a row number. Rebuild the trading dates from the
# year / month / day columns so each candle sits at its real date.
trade_dates = pd.to_datetime(df[['year', 'month', 'day']])

fig = go.Figure(data=[go.Candlestick(x=trade_dates,
                                     open=df['Open'],
                                     high=df['High'],
                                     low=df['Low'],
                                     close=df['Close'])])
fig.update_layout(title='Candlestick Chart', xaxis_title='Date', yaxis_title='Price')
fig.show()

In [14]:
# Total turnover for every (year, month) pair, kept as ordinary columns.
df_grouped = df.groupby(['year', 'month'], as_index=False)['Turnover (Lacs)'].sum()

# One bar segment per year within each calendar month.
fig = px.bar(
    df_grouped,
    x='month',
    y='Turnover (Lacs)',
    color='year',
    title='Monthly Turnover',
)
fig.show()

In [15]:
# Traded quantity against closing price, one point per row, coloured by calendar month.
fig = px.scatter(
    df,
    x='Total Trade Quantity',
    y='Close',
    color='month',
    title='Trade Quantity vs. Close Price',
)
fig.show()

In [16]:
# Share of total turnover contributed by each year.
# The slices were previously keyed on 'month' (names='month') while the title and
# the colour argument both referred to the year; slicing by 'year' makes the
# chart match its title. px.pie sums 'values' over rows sharing the same name.
fig = px.pie(df, values='Turnover (Lacs)', names='year', title='Cumulative Turnover by Year')
fig.show()

# Select Features

In [17]:
# Column the models will learn to predict.
target_variable = 'Close'

# Every other column is used as an input feature.
# NOTE(review): this includes same-day 'Open', 'High', 'Low' and 'Last'; in the
# rows previewed above 'Last' is almost equal to 'Close', so the very high scores
# below may mostly reflect that. Confirm this is the intended prediction task.
features = df.columns.drop(target_variable).tolist()

# Feature matrix and target vector.
X = df.loc[:, features]
y = df[target_variable]

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
# NOTE(review): train_test_split shuffles rows by default, so the test rows are
# not a later time period than the training rows - verify that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [18]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Linear Regression model

In [19]:
# Create and train the Linear Regression model.
# The estimator is bound to `linreg_model` instead of `linear_model` so it no
# longer shadows the `sklearn.linear_model` module imported in the first cell.
linreg_model = LinearRegression()
linreg_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_linear = linreg_model.predict(X_test)

# Evaluate the model: MSE, its square root (RMSE) and the R-squared score
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# Print Linear Regression results.
# Note: the "Accuracy" line is simply R-squared expressed as a percentage.
print('Linear Regression:')
print(f'Mean Squared Error: {mse_linear}')
print(f'Root Mean Squared Error: {rmse_linear}')
print(f'R-squared: {r2_linear}')
print(f"Linear Regression Accuracy: {r2_linear * 100:.2f}%")


Linear Regression:
Mean Squared Error: 0.15835670661571058
Root Mean Squared Error: 0.3979405817653065
R-squared: 0.9999336270386097
Linear Regression Accuracy: 99.99%


# Decision Tree model

In [20]:
# Create and train the Decision Tree model.
# random_state is fixed (same seed as the train/test split and the random
# forest) so the fitted tree, and therefore the printed metrics, are the same
# on every run; without it the results can change between executions.
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_tree = tree_model.predict(X_test)

# Evaluate the model: MSE, its square root (RMSE) and the R-squared score
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)
r2_tree = r2_score(y_test, y_pred_tree)

# Print Decision Tree results.
# Note: the "Accuracy" line is simply R-squared expressed as a percentage.
print('\nDecision Tree:')
print(f'Mean Squared Error: {mse_tree}')
print(f'Root Mean Squared Error: {rmse_tree}')
print(f'R-squared: {r2_tree}')
print(f"Decision Tree Accuracy: {r2_tree * 100:.2f}%")



Decision Tree:
Mean Squared Error: 0.7963452088452095
Root Mean Squared Error: 0.8923817618291006
R-squared: 0.9996662232315279
Decision Tree Accuracy: 99.97%


# Random Forest model

In [21]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split.
forest_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred_forest = forest_model.predict(X_test)

# Error metrics: MSE, its square root (RMSE) and the R-squared score.
mse_forest = mean_squared_error(y_test, y_pred_forest)
rmse_forest = np.sqrt(mse_forest)
r2_forest = r2_score(y_test, y_pred_forest)

# Report the Random Forest results, one metric per line.
# The "Accuracy" line is R-squared expressed as a percentage.
print(
    '\nRandom Forest:',
    f'Mean Squared Error: {mse_forest}',
    f'Root Mean Squared Error: {rmse_forest}',
    f'R-squared: {r2_forest}',
    f"Random Forest Accuracy: {r2_forest * 100:.2f}%",
    sep='\n',
)



Random Forest:
Mean Squared Error: 0.421169012285023
Root Mean Squared Error: 0.6489753556838834
R-squared: 0.999823472998469
Random Forest Accuracy: 99.98%


# Thank you 