In [1]:
import yfinance as yf
import pandas as pd
import seaborn as sns
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

# Preparing the Data 

In [2]:
# Load the market-cap workbook and keep the Symbol entries of the top 100 companies
df_1 = pd.read_excel('MCAP31122022_3.xlsx')
symbols = df_1.Symbol[:100]
# Build the ticker list by appending the ".NS" suffix to every symbol
symbol_list = [f"{ticker}.NS" for ticker in symbols]

FileNotFoundError: [Errno 2] No such file or directory: 'MCAP31122022_3.xlsx'

In [None]:
# Historical data of stock prices: auto-adjusted 'Close' values for every ticker
# in symbol_list between the two dates below
price_df = yf.download(
    symbol_list,
    start='2016-4-1',
    end='2023-3-1',
    auto_adjust=True,
)['Close']

In [None]:
# Sensex data: download the '^BSESN' ticker over the same date window as the stocks
sensex = yf.download(
    '^BSESN',
    start='2016-4-1',
    end='2023-3-1',
    auto_adjust=True,
)

In [None]:
# Merged Dataframe: left-join the rounded Sensex close onto the stock prices by 'Date'
sensex_close = sensex.Close.round(0)
final_df = pd.merge(price_df, sensex_close, on='Date', how='left')
# Keep only the rows that have a Sensex value, then label that column 'Sensex'
has_sensex = final_df['Close'].notna()
final_df = final_df[has_sensex].rename(columns={'Close': 'Sensex'})

In [None]:
# Checking null values: report every column that has gaps and remember the
# positions of the columns with more than 200 missing entries
column_list = []
for position, null_count in enumerate(final_df.isnull().sum()):
    if null_count == 0:
        continue
    print('column index', position, 'number of null values', null_count)
    if null_count > 200:
        column_list.append(position)

In [None]:
# Drop the columns flagged in column_list (too many null values), fill the
# remaining gaps with each column's mean, and round everything to whole numbers
sparse_columns = final_df.columns[column_list]
final_df = final_df.drop(columns=sparse_columns)
column_means = final_df.mean()
final_df = final_df.fillna(column_means).round(0)
final_df

In [None]:
# Display the summary statistics that DataFrame.describe() produces for final_df
final_df.describe()      

In [None]:
# Checking companies' growth fluctuation: for each column of final_df, the
# max-to-min range expressed as a percentage of the column maximum
companies = final_df.columns
growth = (final_df.max() - final_df.min()) / final_df.max() * 100
growth_list = growth.tolist()

# One row per column of final_df, pairing its name with its fluctuation value
df = pd.DataFrame({'Companies': companies, 'growth': growth_list})
df

In [None]:
# Take every second row of df (labels 0 through 100) and order the selection by
# its 50 largest 'growth' values before plotting
every_other_row = df.loc[0:100:2]
top_growth = every_other_row.nlargest(50, 'growth')

plt.figure(figsize=(12, 4))
sns.lineplot(x='Companies', y='growth', data=top_growth)
plt.xticks(rotation=85)
plt.show()
# Observation from the plotted rows: adanitrans has the highest fluctuation while bpcl has the lowest

# Assumptions of Linear Regression

Linearity of Data

In [None]:
# Scatter every feature column against 'Sensex' to eyeball linearity.
# The grid is sized from the number of feature columns (7 plots per row) rather
# than a fixed 14x7, so plt.subplot never receives a position beyond the grid.
feature_columns = final_df.columns[0:-1]
plots_per_row = 7
grid_rows = max(1, -(-len(feature_columns) // plots_per_row))  # ceiling division
# Keep the original height per row (52 inches for 14 rows) whatever the row count
plt.figure(figsize=(35, 52 * grid_rows / 14))
for position, column in enumerate(feature_columns, start=1):
    plt.subplot(grid_rows, plots_per_row, position)
    sns.scatterplot(x=column, y='Sensex', data=final_df)

Scaling our data

In [None]:
# Features are every column except the last; the target is the last column
X = final_df.iloc[:, 0:-1]
y = final_df.iloc[:, -1]
# Hold out 30% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)
# Fit the scaler on the training rows only, then apply the same transform to the test rows
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

Checking for multicollinearity

In [None]:
# Variance inflation factor for each column of the scaled training matrix,
# shown as a single row labelled with the feature names
feature_names = final_df.iloc[:, 0:-1].columns
VIF = [vif(X_train, col_idx) for col_idx in range(X_train.shape[1])]
pd.DataFrame({'VIF': VIF}, index=feature_names).T

The VIF values show that the features are highly collinear

To diminish the effect of multicollinearity, we use Lasso regression

In [None]:
# Grid-search the Lasso penalty strength with 10-fold cross-validation,
# scoring each candidate by negative mean squared error
parameters = {
    'alpha': [1, 2, 5, 10, 100, 150, 500, 1000],
    'random_state': [23],
}
reg_model = Lasso()
lasso_regressor = GridSearchCV(
    estimator=reg_model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=10,
)
lasso_regressor.fit(X_train, y_train)

In [None]:
# Obtaining the best parameter combination (including alpha) found by the grid search
print(lasso_regressor.best_params_)
# Predict the target for the scaled test features with the fitted search object
y_pred = lasso_regressor.predict(X_test)

In [None]:
# Checking normality of residuals.
# Residuals use the usual convention observed - predicted, so a positive value
# means the model under-predicted that row.
residual = y_test - y_pred
sns.displot(residual, kind='kde')
# Most of residuals are near zero values which is good

In [None]:
# Uniformity of residual points: scatter the residuals against the predicted values
sns.scatterplot(x=y_pred,y=residual)

In [None]:
# Relationship between residual points.
# residual keeps y_test's index, whose rows were shuffled by train_test_split.
# Sort by that index first so the line joins the points in index order and does
# not zig-zag back and forth across the x-axis.
plt.plot(residual.sort_index())
# Look for any pattern along the index; the original run noted no relation

In [None]:
# Checking score of our model.
# r2_score expects (y_true, y_pred). R^2 is not symmetric in its arguments, so
# the observed values must be passed first.
r2_score(y_test, y_pred)