In [1]:
import pandas as pd
from statsmodels.formula.api import ols

In [2]:
df = pd.read_csv('iris.csv')

In [3]:
df.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
df.columns = ['SL', 'SW', 'PL', 'PW', 'species']
df.head()

Unnamed: 0,SL,SW,PL,PW,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### OSL

In [8]:
model = ols(formula='SL ~ SW', data = df).fit()

In [9]:
model.summary()

0,1,2,3
Dep. Variable:,SL,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,2.074
Date:,"Tue, 11 Nov 2025",Prob (F-statistic):,0.152
Time:,13:19:06,Log-Likelihood:,-183.0
No. Observations:,150,AIC:,370.0
Df Residuals:,148,BIC:,376.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.5262,0.479,13.628,0.000,5.580,7.473
SW,-0.2234,0.155,-1.440,0.152,-0.530,0.083

0,1,2,3
Omnibus:,4.389,Durbin-Watson:,0.952
Prob(Omnibus):,0.111,Jarque-Bera (JB):,4.237
Skew:,0.36,Prob(JB):,0.12
Kurtosis:,2.6,Cond. No.,24.2


In [10]:
model = ols(formula='SL ~ PL', data = df).fit()

In [12]:
model.summary() # SL = 0.4089 * PL + 4.3066

0,1,2,3
Dep. Variable:,SL,R-squared:,0.76
Model:,OLS,Adj. R-squared:,0.758
Method:,Least Squares,F-statistic:,468.6
Date:,"Tue, 11 Nov 2025",Prob (F-statistic):,1.0400000000000001e-47
Time:,13:23:58,Log-Likelihood:,-77.02
No. Observations:,150,AIC:,158.0
Df Residuals:,148,BIC:,164.1
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.3066,0.078,54.939,0.000,4.152,4.462
PL,0.4089,0.019,21.646,0.000,0.372,0.446

0,1,2,3
Omnibus:,0.207,Durbin-Watson:,1.867
Prob(Omnibus):,0.902,Jarque-Bera (JB):,0.346
Skew:,0.069,Prob(JB):,0.841
Kurtosis:,2.809,Cond. No.,10.3


In [13]:
model.predict(df)

0      4.879095
1      4.879095
2      4.838202
3      4.919987
4      4.879095
         ...   
145    6.432999
146    6.351215
147    6.432999
148    6.514784
149    6.392107
Length: 150, dtype: float64

### SKLEARN LINER REGRESSION

In [75]:
from sklearn.linear_model import LinearRegression

In [76]:
model = LinearRegression().fit(X = df[['PL']], y = df['PW'])

In [77]:
model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [78]:
model.coef_

array([0.41575542])

In [22]:
model.predict(df[['PL']])

array([0.21898206, 0.21898206, 0.17740652, 0.2605576 , 0.21898206,
       0.34370869, 0.21898206, 0.2605576 , 0.21898206, 0.2605576 ,
       0.2605576 , 0.30213314, 0.21898206, 0.09425544, 0.13583098,
       0.2605576 , 0.17740652, 0.21898206, 0.34370869, 0.2605576 ,
       0.34370869, 0.2605576 , 0.0526799 , 0.34370869, 0.42685977,
       0.30213314, 0.30213314, 0.2605576 , 0.21898206, 0.30213314,
       0.30213314, 0.2605576 , 0.2605576 , 0.21898206, 0.2605576 ,
       0.13583098, 0.17740652, 0.21898206, 0.17740652, 0.2605576 ,
       0.17740652, 0.17740652, 0.17740652, 0.30213314, 0.42685977,
       0.21898206, 0.30213314, 0.21898206, 0.2605576 , 0.21898206,
       1.59097494, 1.50782385, 1.67412602, 1.29994614, 1.54939939,
       1.50782385, 1.59097494, 1.00891735, 1.54939939, 1.2583706 ,
       1.09206844, 1.38309723, 1.29994614, 1.59097494, 1.13364398,
       1.46624831, 1.50782385, 1.34152169, 1.50782385, 1.2583706 ,
       1.63255048, 1.29994614, 1.67412602, 1.59097494, 1.42467

In [23]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [24]:
df.head()

Unnamed: 0,SL,SW,PL,PW,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [25]:
df['PW_from_PL'] = model.predict(df[['PL']])

In [26]:
df.head()

Unnamed: 0,SL,SW,PL,PW,species,PW_from_PL
0,5.1,3.5,1.4,0.2,setosa,0.218982
1,4.9,3.0,1.4,0.2,setosa,0.218982
2,4.7,3.2,1.3,0.2,setosa,0.177407
3,4.6,3.1,1.5,0.2,setosa,0.260558
4,5.0,3.6,1.4,0.2,setosa,0.218982


In [27]:
mean_absolute_error(y_true=df['PW'], y_pred=df['PW_from_PL'])

0.15647051371014092

In [30]:
mean_squared_error(y_true=df['PW'], y_pred=df['PW_from_PL'])

0.04206730919499318

In [31]:
mean_squared_error(y_true=df['PW'], y_pred=df['PW_from_PL']) ** 0.5

0.20510316719883478

# Q1

In [35]:
import pandas as pd
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split

In [33]:
df_q1 = pd.read_csv('bike.csv')
df_q1.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [36]:
df_q1_train, df_q1_test = train_test_split(df_q1, train_size=0.7, random_state=123)

In [48]:
model = ols(formula='registered~temp', data=df_q1_train)

In [49]:
model = model.fit()

In [50]:
model.summary()

0,1,2,3
Dep. Variable:,registered,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,902.3
Date:,"Tue, 11 Nov 2025",Prob (F-statistic):,1.92e-187
Time:,14:41:17,Log-Likelihood:,-48650.0
No. Observations:,7620,AIC:,97300.0
Df Residuals:,7618,BIC:,97320.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,27.5151,4.559,6.036,0.000,18.579,36.452
temp,6.3391,0.211,30.038,0.000,5.925,6.753

0,1,2,3
Omnibus:,2097.525,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5337.402
Skew:,1.502,Prob(JB):,0.0
Kurtosis:,5.79,Cond. No.,60.1


# Q2

In [51]:
import pandas as pd
from statsmodels.formula.api import ols

In [52]:
df_q2 = pd.read_csv('bike.csv')
df_q2.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [53]:
from sklearn.model_selection import train_test_split

In [67]:
df_q2_train, df_q2_test = train_test_split(df_q2, train_size=0.7, random_state=123)

In [55]:
model = ols(formula='casual~atemp', data=df_q2_train).fit()

In [56]:
model.summary()

0,1,2,3
Dep. Variable:,casual,R-squared:,0.219
Model:,OLS,Adj. R-squared:,0.219
Method:,Least Squares,F-statistic:,2138.0
Date:,"Tue, 11 Nov 2025",Prob (F-statistic):,0.0
Time:,14:49:18,Log-Likelihood:,-39689.0
No. Observations:,7620,AIC:,79380.0
Df Residuals:,7618,BIC:,79400.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-29.2974,1.498,-19.554,0.000,-32.234,-26.360
atemp,2.7672,0.060,46.243,0.000,2.650,2.885

0,1,2,3
Omnibus:,4125.373,Durbin-Watson:,1.973
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34148.771
Skew:,2.494,Prob(JB):,0.0
Kurtosis:,12.092,Cond. No.,74.1


In [68]:
model.predict(df_q2_test)

6495    31.499001
7050    12.626390
558     10.537120
5085    33.588271
3328    73.422763
          ...    
3733    41.986859
24      33.588271
9650    54.563988
6246    16.818766
5239     0.049261
Length: 3266, dtype: float64

In [70]:
df_q2_test['predict'] = model.predict(df_q2_test)

In [71]:
df_q2_test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,predict
6495,2012-03-07 21:00:00,1,0,1,1,18.04,21.97,62,16.9979,11,177,188,31.499001
7050,2012-04-12 03:00:00,2,0,1,1,13.12,15.15,61,16.9979,2,3,5,12.62639
558,2011-02-06 10:00:00,1,0,0,1,12.3,14.395,52,16.9979,15,74,89,10.53712
5085,2011-12-05 23:00:00,4,0,1,2,18.86,22.725,88,8.9981,8,54,62,33.588271
3328,2011-08-08 14:00:00,3,0,1,1,33.62,37.12,43,19.9995,41,109,150,73.422763


In [72]:
from sklearn.metrics import mean_squared_error

In [73]:
mean_squared_error(y_true = df_q2_test['casual'], y_pred=df_q2_test['predict']) ** 0.5

44.46237010271433

# Q3

In [97]:
import pandas as pd
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error

In [98]:
df_q3 = pd.read_csv('bike.csv')
df_q3.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [99]:
df_q3_train, df_q3_test = train_test_split(df_q3, train_size = 0.7, random_state = 123)

In [100]:
model = ols(formula = 'casual ~ atemp', data = df_q3_train).fit()

In [101]:
df_q3_test['predict'] = model.predict(df_q3_test)

In [102]:
df_q3_test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,predict
6495,2012-03-07 21:00:00,1,0,1,1,18.04,21.97,62,16.9979,11,177,188,31.499001
7050,2012-04-12 03:00:00,2,0,1,1,13.12,15.15,61,16.9979,2,3,5,12.62639
558,2011-02-06 10:00:00,1,0,0,1,12.3,14.395,52,16.9979,15,74,89,10.53712
5085,2011-12-05 23:00:00,4,0,1,2,18.86,22.725,88,8.9981,8,54,62,33.588271
3328,2011-08-08 14:00:00,3,0,1,1,33.62,37.12,43,19.9995,41,109,150,73.422763


In [103]:
df_q3_test[df_q3_test['season'] == 2]['casual']

7050      2
1368     24
1535     19
7626      0
2259      3
       ... 
1941     88
7504     14
7737    171
7520    233
2100     34
Name: casual, Length: 838, dtype: int64

In [104]:
rmse_2 = mean_squared_error(y_true=df_q3_test[df_q3_test['season'] == 2]['casual'], y_pred=df_q3_test[df_q3_test['season'] == 2]['predict']) ** 0.5

In [105]:
rmse_2

55.16232790957267

In [106]:
rmse_4 = mean_squared_error(y_true=df_q3_test[df_q3_test['season'] == 4]['casual'], y_pred=df_q3_test[df_q3_test['season'] == 4]['predict']) ** 0.5

In [107]:
rmse_4

40.57002529173707

In [108]:
rmse_2 - rmse_4

14.5923026178356

In [119]:
df_q3 = pd.read_csv('bike.csv')
df_q3_2 = df_q3.loc[df_q3['season'] == 2, ]
df_q3_4 = df_q3.loc[df_q3['season'] == 4, ]

In [120]:
df_q3.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [121]:
df_q3_2.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
1323,2011-04-01 00:00:00,2,0,1,3,10.66,12.88,100,11.0014,0,6,6
1324,2011-04-01 01:00:00,2,0,1,3,10.66,12.88,100,11.0014,0,4,4
1325,2011-04-01 02:00:00,2,0,1,3,10.66,12.88,93,12.998,0,7,7
1326,2011-04-01 03:00:00,2,0,1,2,9.84,11.365,93,16.9979,0,4,4
1327,2011-04-01 04:00:00,2,0,1,2,9.84,11.365,93,16.9979,0,3,3


In [122]:
df_q3_2_train, df_q3_2_test = train_test_split(df_q3_2, train_size=0.7, random_state=123)

In [123]:
df_q3_4_train, df_q3_4_test = train_test_split(df_q3_4, train_size=0.7, random_state=123)

In [126]:
model_2 = ols(formula = 'casual ~ atemp', data = df_q3_2_train).fit()

In [127]:
model_4 = ols(formula = 'casual ~ atemp', data = df_q3_4_train).fit()

In [128]:
df_q3_2_test['predict'] = model_2.predict(df_q3_2_test)

In [129]:
df_q3_4_test['predict'] = model_4.predict(df_q3_4_test)

In [131]:
rmse_2 = mean_squared_error(y_true=df_q3_2_test['casual'], y_pred=df_q3_2_test['predict']) ** 0.5

In [132]:
rmse_4 = mean_squared_error(y_true=df_q3_4_test['casual'], y_pred=df_q3_4_test['predict']) ** 0.5

In [133]:
rmse_2 - rmse_4

8.648423450414171