In [76]:
import numpy as np
import pandas as pd

In [77]:
from sklearn.datasets import fetch_california_housing
df = fetch_california_housing()

In [78]:
X = df.data
y = df.target

In [79]:
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [80]:
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [81]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [82]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
lr = LinearRegression()
lr.fit(x_train, y_train)

In [83]:
y_pred = lr.predict(x_test)

In [84]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.5943232652466177

In [85]:
# Lasso Regression (L1)
lasso = Lasso(alpha=0.1)
lasso.fit(x_train, y_train)
print("Lasso Regression R^2:", lasso.score(x_test, y_test))

Lasso Regression R^2: 0.5248187411573576


In [86]:
# Ridge Regression (L2)
ridge = Ridge(alpha=0.1)
ridge.fit(x_train, y_train)
print("Ridge Regression R^2:", ridge.score(x_test, y_test))

Ridge Regression R^2: 0.5943216891454707


# Now Lets try to increase the efficiency and remove unwanted columns

In [87]:
X[0]

array([   8.3252    ,   41.        ,    6.98412698,    1.02380952,
        322.        ,    2.55555556,   37.88      , -122.23      ])

In [88]:
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [89]:
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [90]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [91]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
y_train = sc.fit_transform(y_train.reshape(-1, 1))

In [92]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
y_pred = sc.inverse_transform(y_pred)
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.5943232652466204

In [93]:
len(X)

20640

#### So even though we have did the Scaling still the bad results are intact

In [94]:
import numpy as np
import statsmodels.api as sm


# Ensure data is numeric
X = np.array(X, dtype=float)
y = np.array(y, dtype=float)

# Add a column of ones to X for the intercept
X = np.append(arr=np.ones((20640, 1)).astype(int), values=X, axis=1)

# Select the optimal features
X_opt = X[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]]

# Fit the OLS(ordinary least squares) regression model
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()

# Print the summary
print(regressor_OLS.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     3970.
Date:                Mon, 03 Jun 2024   Prob (F-statistic):               0.00
Time:                        22:53:25   Log-Likelihood:                -22624.
No. Observations:               20640   AIC:                         4.527e+04
Df Residuals:                   20631   BIC:                         4.534e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -36.9419      0.659    -56.067      0.0

In [95]:
X_opt = X[:, [0,1, 2, 3 ,4 , 7, 8]]

# Fit the OLS regression model
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()

# Print the summary
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.605
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                     5268.
Date:                Mon, 03 Jun 2024   Prob (F-statistic):               0.00
Time:                        22:53:25   Log-Likelihood:                -22655.
No. Observations:               20640   AIC:                         4.532e+04
Df Residuals:                   20633   BIC:                         4.538e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -37.0727      0.659    -56.263      0.0

#### so , columns 1, 2, 3 ,4 , 7, 8 are good enough for predictions.

In [96]:
df = pd.DataFrame(X)

In [97]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [98]:
df['9'] = pd.Series(y)

In [99]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,1.0,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,1.0,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,1.0,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,1.0,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [100]:
df.drop(0, axis=1,inplace=True)

In [101]:
df.corr()

Unnamed: 0,1,2,3,4,5,6,7,8,9
1,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
2,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
3,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
4,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
5,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
6,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
7,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
8,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
9,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


In [102]:
X = df[[1, 2, 3, 4, 7, 8]].values


In [108]:
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [116]:
X

array([[   8.3252    ,   41.        ,    6.98412698,    1.02380952,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708,    0.97188049,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559,    1.07344633,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273,    1.12009238,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289,    1.17191977,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698,    1.16226415,
          39.37      , -121.24      ]])

In [117]:
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [119]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y,train_size=0.2, random_state=0)

In [120]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)

In [121]:
y_pred = lr.predict(x_test)
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.5786965876653413