Load the dataset and rename the target column

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt


# Load the raw table from disk.
# NOTE(review): the column labels printed below ("1", "60", "RL", "65", ...) look like
# data values rather than field names, which suggests the file has no header row and
# the first record was consumed as the header — confirm. If so, read with header=None
# and assign real column names; later cells select columns by these labels and would
# have to be updated together with that change.
df = pd.read_csv("regression_data.csv")

# Force every column label to str so columns can be addressed by string name later.
df.columns = [str(col) for col in df.columns]

# The last column is the regression target; give it a stable, descriptive name.
df = df.rename(columns={df.columns[-1]: "SalePrice"})

print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe())

   1  60  RL    65   8450  Pave   NA  Reg  Lvl  AllPub  ... 0.8 NA.2   NA.3  \
0  2  20  RL  80.0   9600  Pave  NaN  Reg  Lvl  AllPub  ...   0  NaN    NaN   
1  3  60  RL  68.0  11250  Pave  NaN  IR1  Lvl  AllPub  ...   0  NaN    NaN   
2  4  70  RL  60.0   9550  Pave  NaN  IR1  Lvl  AllPub  ...   0  NaN    NaN   
3  5  60  RL  84.0  14260  Pave  NaN  IR1  Lvl  AllPub  ...   0  NaN    NaN   
4  6  50  RL  85.0  14115  Pave  NaN  IR1  Lvl  AllPub  ...   0  NaN  MnPrv   

   NA.4  0.9 2.2  2008  WD   Normal  SalePrice  
0   NaN    0   5  2007  WD   Normal     181500  
1   NaN    0   9  2008  WD   Normal     223500  
2   NaN    0   2  2006  WD  Abnorml     140000  
3   NaN    0  12  2008  WD   Normal     250000  
4  Shed  700  10  2009  WD   Normal     143000  

[5 rows x 81 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 81 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   1          1459 

Checking for null values and duplicate rows

In [3]:
# Per-column count of missing values, followed by the number of fully duplicated rows.
null_counts = df.isnull().sum()
print(null_counts)

duplicate_count = df.duplicated().sum()
print(f'Duplicates: {duplicate_count}')


1              0
60             0
RL             0
65           259
8450           0
            ... 
2.2            0
2008           0
WD             0
Normal         0
SalePrice      0
Length: 81, dtype: int64
Duplicates: 0


Dropping columns that contain null values

In [4]:
# Discard every column that has at least one missing value, then confirm that no
# nulls remain in the columns that were kept.
df = df.dropna(axis="columns")

remaining_nulls = df.isnull().sum()
print(remaining_nulls)

1            0
60           0
RL           0
8450         0
Pave         0
            ..
2.2          0
2008         0
WD           0
Normal       0
SalePrice    0
Length: 62, dtype: int64


One-hot encoding for object (categorical) values

In [5]:
# Expand the categorical columns into indicator columns; drop_first=True omits the
# first level of each categorical so k levels become k-1 indicators.
df_encoded = pd.get_dummies(data=df, drop_first=True)

# Report the resulting dimensions and preview the first rows.
print(df_encoded.shape)
print(df_encoded.head())

(1459, 190)
   1  60   8450  7  5  2003  2003.1  706  0  150  ...  WD_ConLI  WD_ConLw  \
0  2  20   9600  6  8  1976    1976  978  0  284  ...     False     False   
1  3  60  11250  7  5  2001    2002  486  0  434  ...     False     False   
2  4  70   9550  7  5  1915    1970  216  0  540  ...     False     False   
3  5  60  14260  8  5  2000    2000  655  0  490  ...     False     False   
4  6  50  14115  5  5  1993    1995  732  0   64  ...     False     False   

   WD_New  WD_Oth  WD_WD  Normal_AdjLand  Normal_Alloca  Normal_Family  \
0   False   False   True           False          False          False   
1   False   False   True           False          False          False   
2   False   False   True           False          False          False   
3   False   False   True           False          False          False   
4   False   False   True           False          False          False   

   Normal_Normal  Normal_Partial  
0           True           False  
1         

Checking the highest correlations with the target

In [6]:
# Build the pairwise correlation matrix of the encoded frame, then keep only the
# SalePrice column, ordered from the largest value downwards.
corr_matrix = df_encoded.corr()
correlation_with_target = corr_matrix["SalePrice"].sort_values(ascending=False)

# Show the 15 largest entries (the first one is SalePrice against itself).
top_n = 15
print(correlation_with_target.head(top_n))


SalePrice          1.000000
7                  0.790974
1710               0.708599
2.1                0.640383
548                0.623400
856                0.613760
856.1              0.606197
2                  0.560619
8                  0.533680
2003               0.522867
2003.1             0.507047
PConc_PConc        0.497702
0.3                0.467319
Gd_Gd              0.452458
CollgCr_NridgHt    0.402229
Name: SalePrice, dtype: float64


selecting best features and testing model

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

# Rank candidate predictors by their correlation with the target, leaving out the
# target's own entry.
feature_ranking = correlation_with_target.drop("SalePrice")

# Take the predictors straight from the ranking instead of re-typing labels from the
# printed output. correlation_with_target is sorted in descending order, so head()
# yields the most positively correlated columns, in rank order.
N_TOP_FEATURES = 14
selected_features = feature_ranking.head(N_TOP_FEATURES).index.tolist()

X = df_encoded[selected_features]

# Regression target: log(1 + SalePrice).
y = np.log1p(df_encoded["SalePrice"])

In [8]:
from sklearn.preprocessing import StandardScaler


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [9]:
# Fit ordinary least squares on the training split (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score them with R².
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Show results
print(r2)

0.8166875951816633


1. Ridge Regression (L2 regularization)

In [10]:
from sklearn.linear_model import Ridge

# L2-penalised linear model; alpha sets the penalty strength and can be tuned.
ridge = Ridge(alpha=0.0001).fit(X_train, y_train)
y_pred = ridge.predict(X_test)

ridge_r2 = r2_score(y_test, y_pred)
print("R² Ridge:", ridge_r2)


R² Ridge: 0.8166875950792027


2. Lasso Regression (L1 regularization)

In [11]:
from sklearn.linear_model import Lasso

# L1-penalised linear model; max_iter caps the number of solver iterations.
lasso = Lasso(alpha=0.001, max_iter=10000).fit(X_train, y_train)
y_pred = lasso.predict(X_test)

lasso_r2 = r2_score(y_test, y_pred)
print("R² Lasso:", lasso_r2)


R² Lasso: 0.8161219676842051


3. ElasticNet (mix of L1 & L2)

In [12]:
from sklearn.linear_model import ElasticNet

# Linear model with a combined L1/L2 penalty; l1_ratio=0.5 weights the two equally.
enet = ElasticNet(alpha=0.001, l1_ratio=0.5, max_iter=1000).fit(X_train, y_train)
y_pred = enet.predict(X_test)

enet_r2 = r2_score(y_test, y_pred)
print("R² ElasticNet:", enet_r2)


R² ElasticNet: 0.8164773296997745


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1) Choose the feature columns
features = ['7', '1710', '2.1', '548', '856', '856.1', '2', '8',
            '2003', '2003.1', 'PConc_PConc', '0.3', 'Gd_Gd', 'CollgCr_NridgHt']

# 2) Cast only the chosen columns to float, so boolean indicator columns become
#    0.0/1.0. Selecting first avoids converting every column of df_encoded only to
#    discard most of them.
X = df_encoded[features].astype(float)

# 3) Train-test split (y is the target series defined in an earlier cell)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4) Standardize AFTER splitting: the scaler is fitted on the training rows only and
#    the same fitted transform is then applied to the test rows.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
import numpy as np

# Hyper-parameter grid for SGDRegressor: penalty type, penalty strength, and the
# constant learning rate.
penalties = ["l1", "l2", "elasticnet"]
alphas = [0.0001, 0.001, 0.01]
etas = [0.001, 0.01, 0.1]  # learning rates

# Every (penalty, alpha, eta) combination, enumerated in the same order as three
# nested loops over penalties, then alphas, then etas.
param_grid = [(p, a, e) for p in penalties for a in alphas for e in etas]

# Despite the name, this collects the result of every configuration; it is only
# narrowed to the best ones when printed.
best_results = []

for penalty, alpha, eta in param_grid:
    sgd = SGDRegressor(
        penalty=penalty,
        alpha=alpha,
        learning_rate="constant",
        eta0=eta,
        max_iter=10000,
        tol=1e-5,
        random_state=42,
    )
    sgd.fit(X_train_std, y_train)
    y_pred = sgd.predict(X_test_std)
    score = r2_score(y_test, y_pred)
    best_results.append((penalty, alpha, eta, score))

# Order configurations by R² (highest first) and print the ten best.
# NOTE(review): the score used for ranking is computed on the test split, so the
# reported best R² is selected on the same data it is evaluated on — consider a
# separate validation split or cross-validation.
best_results.sort(key=lambda row: row[3], reverse=True)
for penalty, alpha, eta, score in best_results[:10]:
    print(f"SGD | penalty={penalty} | alpha={alpha} | eta0={eta} => R² = {score:.4f}")


SGD | penalty=l2 | alpha=0.01 | eta0=0.001 => R² = 0.8180
SGD | penalty=elasticnet | alpha=0.001 | eta0=0.001 => R² = 0.8177
SGD | penalty=l2 | alpha=0.001 | eta0=0.001 => R² = 0.8177
SGD | penalty=l1 | alpha=0.0001 | eta0=0.001 => R² = 0.8176
SGD | penalty=elasticnet | alpha=0.0001 | eta0=0.001 => R² = 0.8176
SGD | penalty=l2 | alpha=0.0001 | eta0=0.001 => R² = 0.8176
SGD | penalty=elasticnet | alpha=0.01 | eta0=0.001 => R² = 0.8175
SGD | penalty=l1 | alpha=0.001 | eta0=0.001 => R² = 0.8174
SGD | penalty=l1 | alpha=0.01 | eta0=0.001 => R² = 0.8130
SGD | penalty=elasticnet | alpha=0.0001 | eta0=0.01 => R² = 0.8088
