Data exploration

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [2]:
# Preview the first five rows to check column names and value formats.
train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Pairwise correlations are only defined for numeric columns, so restrict to those.
train_num = train.select_dtypes(include='number')
corr_matrix = train_num.corr()

# Oversized canvas, presumably so the per-cell annotations stay readable.
fig, ax = plt.subplots(figsize=(50, 60))
sns.heatmap(
    corr_matrix,
    annot=True,        # write the coefficient inside every cell
    fmt='.2f',         # two decimals per annotation
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
plt.show()

Replacing missing values

In [None]:
# List every column that has at least one missing value as
# "<column> <number of missing values> <dtype>", in the frame's column order.
null_counts = train.isnull().sum()
for col, n_missing in null_counts[null_counts > 0].items():
    print(f"{col} {n_missing} {train[col].dtype}")

In [None]:
# Replace missing values with the most frequent value of each column.
#
# Notes:
# * The same rule is applied to numeric and non-numeric columns alike.
# * This runs on the full `train` frame, i.e. before the train/test split made
#   further down, so the fill values are computed from all rows.
# * `Series.mode()` ignores NaN; for a column that is entirely NaN it returns an
#   empty Series and `mode()[0]` would raise. Such columns are left untouched.
has_missing = train.isnull().any()
for col in has_missing[has_missing].index:
    modes = train[col].mode()
    if modes.empty:
        # Nothing observed in this column, so there is no value to fill with.
        continue
    # mode() is sorted; with ties the first (smallest) value is used.
    train[col] = train[col].fillna(modes.iloc[0])


Linear regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

# Separate the target from the predictors. `Id` is dropped together with the
# target: it is a running row identifier (1, 2, 3, ... in the preview of
# `train` above), so any weight the model learns for it is noise.
X = train.drop(columns=['SalePrice', 'Id'])
y = train['SalePrice']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Partition the columns into numeric / non-numeric. Using include/exclude of the
# same selector guarantees every column lands in exactly one group; columns that
# are in neither list would be silently dropped by ColumnTransformer
# (its default is remainder='drop').
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

# Preprocessing: numeric columns are passed through unchanged, non-numeric
# columns are one-hot encoded. handle_unknown='ignore' encodes categories that
# were not seen during fit as all zeros instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Chain preprocessing and the model so the encoder is fitted on X_train only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# RMSE is in the same unit as SalePrice, which makes it easier to interpret than MSE.
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error (RMSE): {rmse}')

# For a regressor pipeline, score() returns the coefficient of determination.
r2 = pipeline.score(X_test, y_test)
print(f'R^2 Score: {r2}')


Ridge and Lasso regression

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.base import clone

# Number of coefficients shown per plot. One-hot encoding produces one feature
# per category level, far more than fit legibly on a single bar chart.
TOP_N_FEATURES = 30

# --- Preprocessing ----------------------------------------------------------
# Numeric columns: fill gaps with the column mean.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Non-numeric columns: fill gaps with the most frequent level, then one-hot
# encode. Levels unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# `numerical_cols` / `categorical_cols` come from the linear-regression cell.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Same split parameters (and seed) as the linear-regression cell, so the
# scores of all three models are computed on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Models -----------------------------------------------------------------
# Pipeline does not copy its steps. The Lasso pipeline therefore gets its own
# clone of the preprocessor; otherwise both pipelines would share one
# transformer object and fitting either model would refit the other's.
ridge_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ridge', Ridge(alpha=1.0))
])

lasso_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('lasso', Lasso(alpha=0.1))
])

ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

# --- Evaluation -------------------------------------------------------------
ridge_pred = ridge_model.predict(X_test)
lasso_pred = lasso_model.predict(X_test)

ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_pred))
ridge_r2 = r2_score(y_test, ridge_pred)

lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_pred))
lasso_r2 = r2_score(y_test, lasso_pred)

print(f"Ridge Regression - RMSE: {ridge_rmse}, R²: {ridge_r2}")
print(f"Lasso Regression - RMSE: {lasso_rmse}, R²: {lasso_r2}")

# --- Coefficients -----------------------------------------------------------
ridge_coeffs = ridge_model.named_steps['ridge'].coef_
lasso_coeffs = lasso_model.named_steps['lasso'].coef_

# Rebuild the feature names in ColumnTransformer output order: numeric columns
# first, then the one-hot columns. The fitted encoder is looked up by its
# transformer name rather than by position in `transformers_`.
fitted_onehot = (
    ridge_model.named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['onehot']
)
feature_names = numerical_cols.tolist() + list(fitted_onehot.get_feature_names_out(categorical_cols))

# Fail with a readable message if names and coefficients do not line up
# (for example if a transformer dropped a column during fit).
assert len(feature_names) == len(ridge_coeffs) == len(lasso_coeffs), (
    f"{len(feature_names)} feature names vs "
    f"{len(ridge_coeffs)} ridge / {len(lasso_coeffs)} lasso coefficients"
)

coefficients_df = pd.DataFrame({
    'Feature': feature_names,
    'Ridge Coefficients': ridge_coeffs,
    'Lasso Coefficients': lasso_coeffs
}).sort_values(by='Ridge Coefficients', ascending=False)


def plot_top_coefficients(coef_df, column, title, top_n=TOP_N_FEATURES):
    """Bar-plot the `top_n` largest-magnitude coefficients found in `column`.

    Parameters
    ----------
    coef_df : pandas.DataFrame
        Frame with a 'Feature' column and the coefficient column to plot.
    column : str
        Name of the coefficient column in `coef_df`.
    title : str
        Title placed above the plot.
    top_n : int, default TOP_N_FEATURES
        How many features to show, ranked by absolute coefficient value.
    """
    strongest = coef_df.loc[coef_df[column].abs().nlargest(top_n).index]
    # Order the bars by the plotted column itself, most positive first.
    strongest = strongest.sort_values(by=column, ascending=False)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(x=column, y='Feature', data=strongest, ax=ax)
    ax.set_title(title)
    plt.show()


plot_top_coefficients(coefficients_df, 'Ridge Coefficients', 'Ridge Regression - Feature Coefficients')
plot_top_coefficients(coefficients_df, 'Lasso Coefficients', 'Lasso Regression - Feature Coefficients')
