In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso

In [2]:
# Load the dataset
data = pd.read_csv('/content/train.csv')

In [3]:
# Display the first few rows of the dataset
print(data.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea  ... MoSold YrSold SaleType SaleCondition SalePrice
0   1          60       RL         65.0     8450  ...      2   2008       WD        Normal    208500
1   2          20       RL         80.0     9600  ...      5   2007       WD        Normal    181500
2   3          60       RL         68.0    11250  ...      9   2008       WD        Normal    223500
3   4          70       RL         60.0     9550  ...      2   2006       WD       Abnorml    140000
4   5          60       RL         84.0    14260  ...     12   2008       WD        Normal    250000

[5 rows x 81 columns]


In [4]:
missing_values = data.isnull().sum()
missing_values = missing_values[missing_values > 0]
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [5]:
for col in data.select_dtypes(include=['float64', 'int64']).columns:
    data[col].fillna(data[col].median(), inplace=True)

for col in data.select_dtypes(include=['object']).columns:
    data[col].fillna(data[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


In [7]:
data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

# Select features and target variable
features = data.drop(['SalePrice', 'Id'], axis=1)
target = data['SalePrice']

# One-hot encode categorical variables
categorical_features = features.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_categorical = encoder.fit_transform(features[categorical_features])

# Scale numerical features
numeric_features = features.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(features[numeric_features])

# Combine the processed features
X_processed = np.hstack((scaled_numerical, encoded_categorical))

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, target, test_size=0.2, random_state=42)

In [9]:
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [10]:
lasso_predictions = lasso_model.predict(X_test)

In [11]:
lasso_r2 = r2_score(y_test, lasso_predictions)
print(f'Lasso Regression R²: {lasso_r2:.2f}')

Lasso Regression R²: 0.90


In [16]:
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_predictions))
print(lasso_rmse)

28178.291031947803


In [12]:
test_data = pd.read_csv('/content/test.csv')

In [13]:
for col in test_data.select_dtypes(include=['float64', 'int64']).columns:
    test_data[col].fillna(test_data[col].median(), inplace=True)

for col in test_data.select_dtypes(include=['object']).columns:
    test_data[col].fillna(test_data[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(test_data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data[col].fillna(test_data[col].mode()[0], inplace=True)


In [14]:
test_data['TotalSF'] = test_data['TotalBsmtSF'] + test_data['1stFlrSF'] + test_data['2ndFlrSF']

encoded_test_categorical = encoder.transform(test_data[categorical_features])

scaled_test_numerical = scaler.transform(test_data[numeric_features])

X_test_final = np.hstack((scaled_test_numerical, encoded_test_categorical))

In [15]:
submission_predictions = lasso_model.predict(X_test_final)

In [17]:
submission_df = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': submission_predictions
})

In [19]:
submission_df.to_csv('submission_lasso.csv', index=False)