In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/housing-prices-dataset/Housing.csv


In [3]:
# In this notebook, we build models that predict house prices from the provided features.

In [2]:
# We start by importing the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error


In [4]:
# Load the housing dataset from the Kaggle input directory and preview its first rows.
HOUSING_CSV = "/kaggle/input/housing-prices-dataset/Housing.csv"
data = pd.read_csv(HOUSING_CSV)
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [14]:
# Show the column labels to see which fields the dataset provides.
column_labels = data.columns
print(column_labels)

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')


In [5]:
# Missing-value check: count nulls per column, then print only the columns that have any.
missing = data.isna().sum()
has_missing = missing.gt(0)
print(missing.loc[has_missing])


Series([], dtype: int64)


In [6]:
#data cleaning

# Drop columns with too many missing values.
# The columns to drop are derived from the data itself (share of nulls above the
# threshold). A fixed list of names combined with errors='ignore' silently does
# nothing when the dataset does not contain those names.
MAX_MISSING_FRACTION = 0.5
missing_fraction = data.isnull().mean()
sparse_cols = missing_fraction[missing_fraction > MAX_MISSING_FRACTION].index
data = data.drop(columns=sparse_cols)

# Fill numerical columns with median
num_cols = data.select_dtypes(include='number').columns
data[num_cols] = data[num_cols].fillna(data[num_cols].median())

# Fill categorical columns with mode
cat_cols = data.select_dtypes(include='object').columns
for col in cat_cols:
    modes = data[col].mode()
    # mode() is empty when the column holds no non-null values; skip the fill
    # in that case instead of failing on the lookup of the first mode.
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])


In [8]:
# Encode categorical data to numbers
from sklearn.preprocessing import LabelEncoder

for column in cat_cols:
    # A new encoder is created for every column, so each column gets its own
    # label-to-integer mapping.
    # NOTE(review): scikit-learn documents LabelEncoder for target labels;
    # confirm that integer codes are the intended encoding for these features.
    le = LabelEncoder()
    labels_as_text = data[column].astype(str)
    data[column] = le.fit_transform(labels_as_text)


In [13]:
# We now split the data into 80% training and 20% testing

from sklearn.model_selection import train_test_split


# The `price` column is the target; every other column is used as a feature.
y = data['price']
X = data.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=35,
)


In [15]:
# Feature standardization: the scaler is fitted on the training split only and
# then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [16]:
# Train two regularized linear models on the standardized training features.
LASSO_ALPHA = 0.001  # alpha passed to Lasso
RIDGE_ALPHA = 10     # alpha passed to Ridge

lasso = Lasso(alpha=LASSO_ALPHA)
lasso.fit(X_train_scaled, y_train)

ridge = Ridge(alpha=RIDGE_ALPHA)
# Kept as the cell's last expression, so its return value is what the cell displays.
ridge.fit(X_train_scaled, y_train)

In [17]:
#evaluation

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted values.

    Parameters
    ----------
    y_true : observed target values, forwarded to ``mean_squared_error``.
    y_pred : predicted target values, forwarded to ``mean_squared_error``.

    Returns
    -------
    The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Predict on the standardized test features with both fitted models.
lasso_pred = lasso.predict(X_test_scaled)
ridge_pred = ridge.predict(X_test_scaled)

# Report the test RMSE of each model, Lasso first and Ridge second.
for heading, predictions in (("Lasso RMSE:", lasso_pred), ("Ridge RMSE:", ridge_pred)):
    print(heading, rmse(y_test, predictions))


Lasso RMSE: 924562.7245147332
Ridge RMSE: 920544.0937234461


In [18]:
# Put the held-out targets next to each model's predictions and save them as CSV.
prediction_columns = {
    "Actual": y_test,
    "Lasso_Pred": lasso_pred,
    "Ridge_Pred": ridge_pred,
}
output = pd.DataFrame(prediction_columns)
# index=False: only the three value columns are written, not the row labels.
output.to_csv("predictions.csv", index=False)
