<a href="https://colab.research.google.com/github/20adityasingh/MLPractice1/blob/main/MLPractice1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [168]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
data = pd.read_csv("Raw_Housing_Prices.csv")

In [169]:
data["year_house_was_sold"] = pd.DatetimeIndex(data["Date House was Sold"]).year

In [170]:
data.dropna(subset=["Sale Price"], inplace=True, axis=0)

In [171]:
impute = SimpleImputer(missing_values=np.nan, strategy="median")
for x in data.columns:
    if data[x].isnull().sum() == 0 or data[x].dtype == "object":
        continue
    else:
        data[x] = impute.fit_transform(data[x].values.reshape(-1, 1))[:, 0]

In [172]:
data["No of Times Visited"].unique()

array([nan, 'Thrice', 'Four', 'Twice', 'Once'], dtype=object)

In [173]:
mapping = {"Once":1, "Twice":2, "Thrice":3, "Four":4}
data["No of Times Visited"] = data["No of Times Visited"].map(mapping)
data["No of Times Visited"].unique()


array([nan,  3.,  4.,  2.,  1.])

In [174]:
data['No of Times Visited'].corr(data['Sale Price'])

0.3035210049184712

In [175]:
data['year_since_renovation'] = np.where(data['Renovated Year'] == 0, 0, abs(data['year_house_was_sold'] - data['Renovated Year']))

In [176]:
data.drop(['No of Times Visited'], axis=1, inplace=True)
data["ever_renovated"] = np.where(data['year_since_renovation'] == 0, "No", "Yes")

In [177]:
data.drop(['Date House was Sold', 'Renovated Year','year_house_was_sold'], axis=1, inplace=True)

In [178]:
data = pd.get_dummies(data,columns=["Waterfront View"] ,drop_first=True)

In [179]:
data = pd.get_dummies(data,columns=["Condition of the House","ever_renovated"] ,drop_first=True)

In [180]:
data["Sale Price"].describe()

Unnamed: 0,Sale Price
count,21609.0
mean,540198.4
std,367389.0
min,75000.0
25%,321950.0
50%,450000.0
75%,645000.0
max,7700000.0


In [181]:
q1 = np.quantile(data["Sale Price"], 0.25)
q3 = np.quantile(data["Sale Price"], 0.75)
iqr = q3 - q1
upper_limit = q3 + 1.5 * iqr
lower_limit = q1 - 1.5 * iqr
def limit_price(x):
    if x > upper_limit:
        return upper_limit
    elif x < lower_limit:
        return lower_limit
    else:
        return x
data["Sale Price"] = data["Sale Price"].apply(limit_price)

In [182]:
data["Sale Price"].describe()

Unnamed: 0,Sale Price
count,21609.0
mean,511618.6
std,250062.0
min,75000.0
25%,321950.0
50%,450000.0
75%,645000.0
max,1129575.0


In [183]:
ziptable = data.groupby("Zipcode").agg({"Sale Price":"mean"}).sort_values("Sale Price", ascending=True)
ziptable = pd.DataFrame(ziptable)
ziptable["Zipcode_Group"] = pd.cut(ziptable["Sale Price"], bins=10, labels=["Group 1", "Group 2", "Group 3", "Group 4", "Group 5", "Group6", "Group 7", "Group 8", "Group 9", "Group 10"], include_lowest=True)
ziptable = ziptable.drop("Sale Price", axis=1)
data = pd.merge(data, ziptable, on="Zipcode", how="left")
data = data.drop(["Zipcode", "ID"], axis=1)
data = pd.get_dummies(data, columns=["Zipcode_Group"], drop_first=True)

In [184]:
data.shape[1]

30

In [185]:
for x in data.columns:
    if data[x].dtype == "bool":
      mapp = {True:1, False:0}
      data[x] = data[x].map(mapp)

In [186]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
data_copy = data.copy()
data_copy.drop("Sale Price", axis=1, inplace=True)
vif_data = pd.DataFrame()
vif_data["feature"] = data_copy.columns
vif_data["VIF"] = [variance_inflation_factor(data_copy.values, i) for i in range(data_copy.shape[1])]
while vif_data["VIF"].max() > 5:
    feature_with_highest_vif = vif_data["feature"][vif_data["VIF"].idxmax()]
    data_copy.drop(feature_with_highest_vif, axis=1, inplace=True)
    vif_data = pd.DataFrame()
    vif_data["feature"] = data_copy.columns
    vif_data["VIF"] = [variance_inflation_factor(data_copy.values, i) for i in range(data_copy.shape[1])]
vif_data


Unnamed: 0,feature,VIF
0,Lot Area (in Sqft),2.357597
1,Area of the House from Basement (in Sqft),3.747353
2,Basement Area (in Sqft),1.53151
3,Age of House (in Years),3.629218
4,Lot Area after Renovation (in Sqft),2.55501
5,year_since_renovation,2.851381
6,Waterfront View_Yes,1.031827
7,Condition of the House_Excellent,1.282966
8,Condition of the House_Good,1.643269
9,Condition of the House_Okay,1.029246


In [187]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [188]:
ss = StandardScaler()
X = data_copy
y = data["Sale Price"]
X = ss.fit_transform(X)

In [189]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
print("R2 Score: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))

R2 Score:  0.7869923958297907
MSE:  13639601509.033136
MAE:  84595.13985459432
