In [2]:
# machine learning (linear regression)
# linear regression deals with continous data and is closely assumming not accurate
# EVERYTHING IMPORTED HERE IS FOR LINEAR REGRESSION NOT CLASSIFICATION
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
#import missingno as mns pip install missingno


In [3]:
df = pd.read_csv('house_data.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# to check the missing values, percentage and make it in a dataframe
missing_columns_values = df.isnull().sum()
missing_columns_per = (df.isnull().sum() / len(df)) * 100
total_missing_values = pd.concat([missing_columns_values, missing_columns_per], axis =1, keys = ['missing_values', 'percentage'])
total_missing_values = total_missing_values.sort_values('percentage', ascending=False)
total_missing_values.head(20)


Unnamed: 0,missing_values,percentage
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
MasVnrType,872,59.726027
FireplaceQu,690,47.260274
LotFrontage,259,17.739726
GarageQual,81,5.547945
GarageFinish,81,5.547945
GarageType,81,5.547945


In [5]:
# sometime its adivasable to drop the columns with high percentage bc filling it may affect the prediction
df.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis = 1, inplace = True)

In [6]:
# to fill the other columns
numerical_data = df.select_dtypes(include= ['int', 'float'])
cathegorical_data = df.select_dtypes(include= ['object', 'category'])
for x in numerical_data:
    df[x].fillna(np.mean(df[x]), inplace = True)

    for x in cathegorical_data:
        df[x].fillna(df[x].mode()[0], inplace = True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(np.mean(df[x]), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[x].fillna(df[x].mode()[0], inplace = True)


In [7]:
# now check the missing values, percentage and make it in a dataframe again
missing_columns_values = df.isnull().sum()
missing_columns_per = (df.isnull().sum() / len(df)) * 100
total_missing_values = pd.concat([missing_columns_values, missing_columns_per], axis =1, keys = ['missing_values', 'percentage'])
total_missing_values = total_missing_values.sort_values('percentage', ascending=False)
total_missing_values.head(20)

Unnamed: 0,missing_values,percentage
Id,0,0.0
MSSubClass,0,0.0
MSZoning,0,0.0
LotFrontage,0,0.0
LotArea,0,0.0
Street,0,0.0
LotShape,0,0.0
LandContour,0,0.0
Utilities,0,0.0
LotConfig,0,0.0


In [8]:
# all the process above is called cleaning the data
# machine learing (forecasting or predicting)
# first stage in machine learning is preprocessing that is convert categorical columns to numerical because machine learning only understand number not character
# so we must change the cathegorical column to numbers

encoder = LabelEncoder()
cathegorical_data = df.select_dtypes(include=['object', 'category'])
for x in cathegorical_data:
    df[x] = encoder.fit_transform(df[x])
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,3,3,0,4,...,0,0,0,0,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,3,3,0,2,...,0,0,0,0,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,0,3,0,4,...,0,0,0,0,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,0,3,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,0,3,0,2,...,0,0,0,0,0,12,2008,8,4,250000


In [9]:
# after preprocessing, remove irrelevant columns that dont have an impact on predictions (eg, id, name, description, address)
# next is to seperate our data to x and y
# seperating the dependent varaible from the independent variables (x is the independent and y is the dependent)
x = df.drop(['Id', 'SalePrice'], axis=1)
y = df['SalePrice']

In [10]:
# next stage is to split our data in x and y train and test

xtrain,xtest,ytrain,ytest = train_test_split(x,y, test_size = 0.15)
# meaning that x/y test should be 15 per and x/ytrain should be 85 per

In [11]:
# note: learn about sreamlit in python

# next is to create the model 
model1 = LinearRegression()
# train the model
model1.fit(xtrain, ytrain)


In [12]:
# the next is to make prediction 
predict1 = model1.predict(xtest)
# then evaluate using any of the model below
print(mean_absolute_error(ytest,predict1))

19420.997958585136


In [13]:
print(mean_absolute_percentage_error(ytest,predict1))

0.11528550677954162


In [14]:
print(mean_squared_error(ytest,predict1))

711895774.7380131


In [15]:
print(np.sqrt(mean_squared_error(ytest,predict1)))

26681.375053359097


In [16]:
print(r2_score(ytest, predict1))

0.861355111665163


In [17]:
# you can use other model to predict
model2 = DecisionTreeRegressor()
model2.fit(xtrain,ytrain)

In [18]:
# predict and evaluate
predict2 = model2.predict(xtest)
print(mean_absolute_error(ytest,predict2))
print(mean_absolute_percentage_error(ytest,predict2))
print(mean_squared_error(ytest,predict2))
print(np.sqrt(mean_squared_error(ytest,predict2)))
print(r2_score(ytest, predict2))


26223.69406392694
0.15472154754995532
1678219427.5479453
40966.076545697484
0.6731592553427265


In [19]:
# the next stage is deployment
# remember not to include the id number
x.iloc[[0,1]]


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,3,3,0,4,0,...,61,0,0,0,0,0,2,2008,8,4
1,20,3,80.0,9600,1,3,3,0,2,0,...,0,0,0,0,0,0,5,2007,8,4


In [20]:
# FILL IN THE DATA FOR A NEW PERSON OR HOUSE AND RUN TO SEE WHAT THE PREDICTED SALESPRICE MAY BE
model2.predict([[]])



ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by DecisionTreeRegressor.