# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn import tree




import warnings
warnings.filterwarnings("ignore")




# Read file

In [2]:
# Load the cleaned housing data and take a quick look at it.
# The pandas display limits are lifted so no column or row is truncated
# when a frame is shown.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

df = pd.read_csv("clean_housing_data.csv")
print("Shape: ",df.shape)
df.head()

Shape:  (57007, 21)


Unnamed: 0,yearMonth_sale,case_type_dk,zip_code_name,erts89_utm32_x,erts89_utm32_y,ed50_x,ed50_y,wgs84_lat,wgs84_lon,residental_area,measured_area,energy_labeled_required,energy_labeled,amount_of_toilets,year_of_construction,renovation_year,first_offer_price,lastest_announced_price,sold_price,days_on_the_market_all_broker,change_broker
0,201805,Rækkehus,København Ø,724656.3099617342,6177510.129902037,724738.3771995398,6177716.274317907,5569125957,1257452761,121,136.0,True,c,2,1882,0,9895000,9500000,8500000,79,0
1,201805,Rækkehus,København Ø,724612.6998841494,6177539.480294425,724694.7669242901,6177745.625006782,5569154296,1257385928,136,151.0,True,c,2,1882,1,9595000,9195000,8750000,96,0
2,202006,Villa,København S,726339.2701369224,6172005.419336407,726421.3337691108,6172211.5248479135,5564111328,1259671021,107,126.0,True,d,1,1927,0,4195000,3995000,3900000,148,0
3,202103,Villa,København S,726314.7447286966,6171972.668490626,7263968081.0,6172178.773859339,5564083099,125962944,140,136.0,True,a,2,2017,0,7495000,7495000,7595000,17,0
4,201108,Villa,Brønshøj,718735.9998927611,6178448.999985718,718818.0324674495,6178655.165567457,5570238375,1248127579,87,259.0,True,g,2,1928,0,2250000,2250000,2275000,15,0


# Prepare data for train_test_split

In [3]:
# For the moment the coordinate columns (latitude / longitude in their
# different reference systems) are not used as features.
excluded_cols = ["erts89_utm32_x", "erts89_utm32_y", "ed50_x",
                 "ed50_y", "wgs84_lat", "wgs84_lon"]

# Drop those columns from the data set - IN CASE WE USE THEM DELETE THIS CELL.
# errors="ignore" keeps the cell idempotent: running it a second time on the
# same kernel (when the columns are already gone) no longer raises a KeyError.
df = df.drop(columns=excluded_cols, errors="ignore")




We need to check whether there are any categorical features that require one-hot encoding, since some of the machine learning models only accept numeric input. 
One way to do this is to inspect the data types of the columns:

In [4]:

# Collect the names of all columns stored with the generic "object" dtype;
# these are the ones treated as categorical features.
categorical_cols = [name for name, dtype in df.dtypes.items() if dtype == "object"]

# One-hot encode those columns (skipped when there is nothing to encode)
if len(categorical_cols) > 0:
    df = pd.get_dummies(df, columns=categorical_cols)

# Report the frame's shape after the dummy columns have been added
print("This is the shape with the dummies:", df.shape)

This is the shape with the dummies: (57007, 37)


# Split data

In [5]:
# Separate the target (sold_price) from the remaining feature columns and
# hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="sold_price"),
    df["sold_price"],
    test_size=0.2,
    random_state=42,
)


# Scaled data

First, a standard scaler is created in case scaling the features leads to better results.

In [6]:


# Standardise the features with StandardScaler: each column has its mean
# removed and is divided by its standard deviation, so the scaled training
# columns end up with zero mean and unit variance (this is NOT a 0-1 min-max
# rescaling).
scaler = StandardScaler()

# The scaler is fitted on the training split only, so that no information
# from the test split leaks into the learned means / standard deviations.
X_train_standard_scaled = scaler.fit_transform(X_train)

# The test split is transformed with the statistics learned from the training
# split; its columns will therefore only be approximately standardised.
X_test_standard_scaled = scaler.transform(X_test)

In [7]:
# Random forest with default hyper-parameters (in current scikit-learn:
# n_estimators=100, max_depth=None). random_state=0 is fixed so the
# comparison between the raw and the scaled features is reproducible.
#
# sold_price is a continuous target, so this is a regression problem and a
# RandomForestRegressor is used. A RandomForestClassifier would treat every
# distinct price as its own class and report exact-match accuracy.
# NOTE: the variable names keep their historical "clf" suffix so that any
# later cell referring to them keeps working.

# Fit on the NON-scaled data as the first point of comparison
rfclf = RandomForestRegressor(random_state=0).fit(X_train, y_train)

# For a regressor, .score() returns the coefficient of determination (R^2)
normal_train_score = rfclf.score(X_train, y_train)
normal_test_score = rfclf.score(X_test, y_test)

print("The score for the NON-scaled data is:  Train score: {:.3f}  ; Test score: {:.3f}\n".format(normal_train_score,normal_test_score))


# Fit a second model on the SCALED features to compare with the first one.
# Only the features are scaled; the target is left in its original units.
rfsclf = RandomForestRegressor(random_state=0).fit(X_train_standard_scaled, y_train)

# R^2 scores on the SCALED data
rfsclf_train_score = rfsclf.score(X_train_standard_scaled, y_train)
rfsclf_test_score = rfsclf.score(X_test_standard_scaled, y_test)

print("The score for the SCALED data is:  Train score: {:.3f}  ; Test score: {:.3f}\n".format(rfsclf_train_score,rfsclf_test_score))


# TODO: plot the train/test scores of both models in a graph.