# Lyons Housing Data Set

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#load the data, change the file address

infile= "/content/drive/MyDrive/Colab Notebooks/Spring 2024/Machine Learning and Data Mining /Data/lyon_housing.csv"
lyon=pd.read_csv(infile)

In [4]:
lyon.head()

Unnamed: 0,date_transaction,type_achat,type_bien,nombre_pieces,surface_logement,surface_carrez_logement,surface_terrain,nombre_parkings,prix,adresse,commune,latitude,longitude,date_construction,anciennete
0,2019-10-31,ancien,maison,5,100.0,,247.0,0,530000.0,6 PAS DES ANTONINS,Villeurbanne,45.781673,4.879333,2003-06-11 11:38:24,16.387783
1,2018-11-26,ancien,maison,2,52.0,,156.0,0,328550.0,12 RUE DU LUIZET,Villeurbanne,45.78324,4.884683,2003-06-11 11:38:24,15.459633
2,2016-08-04,ancien,appartement,1,28.0,28.2,0.0,1,42500.0,4 RUE DE L ESPOIR,Villeurbanne,45.781488,4.883474,2003-06-11 11:38:24,13.148839
3,2016-11-18,ancien,appartement,3,67.0,66.3,0.0,1,180900.0,6 RUE DE L ESPOIR,Villeurbanne,45.781488,4.883474,2003-06-11 11:38:24,13.439058
4,2016-12-16,ancien,appartement,1,28.0,,0.0,1,97000.0,163 AV ROGER SALENGRO,Villeurbanne,45.781488,4.883474,2003-06-11 11:38:24,13.515719


This data is from https://www.kaggle.com/benoitfavier/lyon-housing

type_achat-  ancien means existing house,  VEFA= sale prior to completion

type bien- house or apartment

nombre_pieces- probably number of rooms, use this

Surface_legement-interior space in square meeters

surface_carrez_logment-area with roof height under 1.8 m  (drop this variable)

surface_terrain- drop this

nombre_parkings- parking spots

prix- selling price,   predict this

anciennete- age of the property in years



Convert the dates into Pandas datetime variables, so we can extract the year of the build and the year of the sale

It is also possible to extract quarter of the year from the datetime variables,  or even months,  we could look for seasonality in prices if so inclined

Anyway, extract the year of the sale,   that is a categorical variable we will want

In [5]:
lyon['date_transaction']=pd.to_datetime(lyon['date_transaction'])

In [6]:
lyon['year_transaction']=lyon['date_transaction'].dt.year

In [7]:
lyon['date_construction']=pd.to_datetime(lyon['date_construction'])



In [8]:
lyon['year_construction']=lyon['date_construction'].dt.year

In [9]:
lyon.head()

Unnamed: 0,date_transaction,type_achat,type_bien,nombre_pieces,surface_logement,surface_carrez_logement,surface_terrain,nombre_parkings,prix,adresse,commune,latitude,longitude,date_construction,anciennete,year_transaction,year_construction
0,2019-10-31,ancien,maison,5,100.0,,247.0,0,530000.0,6 PAS DES ANTONINS,Villeurbanne,45.781673,4.879333,2003-06-11 11:38:24,16.387783,2019,2003
1,2018-11-26,ancien,maison,2,52.0,,156.0,0,328550.0,12 RUE DU LUIZET,Villeurbanne,45.78324,4.884683,2003-06-11 11:38:24,15.459633,2018,2003
2,2016-08-04,ancien,appartement,1,28.0,28.2,0.0,1,42500.0,4 RUE DE L ESPOIR,Villeurbanne,45.781488,4.883474,2003-06-11 11:38:24,13.148839,2016,2003
3,2016-11-18,ancien,appartement,3,67.0,66.3,0.0,1,180900.0,6 RUE DE L ESPOIR,Villeurbanne,45.781488,4.883474,2003-06-11 11:38:24,13.439058,2016,2003
4,2016-12-16,ancien,appartement,1,28.0,,0.0,1,97000.0,163 AV ROGER SALENGRO,Villeurbanne,45.781488,4.883474,2003-06-11 11:38:24,13.515719,2016,2003


In [10]:
# how about the age of the property?
lyon['anciennete'].describe()

count    40516.000000
mean        21.246938
std          9.397379
min         -3.853563
25%         15.064690
50%         26.571388
75%         28.775403
max         31.494144
Name: anciennete, dtype: float64

# Convert from a continuous variable into categorical,  using Pandas cut

There is too much detail in the age of properties, use the cut function in pandas to convert this to a limited number of categories



In [11]:
temp=pd.cut(lyon.anciennete,bins=[-5,0,5,10,20,30,40],labels=['UnderConstruction','0-5','5-10','10-20','20-30','30+'])

In [12]:
lyon['age']=temp

In [13]:
lyon.head(3)

Unnamed: 0,date_transaction,type_achat,type_bien,nombre_pieces,surface_logement,surface_carrez_logement,surface_terrain,nombre_parkings,prix,adresse,commune,latitude,longitude,date_construction,anciennete,year_transaction,year_construction,age
0,2019-10-31,ancien,maison,5,100.0,,247.0,0,530000.0,6 PAS DES ANTONINS,Villeurbanne,45.781673,4.879333,2003-06-11 11:38:24,16.387783,2019,2003,10-20
1,2018-11-26,ancien,maison,2,52.0,,156.0,0,328550.0,12 RUE DU LUIZET,Villeurbanne,45.78324,4.884683,2003-06-11 11:38:24,15.459633,2018,2003,10-20
2,2016-08-04,ancien,appartement,1,28.0,28.2,0.0,1,42500.0,4 RUE DE L ESPOIR,Villeurbanne,45.781488,4.883474,2003-06-11 11:38:24,13.148839,2016,2003,10-20


Okay,  that's as far as I will go in addressing a couple of issues there,  your turn.

# Build your predictor of housing prices in Lyons

Predictors- use at least these variables

type_achat, type_bien, nombre_pieces,  surface_logement, nombre_parkings, commune(?), year_transaction (as a category,not an integer, age (category)
  
-one hot encode the categories
                                                                                                        
-standard scale the other data

-combine the standard-scaled and the onehot data into a pd Dataframe

-Build a neural net regressor,   a nearest neighbhor and a linear

-use some metrics,   what is the MSE?,  the R2,  the mean absolute value error?

-use cross validation to figure out which model seems to be best

-use EPI5 to understand what the most important predictors are
                                                                                                    
                                                                                                        

In [17]:
from sklearn.preprocessing import OneHotEncoder
# type_achat
encode_type_achat=OneHotEncoder()
mat_type_achat=encode_type_achat.fit_transform(lyon["type_achat"].to_numpy().reshape(-1,1))
df_mat_type_achat=pd.DataFrame(mat_type_achat.toarray(),columns=encode_type_achat.categories_[0])
df_mat_type_achat.head()

Unnamed: 0,VEFA,ancien
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [18]:
# type_bien
encode_type_bien=OneHotEncoder()
mat_type_bien=encode_type_bien.fit_transform(lyon["type_bien"].to_numpy().reshape(-1,1))
df_mat_type_bien=pd.DataFrame(mat_type_bien.toarray(),columns=encode_type_bien.categories_[0])
df_mat_type_bien.head()

Unnamed: 0,appartement,maison
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [19]:
# commune
encode_commune=OneHotEncoder()
mat_commune=encode_commune.fit_transform(lyon["commune"].to_numpy().reshape(-1,1))
df_mat_commune=pd.DataFrame(mat_commune.toarray(),columns=encode_commune.categories_[0])
df_mat_commune.head()

Unnamed: 0,Lyon 1er Arrondissement,Lyon 2e Arrondissement,Lyon 3e Arrondissement,Lyon 4e Arrondissement,Lyon 5e Arrondissement,Lyon 6e Arrondissement,Lyon 7e Arrondissement,Lyon 8e Arrondissement,Lyon 9e Arrondissement,Villeurbanne
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

# pipeline only the continuous data

lyon_continuous= num_pipeline.fit_transform(lyon[['nombre_pieces', 'surface_logement', 'nombre_parkings', 'year_transaction']])
# put the resulting numpy matrix back into a dataframe
lyon_continuous=pd.DataFrame(lyon_continuous,columns=['nombre_pieces', 'surface_logement', 'nombre_parkings', 'year_transaction'])

In [28]:
x = pd.concat([lyon_continuous,df_mat_type_achat,df_mat_type_bien, df_mat_commune], axis=1)
x

Unnamed: 0,nombre_pieces,surface_logement,nombre_parkings,year_transaction,VEFA,ancien,appartement,maison,Lyon 1er Arrondissement,Lyon 2e Arrondissement,Lyon 3e Arrondissement,Lyon 4e Arrondissement,Lyon 5e Arrondissement,Lyon 6e Arrondissement,Lyon 7e Arrondissement,Lyon 8e Arrondissement,Lyon 9e Arrondissement,Villeurbanne
0,1.870396,1.231567,-0.996559,0.534877,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.671310,-0.469113,-0.996559,-0.172704,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-1.518546,-1.319453,0.666260,-1.587867,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.175925,0.062349,0.666260,-1.587867,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-1.518546,-1.319453,0.666260,-1.587867,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40511,-0.671310,-1.106868,0.666260,1.242459,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40512,-0.671310,-1.142299,-0.996559,1.242459,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40513,-0.671310,-1.496607,-0.996559,1.242459,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40514,-0.671310,-1.106868,-0.996559,1.242459,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [35]:
y = np.ravel(lyon[['prix']].to_numpy())

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y,train_size=0.8,random_state=1)

### Build a neural net regressor,   a nearest neighbhor and a linear


#### Neural Net Regressor


In [43]:
from sklearn.neural_network import MLPRegressor

regr = MLPRegressor(hidden_layer_sizes=(6,3,),random_state=1, max_iter=50000, verbose=False)
regr.fit(X_train, y_train)
y_pred=regr.predict(X_train)

In [44]:
from sklearn.metrics import explained_variance_score

explained_variance_score(y_train,y_pred)

0.7414806561568871

In [45]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_train, y_pred)

6178916751.856773

In [46]:
from sklearn.metrics import mean_absolute_percentage_error

mean_absolute_percentage_error(y_train, y_pred)

0.21836680689444832

In [47]:
from sklearn.metrics import r2_score
r2_score(y_train, y_pred)

0.7414806404427892

#### Linear Model

In [48]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(X_train,y_train)
y_pred_lin=reg.predict(X_train)

In [49]:
explained_variance_score(y_train,y_pred_lin)

0.7140322991659994

#### KNN

In [50]:
from sklearn.neighbors import KNeighborsRegressor

neigh = KNeighborsRegressor(n_neighbors=8)
neigh.fit(X_train, y_train)

y_pred_nn=neigh.predict(X_train)

explained_variance_score(y_train,y_pred_nn)

0.7789657872613085

In [51]:
y_pred_nn=neigh.predict(X_train)

In [52]:
r2_score(y_train,y_pred_nn)

0.7787964541203236

### Use cross validation to figure out which model seems to be best

In [57]:
from sklearn.model_selection import cross_val_score

regr2 = MLPRegressor(hidden_layer_sizes=(6,3,),random_state=1, max_iter=50000, verbose=False)
scores = cross_val_score(regr, X_train, y_train, cv=10,scoring='r2')
print('Mean scores of Neural Network: ', np.mean(scores))
print('STD scores of Neural Network: ', np.std(scores))

Mean scores of Neural Network:  0.738259931332435
STD scores of Neural Network:  0.0321661992545159


In [58]:
reg2 = LinearRegression()
scores = cross_val_score(reg2, X_train, y_train, cv=10,scoring='r2')
print('Mean scores of Linear Regression: ', np.mean(scores))
print('STD scores of Linear Regression: ', np.std(scores))

Mean scores of Linear Regression:  0.7126226719291081
STD scores of Linear Regression:  0.031194997153574214


In [56]:
neigh2 = KNeighborsRegressor(n_neighbors=8)
scores = cross_val_score(neigh2, X_train, y_train, cv=10,scoring='r2')
print('Mean scores of KNN: ', np.mean(scores))
print('STD scores of KNN: ', np.std(scores))

Mean scores of KNN:  0.7125371435901112
STD scores of KNN:  0.030155742519688004


### Answer:
It appears that the NLP aggressor, also known as the neural network, has performed slightly better than the linear regressor and the K-nearest neighbors regressor. We can conclude this because the main score value is 0.73, whereas the other models had a score of 0.71.

It is also important to note that the K-nearest neighbors model appears to be overfitting to the training data. We can conclude this because the R-squared score decreases when it is tested against cross-validation data.

### -use EPI5 to understand what the most important predictors are


In [59]:
!pip install eli5

Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/216.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.2/216.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107717 sha256=55c8b64741bb480f7aec4e0fb15625cf640739863a4b4d219d5ba4bcddfe2a60
  Stored in directory: /root/.cache/pip/wheels/b8/58/ef/2cf4c306898c2338d51540e0922c8e0d6028e07007085c0004
Successfully built eli5
Installing collected packages: eli5
Successfully installed eli5-0.13.0


In [60]:
import eli5
from eli5.sklearn import PermutationImportance

In [64]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# MLPRegressor

model = make_pipeline(
    StandardScaler(),
    MLPRegressor(hidden_layer_sizes=(6,3,),random_state=1, max_iter=50000, verbose=False)
    )
model.fit(X_train,y_train)
perm = PermutationImportance(model, random_state=1).fit(X_train, y_train)
eli5.show_weights(perm, feature_names = x.columns.tolist())

Weight,Feature
1.0285  ± 0.0145,surface_logement
0.0904  ± 0.0047,Villeurbanne
0.0701  ± 0.0026,year_transaction
0.0448  ± 0.0005,Lyon 6e Arrondissement
0.0432  ± 0.0032,Lyon 9e Arrondissement
0.0370  ± 0.0009,maison
0.0339  ± 0.0021,appartement
0.0260  ± 0.0010,Lyon 8e Arrondissement
0.0209  ± 0.0019,Lyon 5e Arrondissement
0.0208  ± 0.0010,VEFA


In [66]:
# LinearRegression

perm = PermutationImportance(reg, random_state=1).fit(X_train, y_train)
eli5.show_weights(perm, feature_names = x.columns.tolist())

Weight,Feature
1.2698  ± 0.0129,surface_logement
0.0646  ± 0.0023,year_transaction
0.0587  ± 0.0017,Villeurbanne
0.0408  ± 0.0014,nombre_pieces
0.0354  ± 0.0009,Lyon 6e Arrondissement
0.0208  ± 0.0009,Lyon 9e Arrondissement
0.0174  ± 0.0014,VEFA
0.0170  ± 0.0014,ancien
0.0162  ± 0.0010,maison
0.0161  ± 0.0015,appartement


In [67]:
# KNeighborsRegressor
perm = PermutationImportance(neigh, random_state=1).fit(X_train, y_train)
eli5.show_weights(perm, feature_names = x.columns.tolist())

Weight,Feature
0.7482  ± 0.0111,surface_logement
0.1074  ± 0.0038,year_transaction
0.0978  ± 0.0049,nombre_pieces
0.0800  ± 0.0057,nombre_parkings
0.0310  ± 0.0013,Villeurbanne
0.0221  ± 0.0014,Lyon 6e Arrondissement
0.0149  ± 0.0008,maison
0.0146  ± 0.0007,appartement
0.0115  ± 0.0005,VEFA
0.0112  ± 0.0005,ancien


### Answer:
It appears that the most important predictor of these models is the surface_logement