In [359]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [360]:
# Importing the datasets
df_good = pd.read_csv('data/Axa/good.csv')
df_eval = pd.read_csv('data/Axa/eval.csv')

In [361]:
# Count NA values per col
df_good[:].isna().sum()

id                    0
date                  0
prix                  0
ville                 0
nombre_lots           0
lot1_surface1       496
lot2_surface1       886
lot3_surface1      1005
lot4_surface1      1015
lot5_surface1      1018
type                  0
surface2            159
nombre_pieces         0
nature_culture      849
surface_terrain     849
dtype: int64

In [362]:
# Head values
df_good.head(10)

Unnamed: 0,id,date,prix,ville,nombre_lots,lot1_surface1,lot2_surface1,lot3_surface1,lot4_surface1,lot5_surface1,type,surface2,nombre_pieces,nature_culture,surface_terrain
0,1,2015-01-16,217000.0,B,0,,,,,,Maison,147.0,5.0,sols,704.0
1,2,2017-09-08,1691000.0,C,1,65.09,,,,,Appartement,71.0,2.0,,
2,3,2018-12-05,43000.0,B,2,50.35,,,,,Appartement,50.0,2.0,,
3,4,2018-08-09,139330.0,B,0,,,,,,Maison,53.0,3.0,sols,235.0
4,5,2016-02-04,665000.0,D,2,71.94,,,,,Appartement,75.0,4.0,,
5,6,2019-07-03,485000.0,D,2,39.02,,,,,Appartement,39.0,2.0,,
6,7,2018-07-06,81900.0,B,2,40.94,,,,,Appartement,42.0,2.0,,
7,8,2017-12-27,1880000.0,D,3,192.08,,,,,Appartement,192.0,7.0,,
8,9,2019-10-17,10000.0,D,1,,,,,,Dépendance,,0.0,,
9,10,2016-08-11,145800.0,A,0,,,,,,Maison,188.0,5.0,sols,188.0


In [363]:
# Show unique values in nature_culture
df_good['nature_culture'].unique()

array(['sols', nan], dtype=object)

In [364]:
# Replace sols with 1 and nan with 0
df_good["nature_culture"].replace({"sols": 1, np.nan: 0}, inplace=True)
df_good['nature_culture'].unique()

array([1, 0], dtype=int64)

In [365]:
# Taking care of missing data
from sklearn.impute import SimpleImputer
imp_zeros = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
nan_to_zeros = ['lot1_surface1', 'lot2_surface1', 'lot3_surface1', 'lot3_surface1', 'lot4_surface1', 'lot5_surface1', 'surface2', 'surface_terrain']
df_good.loc[:, nan_to_zeros] = imp_zeros.fit_transform(df_good.loc[:, nan_to_zeros])
df_good[:].isna().sum()

id                 0
date               0
prix               0
ville              0
nombre_lots        0
lot1_surface1      0
lot2_surface1      0
lot3_surface1      0
lot4_surface1      0
lot5_surface1      0
type               0
surface2           0
nombre_pieces      0
nature_culture     0
surface_terrain    0
dtype: int64

In [366]:
# Head values
df_good.head()

Unnamed: 0,id,date,prix,ville,nombre_lots,lot1_surface1,lot2_surface1,lot3_surface1,lot4_surface1,lot5_surface1,type,surface2,nombre_pieces,nature_culture,surface_terrain
0,1,2015-01-16,217000.0,B,0,0.0,0.0,0.0,0.0,0.0,Maison,147.0,5.0,1,704.0
1,2,2017-09-08,1691000.0,C,1,65.09,0.0,0.0,0.0,0.0,Appartement,71.0,2.0,0,0.0
2,3,2018-12-05,43000.0,B,2,50.35,0.0,0.0,0.0,0.0,Appartement,50.0,2.0,0,0.0
3,4,2018-08-09,139330.0,B,0,0.0,0.0,0.0,0.0,0.0,Maison,53.0,3.0,1,235.0
4,5,2016-02-04,665000.0,D,2,71.94,0.0,0.0,0.0,0.0,Appartement,75.0,4.0,0,0.0


In [367]:
df_good['année'] = pd.to_datetime(df_good['date']).dt.year
df_good['mois'] = pd.to_datetime(df_good['date']).dt.month
df_good['jour'] = pd.to_datetime(df_good['date']).dt.day
df_good = df_good.drop(['date', 'id'], axis=1)

In [368]:
df_good.head()

Unnamed: 0,prix,ville,nombre_lots,lot1_surface1,lot2_surface1,lot3_surface1,lot4_surface1,lot5_surface1,type,surface2,nombre_pieces,nature_culture,surface_terrain,année,mois,jour
0,217000.0,B,0,0.0,0.0,0.0,0.0,0.0,Maison,147.0,5.0,1,704.0,2015,1,16
1,1691000.0,C,1,65.09,0.0,0.0,0.0,0.0,Appartement,71.0,2.0,0,0.0,2017,9,8
2,43000.0,B,2,50.35,0.0,0.0,0.0,0.0,Appartement,50.0,2.0,0,0.0,2018,12,5
3,139330.0,B,0,0.0,0.0,0.0,0.0,0.0,Maison,53.0,3.0,1,235.0,2018,8,9
4,665000.0,D,2,71.94,0.0,0.0,0.0,0.0,Appartement,75.0,4.0,0,0.0,2016,2,4


In [369]:
# Encoding categorical data
dummies = pd.get_dummies(df_good['ville'], prefix='ville')
df_good[dummies.columns] = dummies
df_good = df_good.drop(['ville'], axis=1)
df_good.head()

Unnamed: 0,prix,nombre_lots,lot1_surface1,lot2_surface1,lot3_surface1,lot4_surface1,lot5_surface1,type,surface2,nombre_pieces,nature_culture,surface_terrain,année,mois,jour,ville_A,ville_B,ville_C,ville_D
0,217000.0,0,0.0,0.0,0.0,0.0,0.0,Maison,147.0,5.0,1,704.0,2015,1,16,0,1,0,0
1,1691000.0,1,65.09,0.0,0.0,0.0,0.0,Appartement,71.0,2.0,0,0.0,2017,9,8,0,0,1,0
2,43000.0,2,50.35,0.0,0.0,0.0,0.0,Appartement,50.0,2.0,0,0.0,2018,12,5,0,1,0,0
3,139330.0,0,0.0,0.0,0.0,0.0,0.0,Maison,53.0,3.0,1,235.0,2018,8,9,0,1,0,0
4,665000.0,2,71.94,0.0,0.0,0.0,0.0,Appartement,75.0,4.0,0,0.0,2016,2,4,0,0,0,1


In [370]:
# Encoding categorical data
dummies = pd.get_dummies(df_good['type'], prefix='type')
df_good[dummies.columns] = dummies
df_good = df_good.drop(['type'], axis=1)
df_good.head()

Unnamed: 0,prix,nombre_lots,lot1_surface1,lot2_surface1,lot3_surface1,lot4_surface1,lot5_surface1,surface2,nombre_pieces,nature_culture,...,mois,jour,ville_A,ville_B,ville_C,ville_D,type_Appartement,type_Dépendance,type_Local industriel. commercial ou assimilé,type_Maison
0,217000.0,0,0.0,0.0,0.0,0.0,0.0,147.0,5.0,1,...,1,16,0,1,0,0,0,0,0,1
1,1691000.0,1,65.09,0.0,0.0,0.0,0.0,71.0,2.0,0,...,9,8,0,0,1,0,1,0,0,0
2,43000.0,2,50.35,0.0,0.0,0.0,0.0,50.0,2.0,0,...,12,5,0,1,0,0,1,0,0,0
3,139330.0,0,0.0,0.0,0.0,0.0,0.0,53.0,3.0,1,...,8,9,0,1,0,0,0,0,0,1
4,665000.0,2,71.94,0.0,0.0,0.0,0.0,75.0,4.0,0,...,2,4,0,0,0,1,1,0,0,0


In [371]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
to_normalize = ['prix', 'nombre_lots', 'lot1_surface1', 'lot2_surface1', 'lot3_surface1', 'lot4_surface1', 'lot5_surface1', 'surface2', 'nombre_pieces', 'nature_culture', 'surface_terrain', 'année', 'mois', 'jour']
df_good[to_normalize] = sc.fit_transform(df_good[to_normalize])
# prédiction : 
# y_pred = sc_y.inverse_transform(model.predict(sc_X.transform(np.array([[5.6]]))))
df_good.describe()

Unnamed: 0,prix,nombre_lots,lot1_surface1,lot2_surface1,lot3_surface1,lot4_surface1,lot5_surface1,surface2,nombre_pieces,nature_culture,...,mois,jour,ville_A,ville_B,ville_C,ville_D,type_Appartement,type_Dépendance,type_Local industriel. commercial ou assimilé,type_Maison
count,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,...,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0
mean,6.275647e-17,-5.2297060000000006e-17,-2.396949e-17,6.275647e-17,2.7891770000000004e-17,6.972941000000001e-18,3.486471e-18,-4.881059e-17,1.3945880000000003e-17,-4.1837650000000003e-17,...,2.7020150000000003e-17,-2.004721e-16,0.052012,0.274779,0.206084,0.467125,0.612365,0.156035,0.07262,0.158979
std,1.000491,1.000491,1.000491,1.000491,1.000491,1.000491,1.000491,1.000491,1.000491,1.000491,...,1.000491,1.000491,0.22216,0.446622,0.40469,0.499163,0.48745,0.363067,0.25964,0.365836
min,-0.5017155,-1.12708,-0.6704556,-0.3295574,-0.09538664,-0.05586579,-0.03134196,-0.9714145,-1.120234,-0.4474769,...,-1.64386,-1.796273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.4287038,-0.3043763,-0.6704556,-0.3295574,-0.09538664,-0.05586579,-0.03134196,-0.5879002,-0.6489217,-0.4474769,...,-0.777582,-0.8934854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.3031221,-0.3043763,-0.4929254,-0.3295574,-0.09538664,-0.05586579,-0.03134196,-0.2043859,-0.1776094,-0.4474769,...,0.08869648,0.122151,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.06534836,0.5183278,0.4184266,-0.3295574,-0.09538664,-0.05586579,-0.03134196,0.2976692,0.7650154,-0.4474769,...,0.9549749,0.9120905,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
max,16.02348,16.14971,5.830248,8.413801,18.32209,24.31503,31.90611,8.316605,6.420764,2.234752,...,1.532494,1.589181,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [372]:
def process_dataframe(df_to_process):
    # Replace sols with 1 and nan with 0
    df_to_process["nature_culture"].replace({"sols": 1, np.nan: 0}, inplace=True)
    df_to_process['nature_culture'].unique()
    # Taking care of missing data
    from sklearn.impute import SimpleImputer
    imp_zeros = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
    nan_to_zeros = ['lot1_surface1', 'lot2_surface1', 'lot3_surface1', 'lot3_surface1', 'lot4_surface1', 'lot5_surface1', 'surface2', 'surface_terrain']
    df_to_process.loc[:, nan_to_zeros] = imp_zeros.fit_transform(df_to_process.loc[:, nan_to_zeros])
    df_to_process[:].isna().sum()
    # replace date
    df_to_process['année'] = pd.to_datetime(df_to_process['date']).dt.year
    df_to_process['mois'] = pd.to_datetime(df_to_process['date']).dt.month
    df_to_process['jour'] = pd.to_datetime(df_to_process['date']).dt.day
    df_to_process = df_to_process.drop(['date', 'id'], axis=1)
    # Encoding categorical data
    dummies = pd.get_dummies(df_to_process['ville'], prefix='ville')
    df_to_process[dummies.columns] = dummies
    df_to_process = df_to_process.drop(['ville'], axis=1)
    df_to_process.head()
    # Encoding categorical data
    dummies = pd.get_dummies(df_to_process['type'], prefix='type')
    df_to_process[dummies.columns] = dummies
    df_to_process = df_to_process.drop(['type'], axis=1)
    df_to_process.head()
    # scale
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    to_normalize = ['prix', 'nombre_lots', 'lot1_surface1', 'lot2_surface1', 'lot3_surface1', 'lot4_surface1', 'lot5_surface1', 'surface2', 'nombre_pieces', 'nature_culture', 'surface_terrain', 'année', 'mois', 'jour']
    df_to_process[to_normalize] = sc.fit_transform(df_to_process[to_normalize])
    # prédiction : 
    # y_pred = sc_y.inverse_transform(model.predict(sc_X.transform(np.array([[5.6]]))))
    df_to_process.describe()
    return df_to_process

In [373]:
df_eval_bis = process_dataframe(df_eval)

In [374]:
df_eval_bis[:].isna().sum()

prix                                             0
nombre_lots                                      0
lot1_surface1                                    0
lot2_surface1                                    0
lot3_surface1                                    0
lot4_surface1                                    0
lot5_surface1                                    0
surface2                                         0
nombre_pieces                                    0
nature_culture                                   0
surface_terrain                                  0
année                                            0
mois                                             0
jour                                             0
ville_A                                          0
ville_B                                          0
ville_C                                          0
ville_D                                          0
type_Appartement                                 0
type_Dépendance                

In [375]:
df_good.describe()

Unnamed: 0,prix,nombre_lots,lot1_surface1,lot2_surface1,lot3_surface1,lot4_surface1,lot5_surface1,surface2,nombre_pieces,nature_culture,...,mois,jour,ville_A,ville_B,ville_C,ville_D,type_Appartement,type_Dépendance,type_Local industriel. commercial ou assimilé,type_Maison
count,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,...,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0,1019.0
mean,6.275647e-17,-5.2297060000000006e-17,-2.396949e-17,6.275647e-17,2.7891770000000004e-17,6.972941000000001e-18,3.486471e-18,-4.881059e-17,1.3945880000000003e-17,-4.1837650000000003e-17,...,2.7020150000000003e-17,-2.004721e-16,0.052012,0.274779,0.206084,0.467125,0.612365,0.156035,0.07262,0.158979
std,1.000491,1.000491,1.000491,1.000491,1.000491,1.000491,1.000491,1.000491,1.000491,1.000491,...,1.000491,1.000491,0.22216,0.446622,0.40469,0.499163,0.48745,0.363067,0.25964,0.365836
min,-0.5017155,-1.12708,-0.6704556,-0.3295574,-0.09538664,-0.05586579,-0.03134196,-0.9714145,-1.120234,-0.4474769,...,-1.64386,-1.796273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.4287038,-0.3043763,-0.6704556,-0.3295574,-0.09538664,-0.05586579,-0.03134196,-0.5879002,-0.6489217,-0.4474769,...,-0.777582,-0.8934854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.3031221,-0.3043763,-0.4929254,-0.3295574,-0.09538664,-0.05586579,-0.03134196,-0.2043859,-0.1776094,-0.4474769,...,0.08869648,0.122151,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.06534836,0.5183278,0.4184266,-0.3295574,-0.09538664,-0.05586579,-0.03134196,0.2976692,0.7650154,-0.4474769,...,0.9549749,0.9120905,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
max,16.02348,16.14971,5.830248,8.413801,18.32209,24.31503,31.90611,8.316605,6.420764,2.234752,...,1.532494,1.589181,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [376]:
y = df_good['prix']
X = df_good.drop(['prix'], axis=1)

In [377]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [378]:
# Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
# Predicting the Test set results
y_pred = lin_reg.predict(X_test)
# Visualising the Training set results
print("train", lin_reg.score(X_train, y_train))
print("test", lin_reg.score(X_test, y_test))
print("MSE", mean_squared_error(y_pred, y_test))

train 0.6094145654409306
test 0.6010422812469354
MSE 0.6387006353951878


In [379]:
# Fitting SVR to the dataset
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
# Visualising the Training set results
print("train", lin_reg.score(X_train, y_train))
print("test", lin_reg.score(X_test, y_test))
print("MSE", mean_squared_error(y_pred, y_test))

train 0.6094145654409306
test 0.6010422812469354
MSE 1.0671170993733896


In [380]:
# Decision tree
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
# Visualising the Training set results
print("train", lin_reg.score(X_train, y_train))
print("test", lin_reg.score(X_test, y_test))
print("MSE", mean_squared_error(y_pred, y_test))

train 0.6094145654409306
test 0.6010422812469354
MSE 0.5926411552866548


In [381]:
# Random forest 
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
# Visualising the Training set results
print("train", lin_reg.score(X_train, y_train))
print("test", lin_reg.score(X_test, y_test))
print("MSE", mean_squared_error(y_pred, y_test))

train 0.6094145654409306
test 0.6010422812469354
MSE 0.4458085281824792


In [382]:
# On fait avec eval
X = df_good.drop(['prix'], axis=1)
y = df_eval_bis['prix']

In [383]:
y_pred = regressor.predict(X)

In [384]:
ecarts = (y_pred - y) ** 2

ValueError: operands could not be broadcast together with shapes (1019,) (1000,) 

In [259]:
idx = np.array(ecarts).argsort()[-50:][::-1]
idx

array([522, 610, 750, 997, 458, 929, 732, 665,  24, 578, 437, 504, 991,
       937, 664, 479, 947, 875, 374, 529, 521, 884, 440, 535, 622, 527,
       930, 370, 436, 655, 876, 434, 774, 649, 352, 911, 192, 475, 523,
       629, 418, 672, 915, 524, 694, 640, 900, 536, 872, 507], dtype=int64)

In [353]:
df_eval["fake"] = False
df.loc[idx,'fake'] = True
df_eval.head()

Unnamed: 0,id,date,prix,ville,nombre_lots,lot1_surface1,lot2_surface1,lot3_surface1,lot4_surface1,lot5_surface1,type,surface2,nombre_pieces,nature_culture,surface_terrain,année,mois,jour,fake
0,1020,2019-12-20,525000.0,A,0,0.0,0.0,0.0,0.0,0.0,Maison,77.0,4.0,1,455.0,2019,12,20,False
1,1021,2019-04-19,78000.0,A,0,0.0,0.0,0.0,0.0,0.0,Maison,57.0,3.0,1,806.0,2019,4,19,False
2,1022,2018-09-04,85000.0,A,0,0.0,0.0,0.0,0.0,0.0,Maison,72.0,3.0,1,459.0,2018,9,4,False
3,1023,2018-03-23,45000.0,A,0,0.0,0.0,0.0,0.0,0.0,Maison,98.0,5.0,1,260.0,2018,3,23,False
4,1024,2018-03-16,82500.0,A,0,0.0,0.0,0.0,0.0,0.0,Maison,70.0,4.0,1,160.0,2018,3,16,False


In [358]:
df_eval[df_eval["fake"] == True]

Unnamed: 0,id,date,prix,ville,nombre_lots,lot1_surface1,lot2_surface1,lot3_surface1,lot4_surface1,lot5_surface1,type,surface2,nombre_pieces,nature_culture,surface_terrain,année,mois,jour,fake
24,1044,2018-12-14,40000.0,A,1,198.0,0.0,0.0,0.0,0.0,Local industriel. commercial ou assimilé,237.0,0.0,0,0.0,2018,12,14,True
192,1212,2015-07-31,200000.0,B,1,121.91,0.0,0.0,0.0,0.0,Local industriel. commercial ou assimilé,789.0,0.0,0,0.0,2015,7,31,True
352,1372,2018-12-21,280000.0,C,1,102.5,0.0,0.0,0.0,0.0,Appartement,110.0,2.0,0,0.0,2018,12,21,True
370,1390,2019-10-30,475000.0,C,5,37.73,0.0,0.0,0.0,0.0,Appartement,200.0,2.0,0,0.0,2019,10,30,True
374,1394,2018-10-30,2600000.0,C,2,209.74,0.0,0.0,0.0,0.0,Appartement,208.0,6.0,0,0.0,2018,10,30,True
418,1438,2017-02-02,2145000.0,C,3,242.44,0.0,0.0,0.0,0.0,Appartement,250.0,6.0,0,0.0,2017,2,2,True
434,1454,2019-09-20,4275000.0,C,2,280.26,0.0,0.0,0.0,0.0,Local industriel. commercial ou assimilé,290.0,0.0,0,0.0,2019,9,20,True
436,1456,2019-06-20,1.0,C,1,117.84,0.0,0.0,0.0,0.0,Local industriel. commercial ou assimilé,151.0,0.0,0,0.0,2019,6,20,True
437,1457,2019-05-27,120000.0,C,2,0.0,0.0,0.0,0.0,0.0,Local industriel. commercial ou assimilé,328.0,0.0,0,0.0,2019,5,27,True
440,1460,2018-12-04,1538000.0,C,1,170.18,0.0,0.0,0.0,0.0,Local industriel. commercial ou assimilé,181.0,0.0,0,0.0,2018,12,4,True
