In [71]:
import pandas as pd
import statsmodels.api as sm
from math import dist
from sklearn.metrics import mean_squared_error

In [8]:
%run '/Users/louisebonhomme/Documents/GitHub/BDC/Helpers/preprocessing.py'

In [9]:
df = pd.read_csv('/Users/louisebonhomme/Documents/GitHub/BDC/Data/df_Paris_Processed.csv')

## Cleaning iris variables

In [10]:
clean_iris_codes(df)

In [11]:
missing_df = df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)']=(df.shape[0]-missing_df['missing values'])/df.shape[0]*100
missing_df['type'] = df.dtypes.to_numpy()
missing_df

Unnamed: 0,variable,missing values,filling factor (%),type
0,id_mutation,0,100.0,object
1,date_mutation,0,100.0,object
2,code_type_local,0,100.0,float64
3,code_commune,0,100.0,int64
4,surface_terrain,0,100.0,float64
5,surface_reelle_bati,0,100.0,float64
6,nombre_pieces_principales,0,100.0,float64
7,nature_culture,0,100.0,object
8,valeur_fonciere,0,100.0,float64
9,latitude,0,100.0,float64


In [12]:
path = "/Users/louisebonhomme/Documents/GitHub/BDC/Data/reference_IRIS_geo2022.xlsx"
dep = '75'
nb_iris(df, path = path, dep_code = dep)

Length of all Paris IRIS : 992
Length of IRIS in DF : 890
Number of IRIS not in DF : 102



## Process sur les données 
- ajout trimestres
- pour Paris : distance au métro le plus proche 
- Nombre d'appartements et de maisons par IRIS
- Nombre de biens en location ou occupés par propriétaire par IRIS
- split appartement / maison : appartement (prix/m2), maison (prix)
- split temporel

**Ajout trimestres**

In [13]:
annee_trimestre(df)

**IPL Q-1**

In [14]:
ipl = pd.read_csv('/Users/louisebonhomme/Documents/GitHub/BDC/Data/IPL.csv', sep = ';').iloc[3:,:2]
ipl.columns = ['date','IPL']
ipl["IPL"].astype('float64')
ipl['quarter'] = ipl['date'].apply(lambda x : x[:4]+'_Q'+x[-1])
ipl['IPL_{Q-1}'] = ipl['IPL'].shift(-1)
ipl.drop(['date'], axis = 1, inplace = True)

In [15]:
ipl

Unnamed: 0,IPL,quarter,IPL_{Q-1}
3,134.3,2022_Q3,130.8
4,130.8,2022_Q2,128.6
5,128.6,2022_Q1,127.5
6,127.5,2021_Q4,126.2
7,126.2,2021_Q3,122.5
8,122.5,2021_Q2,120.2
9,120.2,2021_Q1,119.3
10,119.3,2020_Q4,117.8
11,117.8,2020_Q3,115.5
12,115.5,2020_Q2,113.9


In [16]:
df = df.merge(ipl, on = 'quarter')

In [17]:
df[['IPL_{Q-1}']].isnull().sum(axis=0)

IPL_{Q-1}    0
dtype: int64

**Variables par IRIS**

In [20]:
logt_iris = pd.read_excel("/Users/louisebonhomme/Documents/GitHub/BDC/Data/base-ic-logement-2018.xlsx", header = 5)

In [22]:
logt_iris.columns

Index(['IRIS', 'REG', 'DEP', 'UU2020', 'COM', 'LIBCOM', 'TRIRIS', 'GRD_QUART',
       'LIBIRIS', 'TYP_IRIS', 'MODIF_IRIS', 'LAB_IRIS', 'P18_LOG', 'P18_RP',
       'P18_RSECOCC', 'P18_LOGVAC', 'P18_MAISON', 'P18_APPART', 'P18_MEN',
       'P18_PMEN', 'P18_RP_PROP', 'P18_RP_LOC', 'P18_RP_GARL'],
      dtype='object')

In [23]:
logt_iris = logt_iris[["IRIS", "P18_LOG", "P18_RP", "P18_RSECOCC", "P18_LOGVAC", "P18_MAISON", "P18_APPART", "P18_RP_PROP", "P18_RP_LOC"]]

In [28]:
logt_iris.columns = ["iris_code", "N_logements", "N_res_ppale", "N_res_second", "N_vacant", "N_maisons", 'N_apparts', "N_proprietaire", "N_locataire"]

In [34]:
logt_iris[['N_logements', 'N_res_ppale', "N_res_second", "N_vacant", "N_maisons", 'N_apparts', "N_proprietaire", "N_locataire"]] = round(logt_iris[['N_logements', 'N_res_ppale', "N_res_second", "N_vacant", "N_maisons", 'N_apparts', "N_proprietaire", "N_locataire"]])

In [36]:
df = df.merge(logt_iris, on = 'iris_code')

In [165]:
df['iris_code'].unique()

array(['751031102', '751083203', '751082903', '751010201', '751031202',
       '751010202', '751083005', '751010204', '751083002', '751030901',
       '751031204', '751083001', '751031104', '751031001', '751010203',
       '751010301', '751010401', '751030903', '751083004', '751083105',
       '751083106', '751083003', '751083104', '751083102', '751030905',
       '751083207', '751031103', '751083103', '751083201', '751031003',
       '751083209', '751031201', '751082901', '751031101', '751030904',
       '751010402', '751083202', '751083206', '751083205', '751083107',
       '751010303', '751031002', '751082902', '751082904', '751083204',
       '751010102', '751031004', '751010103', '751030902', '751031203',
       '751010101', '751103706', '751062305', '751051805', '751051804',
       '751072506', '751062203', '751072604', '751051902', '751062402',
       '751062308', '751051903', '751062101', '751051901', '751062103',
       '751072805', '751051801', '751072505', '751051704', '7510

**Ajout distance métro le plus proche**

**ATTENTION : FONCTIONNE QUE POUR PARIS!**

In [40]:
metros = pd.read_csv('/Users/louisebonhomme/Documents/GitHub/BDC/Data/df_metro_paris_spatial.csv')

In [42]:
metros['iris_code'] = metros['iris_code'].str[2:11]

In [184]:
df['Arrondissement'] = df['iris_code'].str[:5]
metros['Arrondissement'] = metros['iris_code'].str[:5]

In [276]:
def num_dist_metro(test) : 
    metro_iris = metros[metros['iris_code'] == test.iris_code]
    metro_arr = metros[metros['Arrondissement'] == test.Arrondissement]
    #print(metro_iris.shape[0])
    N_metros_iris = metro_iris.shape[0]
    N_metros_arr = metro_arr.shape[0]

    distances = []
    
    if N_metros_iris > 0: 
        for i in range(metro_iris.shape[0]) : 
            latitude_metro = metro_iris['Latitude'].iloc[i]
            longitude_metro = metro_iris['Longitude'].iloc[i]
            point_metro = [latitude_metro, longitude_metro]
            point_bien = [test.latitude, test.longitude]
            distances.append(dist(point_bien, point_metro))
    else :
        for i in range(metro_arr.shape[0]) : 
        
            latitude_metro = metro_arr['Latitude'].iloc[i]
            longitude_metro = metro_arr['Longitude'].iloc[i]
            point_metro = [latitude_metro, longitude_metro]
            point_bien = [test.latitude, test.longitude]
            distances.append(dist(point_bien, point_metro))
    dist_metro = np.min(distances)
    
    return(N_metros_iris, distances)

In [278]:
for i in range(metro_arr.shape[0]) : 
    latitude_metro = metro_arr['Latitude'].iloc[i]
    longitude_metro = metro_arr['Longitude'].iloc[i]
    point_metro = [latitude_metro, longitude_metro]
    point_bien = [test.latitude, test.longitude]
    distances.append(dist(point_bien, point_metro))
dist_metro = np.min(distances)

In [285]:
N_metros_list = []
N_metros_list_arr = []
dist_metros_list = []
error = []
for i in range(df.shape[0]):
    logt = df[['latitude', 'longitude', 'iris_code', 'Arrondissement']].iloc[i,:]
    N_metros_list.append(num_dist_metro(logt)[0])
    dist_metros_list.append(num_dist_metro(logt)[1])


KeyboardInterrupt



In [None]:
df['n_metros_iris'] = N_metros_list
df['dist_metro'] = dist_metros_list

In [None]:
df.head()

**Split appart/maison**

In [17]:
df_maison , df_appart = split_appart_maison(df)

Nombre de maisons : 493
Nombre d'apparts : 144048


### Split données 

There are 20 quarters 

In [242]:
missing_df = df_appart.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)']=(df_appart.shape[0]-missing_df['missing values'])/df_appart.shape[0]*100
missing_df['type'] = df_appart.dtypes.to_numpy()
missing_df

Unnamed: 0,variable,missing values,filling factor (%),type
0,id_mutation,0,100.0,object
1,date_mutation,0,100.0,datetime64[ns]
2,code_type_local,0,100.0,float64
3,code_commune,0,100.0,int64
4,surface_terrain,0,100.0,float64
5,surface_reelle_bati,0,100.0,float64
6,nombre_pieces_principales,0,100.0,float64
7,nature_culture,0,100.0,object
8,valeur_fonciere,0,100.0,float64
9,latitude,0,100.0,float64


In [243]:
df_train = df_appart[df_appart['date_mutation'] < '2021-04-01']
df_test = df_appart[df_appart['date_mutation'] >= '2021-04-01']

print(f'Train size : {round(df_train.shape[0]/df.shape[0], 2)*100}%')
print(f'Test size : {round(df_test.shape[0]/df.shape[0], 2)*100}%')

Train size : 73.0%
Test size : 26.0%


**Régression OLS**

Variables utilisées : 
- Y = prix/m2
- surface_terrain ?
- surface_reelle_bati
- nombre_pieces_principales
- ajouter Y_t-1 ? 

In [248]:
Features = ['surface_terrain', 'surface_reelle_bati', 'nombre_pieces_principales', 'IPL_{Q-1}']
Label = 'Prix_m2'

Y_train = df_train[Label]
X_train = df_train[Features]

Y_test = df_test[Label]
X_test = df_test[Features]

In [249]:
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

model = sm.OLS(Y_train, X_train).fit()

In [250]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                Prix_m2   R-squared:                       0.064
Model:                            OLS   Adj. R-squared:                  0.064
Method:                 Least Squares   F-statistic:                     1814.
Date:                Tue, 07 Mar 2023   Prob (F-statistic):               0.00
Time:                        11:42:46   Log-Likelihood:            -1.4656e+05
No. Observations:              106171   AIC:                         2.931e+05
Df Residuals:                  106166   BIC:                         2.932e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

In [254]:
mean_squared_error(Y_test, model.predict(X_test))

1.1070087458229645