In [2]:
%matplotlib inline
import geopandas as gpd
import pandas as pd
import random
from shapely.geometry import Point
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [4]:
crs = {'init': 'epsg:4326'}


In [5]:
## college & universities
# load data
college = 'C:/Users/cleme/Documents/ENSAE/2A/S1/Python/Projet/Donnees/original_collegeanduniversity.shp'
df_college = gpd.read_file(college).to_crs(crs)
#build column based of the geo points
col_college = gpd.GeoSeries(df_college.geometry)
col_college.head()


0    POINT (-73.99465215457163 40.73519616365903)
1    POINT (-73.99706966379965 40.73546280987431)
2    POINT (-74.00681944352681 40.72344185905749)
3    POINT (-73.79439300079635 40.73944287003665)
4    POINT (-73.84872054010768 40.72137188695677)
Name: geometry, dtype: object

In [7]:

## health facilities
#load data
hospital = 'C:/Users/cleme/Documents/ENSAE/2A/S1/Python/Projet/Donnees/original_hospitals.csv'
df_hospital = pd.read_csv(hospital, sep=";")
#extract long and lat from location 1 column in the df 
df_hospital['position'] = df_hospital['Location 1'].str.replace(r'[^(]*\(|\)[^)]*', '')
df_hospital['newlongitude'] = df_hospital['position'].str.replace(r'[^,]*\,', '')
df_hospital['newlatitude']= df_hospital['position'].str.replace(r'\,[^,]*', '')
#convert them as a float and then as geo points to build the specific column
df_hospital['newlatitude'] = df_hospital['newlatitude'].astype(float)
df_hospital['newlongitude'] = df_hospital['newlongitude'].astype(float)
df_hospital['geometry'] = gpd.GeoSeries([Point(xy) for xy in zip(df_hospital.newlongitude, df_hospital.newlatitude)])
geodf_hospital = gpd.GeoDataFrame(df_hospital,crs=crs)
geodf_hospital = geodf_hospital.set_geometry("geometry")

In [8]:

##Points of Interest
#load data
PoI = 'C:/Users/cleme/Documents/ENSAE/2A/S1/Python/Projet/Donnees/original_PointOfInterest.shp'
df_PoI = gpd.read_file(PoI).to_crs(crs)

In [9]:
 #recensement des abréviations
ab_religion = pd.Series(['religion','CATHEDRAL','SYNAGOGUE','EPISCOPAL', 'TEMPLE', 'CHUR','TMPL','TABERNACLE',' CH ', 'CATHDL', 'CHAPEL','CONGREGAT','CONGEGRATION', 'EVANGICAL LUTH','ISRAEL'])
ab_consulate = pd.Series(['consulate','CONSULATE'])
ab_park= pd.Series(['park','PARK','PLGD','PLAYGROUND', 'PLAYGRND', 'RECREATION', 'SQUARE', 'GARDEN'])
ab_theatre = pd.Series(['theatre','THEATRE', 'THTR'])
ab_school = pd.Series(['school','HS','HIGH SCHOOL', 'SCHL', ' SC ','SCHOOL','ACADEMY'])
ab_library = pd.Series(['library','LIBRARY'])
ab_daycare = pd.Series(['daycare','DAY CARE','DAYCARE','NURSERY','NURSING','NURSIG'])
ab_cemetery = pd.Series (['cemetry','CEMETERY','CMTRY'])


In [10]:
#on vire les nan de la colonne name car impossible de les catégoriser
df_PoI=df_PoI.dropna(subset=['name'])
df_PoI.reset_index(drop=True, inplace=True)

In [11]:
# Création des dummy variables
j=0
df_PoI['dum_religion']=0
df_PoI['dum_consulate']=0
df_PoI['dum_park']=0
df_PoI['dum_theatre']=0
df_PoI['dum_school']=0
df_PoI['dum_library']=0
df_PoI['dum_daycare']=0
df_PoI['dum_cemetery']=0

In [12]:

#we create dummy variables to classify the points of interest


while j < (df_PoI.shape[0] - 1) :
    i=0
    while ( i < ab_religion.size and df_PoI.loc[j,'dum_religion']==0 ):
            if ab_religion.loc[i] in df_PoI.loc[j,'name']:
                df_PoI.set_value(j,'dum_religion', 1)
 #         print('On recherche ', i,' dans ', j, '. Limit is ', ab_religion.size)
            i+=1    
     
    i=0
    while ( i < ab_consulate.size and df_PoI.loc[j,'dum_consulate']==0 and df_PoI.loc[j,'dum_religion']==0 ):
            if ab_consulate.loc[i] in df_PoI.loc[j,'name']:
                df_PoI.set_value(j,'dum_consulate', 1)
#          print('On recherche ', i,' dans ', j, '. Limit is ', ab_consulate.size)
            i+=1           
          
    i=0 
    while ( i < ab_park.size and df_PoI.loc[j,'dum_park']==0and df_PoI.loc[j,'dum_consulate']==0  and df_PoI.loc[j,'dum_religion']==0 ):
            if ab_park.loc[i] in df_PoI.loc[j,'name']:
                df_PoI.set_value(j,'dum_park', 1)
 #         print('On recherche ', i,' dans ', j, '. Limit is ', ab_park.size)
            i+=1
    
    i=0 
    while ( i < ab_theatre.size and df_PoI.loc[j,'dum_theatre']==0 and df_PoI.loc[j,'dum_park']==0 and df_PoI.loc[j,'dum_consulate']==0 and df_PoI.loc[j,'dum_religion']==0 ):
          if ab_theatre.loc[i] in df_PoI.loc[j,'name']:
                df_PoI.set_value(j,'dum_theatre', 1)
#          print('On recherche ', i,' dans ', j, '. Limit is ', ab_theatre.size)
          i+=1
          
    i=0
    while ( i < ab_school.size and df_PoI.loc[j,'dum_school']==0 and df_PoI.loc[j,'dum_theatre']==0and df_PoI.loc[j,'dum_park']==0 and df_PoI.loc[j,'dum_consulate']==0 and df_PoI.loc[j,'dum_religion']==0 ):
          if ab_school.loc[i] in df_PoI.loc[j,'name']:
                df_PoI.set_value(j,'dum_school', 1)
#          print('On recherche ', i,' dans ', j, '. Limit is ', ab_school.size)
          i+=1


    i=0 
    while ( i < ab_library.size and df_PoI.loc[j,'dum_library']==0 and df_PoI.loc[j,'dum_school']==0 and df_PoI.loc[j,'dum_theatre']==0and df_PoI.loc[j,'dum_park']==0and df_PoI.loc[j,'dum_consulate']==0 and df_PoI.loc[j,'dum_religion']==0 ):
          if ab_library.loc[i] in df_PoI.loc[j,'name']:
                df_PoI.set_value(j,'dum_library', 1)
#          print('On recherche ', i,' dans ', j, '. Limit is ', ab_library.size)
          i+=1
          
    i=0 
    while ( i < ab_daycare.size and df_PoI.loc[j,'dum_daycare']==0 and df_PoI.loc[j,'dum_library']==0 and df_PoI.loc[j,'dum_school']==0 and df_PoI.loc[j,'dum_theatre']==0 and df_PoI.loc[j,'dum_park']==0 and df_PoI.loc[j,'dum_consulate']==0 and df_PoI.loc[j,'dum_religion']==0 ):
        if ab_daycare.loc[i] in df_PoI.loc[j,'name']:
            df_PoI.set_value(j,'dum_daycare', 1)
#          print('On recherche ', i,' dans ', j, '. Limit is ', ab_daycare.size)
        i+=1          
          
          
    i=0 
    while ( i < ab_cemetery.size and df_PoI.loc[j,'dum_cemetery']==0 and df_PoI.loc[j,'dum_daycare']==0 and df_PoI.loc[j,'dum_library']==0 and df_PoI.loc[j,'dum_school']==0 and df_PoI.loc[j,'dum_theatre']==0and df_PoI.loc[j,'dum_park']==0and df_PoI.loc[j,'dum_consulate']==0 and df_PoI.loc[j,'dum_religion']==0 ):
        if ab_cemetery.loc[i] in df_PoI.loc[j,'name']:
            df_PoI.set_value(j,'dum_cemetery', 1)
#          print('On recherche ', i,' dans ', j, '. Limit is ', ab_cemetery.size)
        i+=1          
          
    j+=1
     
print("sortie de la boucle")

sortie de la boucle


In [13]:
#lieux de culte
df_religion=df_PoI[df_PoI.dum_religion!=0]
df_religion.reset_index(drop=True, inplace=True)
col_religion = gpd.GeoSeries(df_religion.geometry)
col_religion.head()

0    POINT (-73.93634786975662 40.68091615244033)
1    POINT (-73.93569488141881 40.75255340482011)
2    POINT (-73.91828483362288 40.65080302570205)
3    POINT (-74.00674292455398 40.62742678200225)
4    POINT (-73.94597769590463 40.80647201947438)
Name: geometry, dtype: object

In [14]:
#consulate
df_consulate=df_PoI[df_PoI.dum_consulate!=0]
df_consulate.reset_index(drop=True, inplace=True)

In [16]:
#park
df_park=df_PoI[df_PoI.dum_park!=0]
df_park.reset_index(drop=True, inplace=True)

In [17]:
#theatre
df_theatre=df_PoI[df_PoI.dum_theatre!=0]
df_theatre.reset_index(drop=True, inplace=True)

In [18]:
#school
df_school=df_PoI[df_PoI.dum_school!=0]
df_school.reset_index(drop=True, inplace=True)

In [19]:
#library
df_library=df_PoI[df_PoI.dum_library!=0]
df_library.reset_index(drop=True, inplace=True)

In [20]:
#daycare
df_daycare=df_PoI[df_PoI.dum_daycare!=0]
df_daycare.reset_index(drop=True, inplace=True)

In [21]:
#cemetery
df_cemetery=df_PoI[df_PoI.dum_cemetery!=0]
df_cemetery.reset_index(drop=True, inplace=True)

In [22]:
# on utilise la base de données des taxi dispo sur le site de NYC
data = 'C:/Users/cleme/Documents/ENSAE/2A/S1/Python/Projet/Donnees/yellow_tripdata_2016-06.csv'

# !!! on prend un échantillon de 100 observations pour commencer
df = pd.read_csv(data, sep=',')
df.head()

#on ne garde que les courses payées en carte bleue pour avoir celles où le pourboire apparait
df = df[df.payment_type==1]
df = df[(df['pickup_longitude'] < -73.7) & (df['pickup_longitude'] > -74.1) & (df['pickup_latitude'] > 40.4) & (df['pickup_latitude'] < 51)]

df.reset_index(drop=True, inplace=True)


In [23]:
random.seed(35)
# assez lent, !!! on crée df0 = les 100 premières obs de la grosse base
df0 = df.sample(100).reset_index()
#on crée un identifiant unique par course
df0['Id_course']= df0.index
df0.head()
df0['pts'] = gpd.GeoSeries([Point(xy) for xy in zip(df0.pickup_longitude, df0.pickup_latitude)])
df0['circles'] = gpd.GeoSeries(df0.pts).buffer(0.005)
df0=gpd.GeoDataFrame(df0).set_geometry('circles', crs = crs)

df0.head()

# le problème c'est qu'on a quelques valeurs très éloignées de la zone, il faut absolument les supprimer
# si on ne les supprime pas (essayer avec les 100 premières obs), le plot (les boroughs de NYC) et le subplot
# (les cercles qu'on a dessiné) ne sont plus tout à la même échelle
# = > on sélectionne les observations dont les pickup coordinates ne sortent pas de la zone de NYC


Unnamed: 0,index,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,...,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,Id_course,pts,circles
0,4022580,2,2016-06-17 09:18:39,2016-06-17 09:25:56,1,1.27,-73.982132,40.771759,1,N,...,7.0,0.0,0.5,1.56,0.0,0.3,9.36,0,POINT (-73.98213195800781 40.77175903320313),POLYGON ((-73.97713195800782 40.77175903320313...
1,46413,1,2016-06-09 23:40:54,2016-06-09 23:56:33,1,3.2,-73.994591,40.750271,1,N,...,13.5,0.5,0.5,2.95,0.0,0.3,17.75,1,POINT (-73.99459075927734 40.75027084350585),POLYGON ((-73.98959075927735 40.75027084350585...
2,6822078,2,2016-06-20 08:39:35,2016-06-20 08:47:30,1,1.51,-74.014702,40.710003,1,N,...,7.5,0.0,0.5,1.66,0.0,0.3,9.96,2,POINT (-74.01470184326173 40.71000289916992),POLYGON ((-74.00970184326174 40.71000289916992...
3,7181837,1,2016-06-21 19:16:55,2016-06-21 19:27:35,1,2.4,-73.985298,40.727509,1,N,...,10.0,1.0,0.5,2.95,0.0,0.3,14.75,3,POINT (-73.98529815673827 40.72750854492188),POLYGON ((-73.98029815673827 40.72750854492188...
4,1303480,2,2016-06-05 16:05:52,2016-06-05 16:20:07,5,2.63,-73.979683,40.760971,1,N,...,12.0,0.0,0.5,2.56,0.0,0.3,15.36,4,POINT (-73.97968292236328 40.76097106933594),POLYGON ((-73.97468292236329 40.76097106933594...


In [80]:
# college
nb_coll_circle = gpd.sjoin(df_college, df0.copy(),op='within')
count_coll = nb_coll_circle.groupby('Id_course').size()
count_coll.name="nb_college"

#hospitals
nb_hosp_circle = gpd.sjoin(geodf_hospital, df0.copy(),op='within')
count_hosp = nb_hosp_circle.groupby('Id_course').size()
count_hosp.name="nb_hospital"


#religion
nb_rel_circle = gpd.sjoin(df_religion, df0.copy(),op='within')
count_rel = nb_rel_circle.groupby('Id_course').size()
count_rel.name="nb_religion"

#consulate
nb_cons_circle = gpd.sjoin(df_consulate, df0.copy(),op='within')
count_cons = nb_cons_circle.groupby('Id_course').size()
count_cons.name="nb_consulate"

#park
nb_park_circle = gpd.sjoin(df_park, df0.copy(),op='within')
count_park = nb_park_circle.groupby('Id_course').size()
count_park.name="nb_park"

#theatre
nb_th_circle = gpd.sjoin(df_theatre, df0.copy(),op='within')
count_th = nb_th_circle.groupby('Id_course').size()
count_th.name="nb_theatre"

#school
nb_sc_circle = gpd.sjoin(df_school, df0.copy(),op='within')
count_sc = nb_sc_circle.groupby('Id_course').size()
count_sc.name="nb_school"

#library
nb_lib_circle = gpd.sjoin(df_library, df0.copy(),op='within')
count_lib = nb_lib_circle.groupby('Id_course').size()
count_lib.name="nb_library"

#daycare
nb_dc_circle = gpd.sjoin(df_daycare, df0.copy(),op='within')
count_dc = nb_dc_circle.groupby('Id_course').size()
count_dc.name="nb_daycare"

#cemetery
nb_cem_circle = gpd.sjoin(df_cemetery, df0.copy(),op='within')
count_cem = nb_cem_circle.groupby('Id_course').size()
count_cem.name="nb_cemetery"


df0_count = pd.concat([df0, count_coll,count_hosp,count_rel,count_cons,count_park,count_th,count_sc,count_lib ,count_dc,count_cem], axis=1)
df0_count[['nb_college','nb_hospital','nb_religion','nb_consulate','nb_park','nb_theatre','nb_school','nb_library','nb_daycare','nb_cemetery']]=df0_count[['nb_college','nb_hospital','nb_religion','nb_consulate','nb_park','nb_theatre','nb_school','nb_library','nb_daycare','nb_cemetery']].fillna(0)

df0_count.head()
df0_count.columns


Index(['index', 'VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RatecodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'Id_course', 'pts', 'circles',
       'nb_college', 'nb_hospital', 'nb_religion', 'nb_consulate', 'nb_park',
       'nb_theatre', 'nb_school', 'nb_library', 'nb_daycare', 'nb_cemetery'],
      dtype='object')

# On commence la régression !!!

In [85]:
# on divise en train et en test
f=df0_count.columns.get_loc("passenger_count")
i=df0_count.columns.get_loc("payment_type")
j=df0_count.columns.get_loc("extra")
k=df0_count.columns.get_loc("nb_college")

l=list(range(f,f+2))+list(range (i+1,j+1))+list(range(k,df0_count.shape[1]))

X, y = df0_count[df0_count.columns[l]], df0_count['tip_amount']

# On peut utiliser random_state=42 (par exemple) en option pour ne pas dependre du seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

X.columns

Index(['passenger_count', 'trip_distance', 'fare_amount', 'extra',
       'nb_college', 'nb_hospital', 'nb_religion', 'nb_consulate', 'nb_park',
       'nb_theatre', 'nb_school', 'nb_library', 'nb_daycare', 'nb_cemetery'],
      dtype='object')

In [86]:
# On emprunte le code sur le site de scikit-learn et on l'adapte a notre probleme

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients: 
 [-0.16643366  0.0223585   0.10754155  1.07296331 -0.58471087  0.35527052
 -0.02253912 -0.01892997 -0.00589602  0.00596273  0.12693219  0.42599131
 -0.38104716 -0.58936733]
Mean squared error: 4.61
Variance score: 0.56
