In [15]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import pickle as pk

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "iframe_connected"

In [2]:
#import libraries
%matplotlib notebook
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [9]:
df_apr = pd.read_csv('uber-raw-data-apr14.csv')
df_may = pd.read_csv('uber-raw-data-may14.csv')
df_jun = pd.read_csv('uber-raw-data-jun14.csv')
df_jul = pd.read_csv('uber-raw-data-jul14.csv')
df_aug = pd.read_csv('uber-raw-data-aug14.csv')
df_sep = pd.read_csv('uber-raw-data-sep14.csv')

In [10]:
df_def = pd.concat([df_apr, df_aug, df_jul, df_jun, df_may, df_sep])

In [11]:
df_def.columns = [x.lower().replace('/', '_') for x in df_def.columns]

In [12]:
df_def.date_time = pd.to_datetime(df_def['date_time'])

In [13]:
new_dates, new_times = zip(*[(d.date(), d.time()) for d in df_def['date_time']])
df_def = df_def.assign(new_date=new_dates, new_time=new_times)
df_def['hour'] = df_def.date_time.dt.hour
df_def['day'] = df_def.date_time.dt.dayofweek

In [14]:
df_def = df_def.drop(columns=["date_time"])

DROP OUTLIER

In [15]:
for i in ['lat', 'lon',]:
    df_def = df_def[~(np.abs(df_def[i]-df_def[i].mean()) > (3*df_def[i].std()))]

DETERMINATION DES COLONNES ET STANDARDSCALER

# data test / samedi à 20h

In [54]:
data_test = df_def.loc[(df_def['hour'] == 20) & (df_def['day'] == 5)]

In [55]:
data_test = data_test.drop(columns=['base', 'new_date', 'new_time'])

In [56]:
data_test.head()

Unnamed: 0,lat,lon,hour,day
6592,40.7735,-73.9599,20,5
6593,40.711,-74.0056,20,5
6594,40.7707,-73.962,20,5
6595,40.7154,-74.0076,20,5
6596,40.7644,-73.9719,20,5


In [57]:
from sklearn.preprocessing import StandardScaler

In [58]:
# Création du pipeline pour les variables quantitatives
numeric_features = [0,1,2,3] # Positions des colonnes quantitatives dans X
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()) # pour normaliser les variables
])

In [59]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)     
    ])

In [60]:
# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(data_test.head())
X = preprocessor.fit_transform(data_test) # fit_transform !!
print('...Terminé.')
print(X[0:5, :])
print()

Preprocessing sur le train set...
          lat      lon  hour  day
6592  40.7735 -73.9599    20    5
6593  40.7110 -74.0056    20    5
6594  40.7707 -73.9620    20    5
6595  40.7154 -74.0076    20    5
6596  40.7644 -73.9719    20    5
...Terminé.
[[ 1.11333206  0.569842    0.          0.        ]
 [-0.74013632 -0.89110202  0.          0.        ]
 [ 1.03029667  0.50270891  0.          0.        ]
 [-0.60965215 -0.9550383   0.          0.        ]
 [ 0.84346706  0.18622432  0.          0.        ]]



In [64]:
# Utilisation de la méthode Elbow pour trouver le nombre optimal de clusters

wcss =  []
for i in range (2,30): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
print(wcss)

[47509.21326006424, 33092.94175988448, 22789.838570348627, 18749.188781460165, 15078.289619162306, 13465.147617578445, 12124.940949001513, 10929.709777738177, 10041.98419050188, 9179.873387445996, 8419.889689865091, 7889.524446157036, 7287.7877207698475, 6730.590328745163, 6252.643197065263, 5791.5090578946065, 5509.390707836174, 5118.730490887652, 4795.322214106702, 4591.953727780691, 4336.792803114304, 4189.611313999566, 3975.0636953452117, 3819.912564065936, 3618.6113740129845, 3462.79882172498, 3309.6911769720095, 3198.1894690813]


In [65]:
fig = px.line(x = range(2,30), y = wcss)
fig.show()

In [66]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score = []
for i in range (2,30): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X)
    s_score.append(silhouette_score(X, kmeans.predict(X)))

print(s_score)

[0.38671882174481775, 0.4271014316019649, 0.43387934361013825, 0.3858580182155104, 0.4095142392206252, 0.3493473039071469, 0.3545664958553138, 0.3602038408510312, 0.36253455114100375, 0.3654192376521782, 0.37279071410612297, 0.37783447943860937, 0.3769384583441521, 0.3788915778412327, 0.3849048729713969, 0.3882984895006331, 0.3931876242246231, 0.4033821637947759, 0.37694043105481956, 0.4101733694310996, 0.3874607751627536, 0.4117022332354219, 0.41134927646740227, 0.3865576398004663, 0.3978638153236354, 0.3887585068041197, 0.394552156461141, 0.391796667942741]


In [67]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,30), y = s_score)
fig.show(renderer="iframe")

In [70]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 4)
kmeans.fit(X)

KMeans(n_clusters=4)

In [71]:
data_test.loc[:,'Cluster_KMeans'] = kmeans.predict(X)
data_test.head()

Unnamed: 0,lat,lon,hour,day,Cluster_KMeans
6592,40.7735,-73.9599,20,5,2
6593,40.711,-74.0056,20,5,3
6594,40.7707,-73.962,20,5,2
6595,40.7154,-74.0076,20,5,3
6596,40.7644,-73.9719,20,5,2


In [72]:
fig = px.scatter_mapbox(data_test[(data_test.Cluster_KMeans >=0)],
        lat="lat", 
        lon="lon",
        color="Cluster_KMeans",
        mapbox_style = "carto-positron"
)

fig.show()

LUNDI TOUTES LES HEURES

In [73]:
df_lundi = df_def.loc[(df_def['hour'] >=0 ) & (df_def['day'] == 0)]

In [74]:
df_lundi = df_lundi.drop(columns=['base', 'new_date', 'new_time'])

In [75]:
# Création du pipeline pour les variables quantitatives
numeric_features = [0,1,2,3] # Positions des colonnes quantitatives dans X
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()) # pour normaliser les variables
])

In [76]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)     
    ])

In [77]:
# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(df_lundi.head())
X_lundi = preprocessor.fit_transform(df_lundi) # fit_transform !!
print('...Terminé.')
print(X_lundi[0:5, :])
print()

Preprocessing sur le train set...
          lat      lon  hour  day
7785  40.7205 -73.9939     0    0
7786  40.7407 -74.0077     0    0
7787  40.7591 -73.9892     0    0
7788  40.7419 -74.0034     0    0
7789  40.7419 -74.0034     1    0
...Terminé.
[[-0.73576865 -0.5364991  -2.43883748  0.        ]
 [-0.07787508 -0.96454444 -2.43883748  0.        ]
 [ 0.52139432 -0.39071554 -2.43883748  0.        ]
 [-0.03879229 -0.83116799 -2.43883748  0.        ]
 [-0.03879229 -0.83116799 -2.26114925  0.        ]]



In [78]:
# Utilisation de la méthode Elbow pour trouver le nombre optimal de clusters

wcss =  []
for i in range (2,30): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X_lundi)
    wcss.append(kmeans.inertia_)
    
print(wcss)

[1129794.722531321, 829254.9941951112, 653805.5557519038, 536708.5860245911, 454004.5166821128, 410959.32885111513, 377628.7598536638, 349368.0707424444, 328461.8903473841, 310448.9363816328, 291845.17809982796, 277257.0325873496, 262646.13290646777, 249152.0935165029, 239890.5963873708, 231079.97110542285, 221172.78478469982, 214805.39542712696, 208550.56980617333, 202128.83254001045, 197227.88718442444, 189912.32496648072, 186579.6905785141, 180956.9490369369, 176424.93992059893, 172915.75946319706, 168928.66869061335, 164787.97734061943]


In [79]:
fig = px.line(x = range(2,30), y = wcss)
fig.show()

In [None]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score = []
for i in range (2,30): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X_lundi)
    s_score.append(silhouette_score(X_lundi, kmeans.predict(X_lundi)))

print(s_score)

In [None]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,30), y = s_score)
fig.show(renderer="iframe")

In [81]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 6)
kmeans.fit(X_lundi)

KMeans(n_clusters=6)

In [82]:
df_lundi.loc[:,'Cluster_KMeans'] = kmeans.predict(X_lundi)
df_lundi.head()

Unnamed: 0,lat,lon,hour,day,Cluster_KMeans
7785,40.7205,-73.9939,0,0,2
7786,40.7407,-74.0077,0,0,2
7787,40.7591,-73.9892,0,0,2
7788,40.7419,-74.0034,0,0,2
7789,40.7419,-74.0034,1,0,2


In [83]:
fig = px.scatter_mapbox(df_lundi[(df_lundi.Cluster_KMeans >=0)], 
        lat="lat", 
        lon="lon",
        hover_name = 'hour',                
        color="Cluster_KMeans",
        mapbox_style="carto-positron",
        animation_frame="hour"
)
fig.show()

In [None]:
##################################################################################################

MARDI TOUTES LES HEURES

In [42]:
df_mardi = df_def.loc[(df_def['hour'] >=0 ) & (df_def['day'] == 0)]

In [43]:
df_mardi = df_mardi.drop(columns=['base', 'new_date', 'new_time'])

In [44]:
# Création du pipeline pour les variables quantitatives
numeric_features = [0,1,2,3] # Positions des colonnes quantitatives dans X
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()) # pour normaliser les variables
])

In [45]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)     
    ])

In [46]:
# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(df_mardi.head())
X_mardi = preprocessor.fit_transform(df_mardi) # fit_transform !!
print('...Terminé.')
print(X_mardi[0:5, :])
print()

Preprocessing sur le train set...
          lat      lon  hour  day
7785  40.7205 -73.9939     0    0
7786  40.7407 -74.0077     0    0
7787  40.7591 -73.9892     0    0
7788  40.7419 -74.0034     0    0
7789  40.7419 -74.0034     1    0
...Terminé.
[[-0.73576865 -0.5364991  -2.43883748  0.        ]
 [-0.07787508 -0.96454444 -2.43883748  0.        ]
 [ 0.52139432 -0.39071554 -2.43883748  0.        ]
 [-0.03879229 -0.83116799 -2.43883748  0.        ]
 [-0.03879229 -0.83116799 -2.26114925  0.        ]]



In [47]:
# Utilisation de la méthode Elbow pour trouver le nombre optimal de clusters

wcss =  []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X_mardi)
    wcss.append(kmeans.inertia_)
    
print(wcss)

[1129802.447903874, 829255.1344387186, 653791.8343781393, 536676.3498880836, 454002.9935050368, 410962.94961122214, 377627.13438831596, 349135.85133179097, 331891.24593300757]


In [48]:
fig = px.line(x = range(2,11), y = wcss)
fig.show()

In [None]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score = []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X_mardi)
    s_score.append(silhouette_score(X_mardi, kmeans.predict(X_mardi)))

print(s_score)

In [None]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,11), y = s_score)
fig.show(renderer="iframe")

In [49]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 3)
kmeans.fit(X_mardi)

KMeans(n_clusters=3)

In [50]:
df_mardi.loc[:,'Cluster_KMeans'] = kmeans.predict(X_mardi)
df_mardi.head()

Unnamed: 0,lat,lon,hour,day,Cluster_KMeans
7785,40.7205,-73.9939,0,0,1
7786,40.7407,-74.0077,0,0,1
7787,40.7591,-73.9892,0,0,1
7788,40.7419,-74.0034,0,0,1
7789,40.7419,-74.0034,1,0,1


In [51]:
fig = px.scatter_mapbox(df_mardi[(df_mardi.Cluster_KMeans >=-1)], 
        lat="lat", 
        lon="lon",
        hover_name = 'hour',                
        color="Cluster_KMeans",
        mapbox_style="carto-positron",
        animation_frame="hour"
)
fig.show()

In [None]:
##########################################################################

MERCREDI TOUTES LES HEURES

In [66]:
df_mercredi = df_def.loc[(df_def['hour'] >=0 ) & (df_def['day'] == 2)]

In [67]:
df_mercredi = df_mercredi.drop(columns=['base', 'new_date', 'new_time'])

In [68]:
# Création du pipeline pour les variables quantitatives
numeric_features = [0,1,2,3] # Positions des colonnes quantitatives dans X
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()) # pour normaliser les variables
])

In [69]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)     
    ])

In [70]:
# Preprocessings sur le dataset
print("Preprocessing sur le train set...")
print(df_mercredi.head())
X_mercredi = preprocessor.fit_transform(df_mercredi) # fit_transform !!
print('...Terminé.')
print(X_mercredi[0:5, :])
print()

Preprocessing sur le train set...
          lat      lon  hour  day
1011  40.7458 -73.9843     0    2
1012  40.7285 -74.0467     0    2
1013  40.7732 -73.9546     0    2
1014  40.6550 -73.9786     0    2
1015  40.7405 -74.0040     0    2
...Terminé.
[[ 0.09234206 -0.16939352 -2.60553226  0.        ]
 [-0.50584175 -2.34341013 -2.60553226  0.        ]
 [ 1.03975456  0.86535476 -2.60553226  0.        ]
 [-3.0472585   0.02919453 -2.60553226  0.        ]
 [-0.09091657 -0.85574171 -2.60553226  0.        ]]



In [71]:
# Utilisation de la méthode Elbow pour trouver le nombre optimal de clusters

wcss =  []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X_mercredi)
    wcss.append(kmeans.inertia_)
    
print(wcss)

[1488650.8613862246, 1121449.103733254, 883105.4345422259, 726807.8028821521, 611639.6355937524, 556697.4886511853, 508521.34459461254, 467813.11829976604, 435785.74665506394]


In [72]:
fig = px.line(x = range(2,11), y = wcss)
fig.show()

In [None]:
# Utilisation du silhouette_score pour déterminer le nombre optimal de clusters
s_score = []
for i in range (2,11): 
    kmeans = KMeans(n_clusters= i)
    kmeans.fit(X_mercredi)
    s_score.append(silhouette_score(X_mercredi, kmeans.predict(X_mercredi)))

print(s_score)

In [None]:
# Affichage de scores en fonction du nombre de clusters
fig = px.bar(x = range(2,11), y = s_score)
fig.show(renderer="iframe")

In [None]:
# On ré-entraîne un KMeans avec le nombre optimal de clusters
kmeans = KMeans(n_clusters= 3)
kmeans.fit(X_mardi)

In [None]:
df_mercredi.loc[:,'Cluster_KMeans'] = kmeans.predict(X_mercredi)
df_mercredi.head()

In [None]:
fig = px.scatter_mapbox(df_mercredi[(df_mercredi.Cluster_KMeans >=-1)], 
        lat="lat", 
        lon="lon",
        hover_name = 'hour',                
        color="Cluster_KMeans",
        mapbox_style="carto-positron",
        animation_frame="hour"
)
fig.show()

In [1]:
############### MODELE NON RETENU   #########################################