In [None]:
#Importation des libraries
import plotly.express as px
from sklearn.cluster import KMeans
import plotly.graph_objects as go
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
import seaborn as sns

In [None]:
#Chargement des données
!wget url_to_the_zip_file "https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Machine+Learning+non+Supervis%C3%A9/Projects/uber-trip-data.zip"
!unzip uber-trip-data.zip


--2023-10-20 10:10:54--  http://url_to_the_zip_file/
Resolving url_to_the_zip_file (url_to_the_zip_file)... failed: Name or service not known.
wget: unable to resolve host address ‘url_to_the_zip_file’
--2023-10-20 10:10:54--  https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Machine+Learning+non+Supervis%C3%A9/Projects/uber-trip-data.zip
Resolving full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com (full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com)... 16.12.18.18, 3.5.225.114
Connecting to full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com (full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com)|16.12.18.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 101042186 (96M) [application/zip]
Saving to: ‘uber-trip-data.zip.2’


2023-10-20 10:10:57 (33.7 MB/s) - ‘uber-trip-data.zip.2’ saved [101042186/101042186]

FINISHED --2023-10-20 10:10:57--
Total wall clock time: 3.2s
Downloaded: 1 files, 96M in 2.9s (33.7 MB/s)
Archive:  uber-trip-da

In [None]:
# Read the April 2014 pickups. The path is relative to the working directory because that
# is where the download cell saves and extracts the archive; the former absolute
# "/content/..." path only resolved when the working directory happened to be /content.
data = pd.read_csv("uber-trip-data/uber-raw-data-apr14.csv")

In [None]:
# Basic statistics: row count, preview, summary of every column, and missing-value rate.
n_rows = data.shape[0]
print("Number of rows : {}".format(n_rows))
print()

print("Display of dataset: ")
display(data.head())
print()

print("Basics statistics: ")
# include='all' summarises the non-numeric columns as well as the numeric ones
data_desc = data.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
missing_per_column = data.isnull().sum()
display(100 * missing_per_column / n_rows)

Number of rows : 564516

Display of dataset: 


Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512



Basics statistics: 


Unnamed: 0,Date/Time,Lat,Lon,Base
count,564516,564516.0,564516.0,564516
unique,41999,,,5
top,4/7/2014 20:21:00,,,B02682
freq,97,,,227808
mean,,40.740005,-73.976817,
std,,0.036083,0.050426,
min,,40.0729,-74.7733,
25%,,40.7225,-73.9977,
50%,,40.7425,-73.9848,
75%,,40.7607,-73.97,



Percentage of missing values: 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

In [None]:
# Draw a 10,000-row sample of the full dataset.
# random_state pins the draw so that the clusters, scores and maps computed further down
# are the same on every Restart & Run All instead of changing with each execution.
data_sample = data.sample(10000, random_state=0)
data_sample.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
161885,4/24/2014 15:57:00,40.7441,-73.9733,B02598
150875,4/22/2014 22:22:00,40.7305,-73.9951,B02598
433164,4/15/2014 5:19:00,40.706,-74.0164,B02682
225681,4/2/2014 22:50:00,40.7512,-73.902,B02617
382168,4/7/2014 18:56:00,40.7167,-74.0082,B02682


In [None]:
# Map of the sampled pickups, one colour per dispatching base.
map_options = dict(lat="Lat", lon="Lon", color="Base", mapbox_style="carto-positron")
fig = px.scatter_mapbox(data_sample, **map_options)

fig.show()

In [None]:
# Cleaning of data_sample

# Convert the 'Date/Time' column from text to datetime

from datetime import datetime  # no longer used by this cell; kept so the name stays available to any cell that relies on it
date_format = '%m/%d/%Y %H:%M:%S'

# Vectorised parsing replaces the per-row datetime.strptime call. Besides being faster,
# it makes the cell safe to re-run: strptime raised a TypeError once the column already
# held timestamps, whereas pd.to_datetime accepts an already-converted column.
data_sample['Date/Time'] = pd.to_datetime(data_sample['Date/Time'], format=date_format)
data_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 161885 to 188312
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date/Time  10000 non-null  datetime64[ns]
 1   Lat        10000 non-null  float64       
 2   Lon        10000 non-null  float64       
 3   Base       10000 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 390.6+ KB


In [None]:
# Preview the first rows of data_sample to check the converted 'Date/Time' column
data_sample.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
161885,2014-04-24 15:57:00,40.7441,-73.9733,B02598
150875,2014-04-22 22:22:00,40.7305,-73.9951,B02598
433164,2014-04-15 05:19:00,40.706,-74.0164,B02682
225681,2014-04-02 22:50:00,40.7512,-73.902,B02617
382168,2014-04-07 18:56:00,40.7167,-74.0082,B02682


In [None]:
# Split 'Date/Time' into three columns: hour of day, day of week and day of month.
# The sample covers April 2014 (original author's note: this month has no public holiday).

timestamps = data_sample['Date/Time'].dt
data_sample['hour'] = timestamps.hour
data_sample['dayofweek'] = timestamps.dayofweek
data_sample['dayofmonth'] = timestamps.day

# Drop 'Date/Time', now redundant with the derived columns, together with 'Base'.
data_apr14 = data_sample.drop(["Date/Time", "Base"], axis=1)

data_apr14.head()

Unnamed: 0,Lat,Lon,hour,dayofweek,dayofmonth
161885,40.7441,-73.9733,15,3,24
150875,40.7305,-73.9951,22,1,22
433164,40.706,-74.0164,5,1,15
225681,40.7512,-73.902,22,2,2
382168,40.7167,-74.0082,18,0,7


In [None]:
# Split data_apr14 into three frames, each keeping Lat/Lon plus a single time feature:
# 1/ traffic by hour, 2/ traffic by day of week, 3/ traffic by day of month.

time_features = ("hour", "dayofweek", "dayofmonth")
data_hour, data_week, data_month = (
    data_apr14.drop(columns=[other for other in time_features if other != kept])
    for kept in time_features
)


In [None]:
# Plotly map of data_hour, coloured by pickup hour
map_options = dict(lat="Lat", lon="Lon", color="hour", mapbox_style="carto-positron")
fig = px.scatter_mapbox(data_hour, **map_options)

fig.show()

In [None]:
# Plotly map of data_week, coloured by day of week
map_options = dict(lat="Lat", lon="Lon", color="dayofweek", mapbox_style="carto-positron")
fig = px.scatter_mapbox(data_week, **map_options)

fig.show()

In [None]:
# Plotly map of data_month, coloured by day of month

map_options = dict(lat="Lat", lon="Lon", color="dayofmonth", mapbox_style="carto-positron")
fig = px.scatter_mapbox(data_month, **map_options)

fig.show()

In [None]:
# Test with data_hour
# Preprocessing: the features are already numeric, so standardisation is the only step.

numeric_features = [0, 1]  # positions of the quantitative columns (Lat, Lon) in data_hour
numeric_transformer = StandardScaler()

# Wrap the scaler in a ColumnTransformer; columns not listed are dropped from the output
scaling_step = ('num', numeric_transformer, numeric_features)
preprocessor = ColumnTransformer(transformers=[scaling_step])

# Apply the preprocessing to the dataset
print("Preprocessing sur le train set...")
print(data_hour.head())
X = preprocessor.fit_transform(data_hour)  # fit and transform in one pass
print('...Terminé.')
print(X[:5])
print()


Preprocessing sur le train set...
            Lat      Lon  hour
161885  40.7441 -73.9733    15
150875  40.7305 -73.9951    22
433164  40.7060 -74.0164     5
225681  40.7512 -73.9020    22
382168  40.7167 -74.0082    18
...Terminé.
[[ 0.10782558  0.06628403]
 [-0.27011975 -0.37154452]
 [-0.95097715 -0.79933113]
 [ 0.30513527  1.49826456]
 [-0.6536231  -0.63464333]]



In [None]:
# Start with DBSCAN, using eps=0.2, min_samples=100 and metric="manhattan"

dbscan_params = {"eps": 0.2, "min_samples": 100, "metric": "manhattan"}
db = DBSCAN(**dbscan_params)

db.fit(X)

In [None]:
# Check which cluster labels DBSCAN produced. The label -1 is the one the map cell
# further down filters out as DBSCAN outliers, so it does not count as a cluster.
np.unique(db.labels_)

array([-1,  0,  1,  2,  3])

In [None]:
# Store each row's DBSCAN label in a new 'cluster' column
dbscan_labels = db.labels_
data_hour["cluster"] = dbscan_labels
data_hour.head()

Unnamed: 0,Lat,Lon,hour,cluster,cluster_kmeans
161885,40.7441,-73.9733,15,0,4
150875,40.7305,-73.9951,22,0,0
433164,40.706,-74.0164,5,0,0
225681,40.7512,-73.902,22,-1,2
382168,40.7167,-74.0082,18,0,0


In [None]:
# Map the DBSCAN clusters, leaving out the rows DBSCAN labelled as outliers (cluster == -1).

clustered_rows = data_hour[data_hour["cluster"] != -1]
fig = px.scatter_mapbox(
    clustered_rows, lat="Lat", lon="Lon", color="cluster", mapbox_style="carto-positron"
)

fig.show()

In [None]:
# Try a narrower slice of data_apr14: pickups at 17h on Wednesdays.
# pandas encodes dayofweek with Monday=0, so the value 2 is Wednesday (not Tuesday), and
# the filter keeps every Wednesday of the sample, not only April 2nd.
# NOTE(review): the original comment announced "Tuesday April 2 at 17h"; if a single date
# is really wanted, filter on dayofmonth == 2 as well.

at_17h = data_apr14["hour"] == 17
on_wednesday = data_apr14["dayofweek"] == 2
data_test = data_apr14.loc[at_17h & on_wednesday]
data_test = data_test.drop(columns="dayofmonth")

data_test.head()

Unnamed: 0,Lat,Lon,hour,dayofweek
155476,40.7741,-73.8726,17,2
337687,40.7615,-73.9995,17,2
337646,40.7591,-73.971,17,2
491023,40.8059,-73.9649,17,2
213133,40.7233,-73.9966,17,2


In [None]:
# Test with data_test
# Preprocessing: the features are already numeric, so standardisation is the only step.

numeric_features = [0, 1]  # positions of the quantitative columns (Lat, Lon) in data_test
numeric_transformer = StandardScaler()

# Wrap the scaler in a ColumnTransformer; columns not listed are dropped from the output
scaling_step = ('num', numeric_transformer, numeric_features)
preprocessor = ColumnTransformer(transformers=[scaling_step])

# Apply the preprocessing to the dataset
print("Preprocessing sur le train set...")
print(data_test.head())
X_db = preprocessor.fit_transform(data_test)  # fit and transform in one pass
print('...Terminé.')
print(X_db[:5])
print()

Preprocessing sur le train set...
            Lat      Lon  hour  dayofweek
155476  40.7741 -73.8726    17          2
337687  40.7615 -73.9995    17          2
337646  40.7591 -73.9710    17          2
491023  40.8059 -73.9649    17          2
213133  40.7233 -73.9966    17          2
...Terminé.
[[ 1.04496062  2.81279452]
 [ 0.63890997 -0.48642923]
 [ 0.56156698  0.25453119]
 [ 2.06975512  0.41312271]
 [-0.59213249 -0.41103326]]



In [None]:
# DBSCAN on data_test. Parameters: eps=0.1, min_samples=10 and metric="manhattan".
# (The previous comment announced eps=0.4, which did not match the value actually passed.)

db = DBSCAN(eps=0.1, min_samples=10, metric="manhattan")

db.fit(X_db)

In [None]:
# Check which cluster labels DBSCAN produced. The label -1 is the one the map cell
# further down filters out as DBSCAN outliers, so it does not count as a cluster.
np.unique(db.labels_)

array([-1,  0])

In [None]:
# Store each row's DBSCAN label in a new 'cluster' column
dbscan_labels = db.labels_
data_test["cluster"] = dbscan_labels
data_test.head()

Unnamed: 0,Lat,Lon,hour,dayofweek,cluster
155476,40.7741,-73.8726,17,2,-1
337687,40.7615,-73.9995,17,2,-1
337646,40.7591,-73.971,17,2,0
491023,40.8059,-73.9649,17,2,-1
213133,40.7233,-73.9966,17,2,-1


In [None]:
# Map the DBSCAN clusters, leaving out the rows DBSCAN labelled as outliers (cluster == -1).

clustered_rows = data_test[data_test["cluster"] != -1]
fig = px.scatter_mapbox(
    clustered_rows, lat="Lat", lon="Lon", color="cluster", mapbox_style="carto-positron"
)

fig.show()

In [None]:
# Now try KMeans on data_test
# Preprocessing: the features are already numeric, so standardisation is the only step.

numeric_features = [0, 1]  # positions of the quantitative columns (Lat, Lon) in data_test
numeric_transformer = StandardScaler()

# Wrap the scaler in a ColumnTransformer; columns not listed are dropped from the output
scaling_step = ('num', numeric_transformer, numeric_features)
preprocessor = ColumnTransformer(transformers=[scaling_step])

# Apply the preprocessing to the dataset
print("Preprocessing sur le train set...")
print(data_test.head())
X_km = preprocessor.fit_transform(data_test)  # fit and transform in one pass
print('...Terminé.')
print(X_km[:5])
print()

Preprocessing sur le train set...
            Lat      Lon  hour  dayofweek  cluster
155476  40.7741 -73.8726    17          2       -1
337687  40.7615 -73.9995    17          2       -1
337646  40.7591 -73.9710    17          2        0
491023  40.8059 -73.9649    17          2       -1
213133  40.7233 -73.9966    17          2       -1
...Terminé.
[[ 1.04496062  2.81279452]
 [ 0.63890997 -0.48642923]
 [ 0.56156698  0.25453119]
 [ 2.06975512  0.41312271]
 [-0.59213249 -0.41103326]]



In [None]:
# Elbow method: record the within-cluster sum of squares (inertia) for k = 2..10
# to help choose the number of clusters.

wcss = []
for i in range(2, 11):
    # random_state makes the curve reproducible from run to run; n_init is spelled out
    # because its default depends on the scikit-learn version (and leaving it implicit
    # raises a FutureWarning on some releases).
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(X_km)
    wcss.append(kmeans.inertia_)

print(wcss)





















[217.4243945152216, 117.64000558504131, 85.14126119386171, 62.21632592196173, 42.422442948604356, 33.05177945472137, 26.252004648096467, 21.080937241892123, 17.75877792701492]


In [None]:
# Elbow curve: inertia against the number of clusters tried (k = 2..10)
k_values = range(2, 11)
fig = px.line(x=k_values, y=wcss)
fig.show()

In [None]:
# Use the silhouette score to help choose the number of clusters

from sklearn.metrics import silhouette_score

# Mean silhouette score for each candidate k
sil = []

# Careful, you need to start at i=2 as silhouette score cannot accept less than 2 labels
for i in range(2, 11):
    # Seeded and with an explicit n_init so the scores are reproducible and do not depend
    # on the scikit-learn version's default.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    # fit_predict fits the model and returns the training labels in one call
    sil.append(silhouette_score(X_km, kmeans.fit_predict(X_km)))
print(sil)

















[0.793240332814907, 0.5135362998175481, 0.5185579942952115, 0.5254451456509005, 0.5269959094642716, 0.5027322518105127, 0.46344052884660913, 0.4684657548350735, 0.46225500424201477]








In [None]:
# Silhouette curve: mean silhouette score against the number of clusters tried (k = 2..10)
k_values = range(2, 11)
fig = px.line(x=k_values, y=sil)
fig.show()

In [None]:
# Final KMeans model with 6 clusters. n_init is spelled out because its default depends on
# the scikit-learn version (and leaving it implicit raises a FutureWarning on some releases).
kmeans = KMeans(n_clusters=6, n_init=10, random_state=0)

# Fit kmeans on our dataset
kmeans.fit(X_km)





In [None]:
# Store each row's KMeans cluster in a new 'cluster_kmeans' column
kmeans_labels = kmeans.predict(X_km)
data_test["cluster_kmeans"] = kmeans_labels
data_test.head()

Unnamed: 0,Lat,Lon,hour,dayofweek,cluster,cluster_kmeans
155476,40.7741,-73.8726,17,2,-1,3
337687,40.7615,-73.9995,17,2,-1,1
337646,40.7591,-73.971,17,2,0,1
491023,40.8059,-73.9649,17,2,-1,3
213133,40.7233,-73.9966,17,2,-1,0


In [None]:
# Map the KMeans clusters of data_test.
# The rows filtered out here are the ones DBSCAN labelled as noise ('cluster' == -1, set by
# the earlier DBSCAN cell); the filter does not use the KMeans labels at all, so the
# previous "outliers by KMEANS" wording was misleading.
# NOTE(review): drop the filter if every KMeans-labelled point should be shown.

fig = px.scatter_mapbox(
        data_test[data_test.cluster != -1],
        lat="Lat",
        lon="Lon",
        color="cluster_kmeans",
        mapbox_style="carto-positron"
)

fig.show()

In [None]:
# Now try KMeans on data_hour
# Preprocessing: the features are already numeric, so standardisation is the only step.

numeric_features = [0, 1]  # positions of the quantitative columns (Lat, Lon) in data_hour
numeric_transformer = StandardScaler()

# Wrap the scaler in a ColumnTransformer; columns not listed are dropped from the output
scaling_step = ('num', numeric_transformer, numeric_features)
preprocessor = ColumnTransformer(transformers=[scaling_step])

# Apply the preprocessing to the dataset
print("Preprocessing sur le train set...")
print(data_hour.head())
X_km = preprocessor.fit_transform(data_hour)  # fit and transform in one pass
print('...Terminé.')
print(X_km[:5])
print()

Preprocessing sur le train set...
            Lat      Lon  hour  cluster
161885  40.7441 -73.9733    15        0
150875  40.7305 -73.9951    22        0
433164  40.7060 -74.0164     5        0
225681  40.7512 -73.9020    22       -1
382168  40.7167 -74.0082    18        0
...Terminé.
[[ 0.10782558  0.06628403]
 [-0.27011975 -0.37154452]
 [-0.95097715 -0.79933113]
 [ 0.30513527  1.49826456]
 [-0.6536231  -0.63464333]]



In [None]:
# Elbow method: record the within-cluster sum of squares (inertia) for k = 2..10
# to help choose the number of clusters.

wcss = []
for i in range(2, 11):
    # random_state makes the curve reproducible from run to run; n_init is spelled out
    # because its default depends on the scikit-learn version (and leaving it implicit
    # raises a FutureWarning on some releases).
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(X_km)
    wcss.append(kmeans.inertia_)

print(wcss)





















[14270.030883241921, 9535.096916999235, 7732.562197585356, 6314.489076416821, 5019.338588535293, 4160.855256773477, 3548.498616031135, 3112.7639094207534, 2785.71697588369]


In [None]:
# Elbow curve: inertia against the number of clusters tried (k = 2..10)
k_values = range(2, 11)
fig = px.line(x=k_values, y=wcss)
fig.show()

In [None]:
# Use the silhouette score to help choose the number of clusters

# Mean silhouette score for each candidate k
sil = []

# Careful, you need to start at i=2 as silhouette score cannot accept less than 2 labels
for i in range(2, 11):
    # Seeded and with an explicit n_init so the scores are reproducible and do not depend
    # on the scikit-learn version's default.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    # fit_predict fits the model and returns the training labels in one call
    sil.append(silhouette_score(X_km, kmeans.fit_predict(X_km)))
print(sil)





















[0.6884693144113939, 0.42962750189429955, 0.43694373781540946, 0.45680893171231474, 0.47193413019165126, 0.4840090773041917, 0.4863581133384849, 0.39551128752636117, 0.39638863376389405]


In [None]:
# Silhouette curve: mean silhouette score against the number of clusters tried (k = 2..10)
k_values = range(2, 11)
fig = px.line(x=k_values, y=sil)
fig.show()

In [None]:
# Final KMeans model with 6 clusters. n_init is spelled out because its default depends on
# the scikit-learn version (and leaving it implicit raises a FutureWarning on some releases).
kmeans = KMeans(n_clusters=6, n_init=10, random_state=0)

# Fit kmeans on our dataset
kmeans.fit(X_km)





In [None]:

# Store each row's KMeans cluster in a new 'cluster_kmeans' column
kmeans_labels = kmeans.predict(X_km)
data_hour["cluster_kmeans"] = kmeans_labels
data_hour.head()

Unnamed: 0,Lat,Lon,hour,cluster,cluster_kmeans
161885,40.7441,-73.9733,15,0,4
150875,40.7305,-73.9951,22,0,0
433164,40.706,-74.0164,5,0,0
225681,40.7512,-73.902,22,-1,2
382168,40.7167,-74.0082,18,0,0


In [None]:
# Map the KMeans clusters of data_hour.
# The rows filtered out here are the ones DBSCAN labelled as noise ('cluster' == -1, set by
# the earlier DBSCAN cell); the filter does not use the KMeans labels at all, so the
# previous "outliers by KMEANS" wording was misleading.
# NOTE(review): drop the filter if every KMeans-labelled point should be shown.

fig = px.scatter_mapbox(
        data_hour[data_hour.cluster != -1],
        lat="Lat",
        lon="Lon",
        color="cluster_kmeans",
        mapbox_style="carto-positron"
)

fig.show()