# 🔄 Import des bibliothèques Python

In [0]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# **1️⃣ EDA & compréhension des données**
- >     Importer le dataset (échantillon 1 %)
- >     Vérifier les types de colonnes et les dates (échantillon)
- >     Nettoyer les valeurs manquantes et les doublons (échantillon)

In [0]:
# Load the sample CSV (path is relative to the working directory) into `df`.
df = pd.read_csv('yellowtaxisample1pct_hybrid_stratified.csv')

In [0]:
# Preview the first rows of the loaded sample.
df.head()

In [0]:
# (number of rows, number of columns) of the loaded sample.
df.shape

In [0]:
# Column dtypes and non-null counts, before any type conversion.
df.info()

In [0]:
# Missing values: number of nulls in each column.
df.isnull().sum()

In [0]:
# Duplicates: number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [0]:
# ======================== #
# Column type conversions
# ======================== #
# This cell is written to be safe to re-run: the airport-fee merge only
# happens while the alternate 'Airport_fee' column is still present.

# Datetime: parse pickup / drop-off timestamps.
for col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[col] = pd.to_datetime(df[col])

# Airport fee: the code reads the fee from two differently-cased columns.
# Merge them into the lower-case 'airport_fee' and drop the other one.
if 'Airport_fee' in df.columns:
    if 'airport_fee' in df.columns:
        # Keep 'airport_fee' where present, fall back to 'Airport_fee'.
        df['airport_fee'] = df['airport_fee'].fillna(df['Airport_fee'])
        df = df.drop(columns=['Airport_fee'])
    else:
        # Only the capitalised spelling exists: just normalise its name.
        df = df.rename(columns={'Airport_fee': 'airport_fee'})

# Categorical columns.
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].astype('category')
df['payment_type'] = df['payment_type'].astype('category')

# Nullable integers ('Int64' keeps missing values as <NA> instead of
# forcing a float column).
df['passenger_count'] = df['passenger_count'].astype('Int64')
df['RatecodeID'] = df['RatecodeID'].astype('Int64')

In [0]:
# Re-inspect dtypes and non-null counts after the type conversions.
df.info()

In [0]:
# ==================== #
# Fill missing values
# ==================== #

# Passenger count: missing values are imputed with 1.
df['passenger_count'] = df['passenger_count'].fillna(1)

# RatecodeID: missing values are imputed with 1.
df['RatecodeID'] = df['RatecodeID'].fillna(1)

# store_and_fwd_flag: missing values are imputed with 'N'.
# A categorical column rejects fill values that are not already one of its
# categories, so register 'N' first when it is absent.
flag = df['store_and_fwd_flag']
if isinstance(flag.dtype, pd.CategoricalDtype) and 'N' not in flag.cat.categories:
    flag = flag.cat.add_categories('N')
df['store_and_fwd_flag'] = flag.fillna('N')

# congestion_surcharge: a missing value is treated as no surcharge.
df['congestion_surcharge'] = df['congestion_surcharge'].fillna(0)

# airport_fee: a missing value is treated as no fee.
df['airport_fee'] = df['airport_fee'].fillna(0)


In [0]:
# Re-count nulls per column after the fill step.
df.isnull().sum()

In [0]:
# Trip duration in minutes, derived from the two timestamps; stored on `df`
# so that its range can be reported together with the raw columns.
trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['duration_min'] = trip_length.dt.total_seconds() / 60

# Print the observed min / max of each column that is range-checked next.
# Headings after the first start with a newline to separate the sections.
range_report = [
    ("Passenger count:", 'passenger_count'),
    ("\nTrip distance:", 'trip_distance'),
    ("\nFare amount:", 'fare_amount'),
    ("\nDuration (min):", 'duration_min'),
]
for heading, column in range_report:
    print(heading)
    print("Min:", df[column].min())
    print("Max:", df[column].max())

In [0]:
# Plausibility filters: keep only trips whose values fall in realistic ranges.

# Duration in minutes, recomputed here so this cell does not rely on an
# earlier cell having created the column.
df['duration_min'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

plausible = (
    # Passenger count: 0 < passengers <= 4
    (df['passenger_count'] > 0) & (df['passenger_count'] <= 4)
    # Trip distance: 0 < distance <= 50 miles (urban realistic)
    & (df['trip_distance'] > 0) & (df['trip_distance'] <= 50)
    # Fare amount: 2.5 <= fare <= 200 $
    & (df['fare_amount'] >= 2.5) & (df['fare_amount'] <= 200)
    # Duration: 0 < duration <= 180 minutes (3 h). A strictly positive
    # duration also guarantees that drop-off happens after pickup.
    & (df['duration_min'] > 0) & (df['duration_min'] <= 180)
)

# .copy() makes `df` an independent frame, so adding columns below (and in
# later cells) does not write into a slice of the unfiltered data.
df = df[plausible].copy()

# Speed check: average speed in mph; durations are > 0 here, so the
# division is safe. Keep 1 < speed < 80 mph.
df['speed_mph'] = df['trip_distance'] / (df['duration_min'] / 60)
df = df[(df['speed_mph'] > 1) & (df['speed_mph'] < 80)].copy()


# 2️⃣  Analyse par statistiques inférentielles (échantillon)
- Prix moyen d’une course (fare_amount)
- Distance moyenne d’une course (trip_distance)
- Durée moyenne des courses
- Proportion des courses avec tip > 0
- Distribution des courses par heure/jour/semaine
- Comparaison des fares selon zones géographiques (pickup/dropoff boroughs)
- Analyse des outliers
- Ratio tip/fare moyen par type de paiement (cash vs card)

In [0]:
# Mean fare of a trip (fare_amount) with a 95 % confidence interval.

# Analysed variable and sample size
x = df['fare_amount']
n = len(x)

# Confidence level and the matching two-sided standard-normal quantile
alpha = 0.05
z = stats.norm.ppf(1 - alpha / 2)

# Descriptive statistics (sample standard deviation, ddof=1)
mean_fare = x.mean()
std_fare = x.std(ddof=1)

# Normal-approximation interval: mean +/- z * (std / sqrt(n))
fare_margin = z * (std_fare / np.sqrt(n))
ci_low = mean_fare - fare_margin
ci_high = mean_fare + fare_margin

# Report
for report_line in (
    "=== Analyse du prix des courses (√©chantillon 1 %) ===",
    f"Taille de l'√©chantillon : {n}",
    f"Prix moyen de la course : {mean_fare:.2f} $",
    f"Intervalle de confiance √† 95 % : [{ci_low:.2f} $ ; {ci_high:.2f} $]",
):
    print(report_line)


In [0]:
# Mean trip distance (trip_distance) with a 95 % confidence interval.
y = df['trip_distance']
n = len(y)

# Two-sided standard-normal quantile for the chosen confidence level
alpha = 0.05
m = stats.norm.ppf(1 - alpha / 2)

# Sample mean and sample standard deviation (ddof=1)
mean_distance = y.mean()
std_distance = y.std(ddof=1)

# Interval bounds: mean -/+ quantile * (std / sqrt(n))
distance_margin = m * (std_distance / np.sqrt(n))
ci_low = mean_distance - distance_margin
ci_high = mean_distance + distance_margin

# Report
for report_line in (
    "=== Analyse de la distance des courses (√©chantillon 1 %) ===",
    f"Taille de l'√©chantillon : {n}",
    f"Distance moyenne de la course : {mean_distance:.2f} miles",
    f"Intervalle de confiance √† 95 % : [{ci_low:.2f} ; {ci_high:.2f}] miles",
):
    print(report_line)


In [0]:
# Mean trip duration (duration_min) with a 95 % confidence interval.
y = df['duration_min']
n = len(y)

# Two-sided standard-normal quantile for the chosen confidence level
alpha = 0.05
m = stats.norm.ppf(1 - alpha / 2)

# Sample mean and sample standard deviation (ddof=1)
mean_duration = y.mean()
std_duration = y.std(ddof=1)

# Interval bounds: mean -/+ quantile * (std / sqrt(n))
duration_margin = m * (std_duration / np.sqrt(n))
ci_low = mean_duration - duration_margin
ci_high = mean_duration + duration_margin

# Report
for report_line in (
    "=== Analyse de la dur√©e des courses (√©chantillon 1 %) ===",
    f"Taille de l'√©chantillon : {n}",
    f"Dur√©e moyenne de la course : {mean_duration:.2f} minutes",
    f"Intervalle de confiance √† 95 % : [{ci_low:.2f} ; {ci_high:.2f}] minutes",
):
    print(report_line)


In [0]:
# Histogram of trip durations (minutes), 50 bins.
hist_style = dict(bins=50, color='lightgreen', edgecolor='black')
plt.hist(df['duration_min'], **hist_style)

# Axis labels and title, then render.
plt.xlabel("Dur√©e (minutes)")
plt.ylabel("Nombre de courses")
plt.title("Distribution des dur√©es des courses")
plt.show()


In [0]:
# Share of trips with a positive tip, with a 95 % confidence interval.

# Indicator variable: 1 when tip_amount > 0, else 0
tip_positive = df['tip_amount'].gt(0).astype(int)
n = len(tip_positive)

# The mean of a 0/1 indicator is the sample proportion
prop_tip = tip_positive.mean()
print(f"Proportion des courses avec tip > 0 : {prop_tip:.4f}")

# Standard error of a proportion: sqrt(p * (1 - p) / n)
se = np.sqrt(prop_tip * (1 - prop_tip) / n)

# Two-sided standard-normal quantile (about 1.96 for alpha = 0.05)
alpha = 0.05
m = stats.norm.ppf(1 - alpha / 2)
tip_margin = m * se
ci_low = prop_tip - tip_margin
ci_high = prop_tip + tip_margin

# Same three figures expressed in percent
prop_tip_pct = prop_tip * 100
ci_low_pct = ci_low * 100
ci_high_pct = ci_high * 100

# Report
for report_line in (
    "=== Analyse des tips (√©chantillon 1 %) ===",
    f"Taille de l'√©chantillon : {n}",
    f"Proportion des courses avec tip > 0 : {prop_tip_pct:.2f} %",
    f"Intervalle de confiance √† 95 % : [{ci_low_pct:.2f} % ; {ci_high_pct:.2f} %]",
):
    print(report_line)


In [0]:
import matplotlib.pyplot as plt

# Two-bar chart: share of trips with a tip versus without.
# Reads `prop_tip`, which must already be defined when this cell runs.
tip_labels = ['Tip > 0', 'Tip = 0']
tip_shares = [prop_tip, 1-prop_tip]
tip_colors = ['orange', 'lightgray']

plt.bar(tip_labels, tip_shares, color=tip_colors)
plt.ylabel("Proportion")
plt.title("Proportion des courses avec tip")
plt.show()


In [0]:
# Calendar features derived from the pickup timestamp.
pickup_time = df['tpep_pickup_datetime']

# Pickup hour (0-23)
df['hour'] = pickup_time.dt.hour

# Day of week (0 = Monday, 6 = Sunday)
df['day_of_week'] = pickup_time.dt.dayofweek

# ISO week number
df['week_of_year'] = pickup_time.dt.isocalendar().week

# Show the new columns next to the timestamp they come from
time_columns = ['tpep_pickup_datetime', 'hour', 'day_of_week', 'week_of_year']
df[time_columns].head()


In [0]:
# Number of trips per pickup hour (to spot peak hours).
hourly_counts = df['hour'].value_counts()
hour_dist = hourly_counts.sort_index()

# Bar chart with one tick per hour of the day
plt.figure(figsize=(10,5))
plt.bar(hour_dist.index, hour_dist.values, edgecolor='black', color='skyblue')
plt.xticks(range(0,24))
plt.title("Distribution des courses par heure")
plt.xlabel("Heure de la journ√©e")
plt.ylabel("Nombre de courses")
plt.show()

In [0]:
# Number of trips per day of the week.
# 'day_of_week' comes from dt.dayofweek (0 = Monday ... 6 = Sunday), so the
# labels below are listed in that order.
day_labels = ['Lun','Mar','Mer','Jeu','Ven','Sam','Dim']

# Reindex on 0..6 so the counts always line up with the seven labels:
# a weekday with no trip gets a 0 bar instead of making the label and value
# lists differ in length.
day_dist = (
    df['day_of_week']
    .value_counts()
    .reindex(range(len(day_labels)), fill_value=0)
)

# Visualisation
plt.figure(figsize=(5,3))
plt.bar(day_labels, day_dist.values, color='gray', edgecolor='black')
plt.xlabel("Jour de la semaine")
plt.ylabel("Nombre de courses")
plt.title("Distribution des courses par jour")
plt.show()


In [0]:
# Number of trips per week number, ordered by week.
weekly_counts = df['week_of_year'].value_counts()
week_dist = weekly_counts.sort_index()

# Bar chart of the weekly counts
plt.figure(figsize=(12,5))
plt.bar(week_dist.index, week_dist.values, edgecolor='black', color='pink')
plt.title("Courses par semaine")
plt.xlabel("Semaine")
plt.ylabel("Nombre de courses")
plt.show()


In [0]:
# Mean fare per pickup location, most expensive first.
fares_grouped_by_pickup = df.groupby("PULocationID")["fare_amount"]
prix_par_pickup = fares_grouped_by_pickup.mean().sort_values(ascending=False)
print("=== Prix moyen par Pickup LocationID ===")
print(prix_par_pickup)

# Mean fare per drop-off location, most expensive first.
fares_grouped_by_dropoff = df.groupby("DOLocationID")["fare_amount"]
prix_par_dropoff = fares_grouped_by_dropoff.mean().sort_values(ascending=False)
print("=== Prix moyen par Dropoff LocationID ===")
print(prix_par_dropoff)


In [0]:
# ================================================================ #
# Fare comparison across geographic zones (pickup / drop-off IDs)
# ================================================================ #

# Mean fare per pickup location
pickup_fares = df.groupby("PULocationID")["fare_amount"].mean().sort_values(ascending=False)

# Mean fare per drop-off location
dropoff_fares = df.groupby("DOLocationID")["fare_amount"].mean().sort_values(ascending=False)

# Building the frame aligns both series on their LocationID index, which
# discards the fare ordering above; a location present in only one of the
# two series gets NaN in the other column.
comparaison_zones = pd.DataFrame({
    "Pickup Fare Moyen": pickup_fares,
    "Dropoff Fare Moyen": dropoff_fares
}).reset_index()

comparaison_zones = comparaison_zones.rename(columns={"index": "LocationID"})

# Re-sort by mean pickup fare so that head(20) really is the 20 most
# expensive zones (NaN pickup fares go last); ignore_index renumbers rows
# 0..n-1 in ranking order.
comparaison_zones = comparaison_zones.sort_values(
    "Pickup Fare Moyen", ascending=False, ignore_index=True
)

# Show the 20 most expensive zones
print(comparaison_zones.head(20))


print("Nombre de Pickup LocationID diff√©rents :", df['PULocationID'].nunique())
print("Nombre de Dropoff LocationID diff√©rents :", df['DOLocationID'].nunique())


In [0]:
# fare_amount outliers by the 1.5 * IQR rule (both tails).
fare_values = df['fare_amount']
Q1 = fare_values.quantile(0.25)
Q3 = fare_values.quantile(0.75)
IQR = Q3 - Q1

# Fences below / above which a fare counts as an outlier
seuil_bas = Q1 - 1.5 * IQR
seuil_haut = Q3 + 1.5 * IQR

is_fare_outlier = fare_values.lt(seuil_bas) | fare_values.gt(seuil_haut)
outliers_fare = df[is_fare_outlier]

print("Nombre d‚Äôoutliers (fare_amount) :", outliers_fare.shape[0])


In [0]:
# trip_distance outliers by the 1.5 * IQR rule (upper tail only).
distance_values = df['trip_distance']
Q1 = distance_values.quantile(0.25)
Q3 = distance_values.quantile(0.75)
IQR = Q3 - Q1

# Upper fence above which a distance counts as an outlier
seuil_haut = Q3 + 1.5 * IQR

is_distance_outlier = distance_values.gt(seuil_haut)
outliers_distance = df[is_distance_outlier]

print("Nombre d‚Äôoutliers (trip_distance) :", outliers_distance.shape[0])


In [0]:
# Per-trip tip / fare ratio.
df["tip_fare_ratio"] = df["tip_amount"].div(df["fare_amount"])

# Readable labels for the two payment codes of interest; any other code
# has no entry in the map and therefore gets no label.
payment_map = {1: "Card", 2: "Cash"}
df["payment_label"] = df["payment_type"].map(payment_map)

# Keep only the card and cash trips
is_card_or_cash = df["payment_label"].isin(["Card", "Cash"])
df_filtered = df[is_card_or_cash]

# Mean ratio for each payment type
ratios_by_payment = df_filtered.groupby("payment_label")["tip_fare_ratio"]
ratio_moyen = ratios_by_payment.mean()

print(ratio_moyen)
