# Identification des valeurs illogiques et des outliers

In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [None]:
# Location of the input parquet file, relative to the notebook's directory.
file_path = "../data_clean/taxi_trip_2016-clean1.parquet"
# Read the whole file into a DataFrame used by every later cell.
df = pd.read_parquet(path=file_path)

## 1- Identification des données illogiques

In [5]:
# Flag logically valid rows: True -> "data_ok", False -> "data_nok".
# Distance, duration and total amount must be strictly positive; the
# individual fare components may be zero but must not be negative.
# A comparison against a missing value (NaN) evaluates to False, so a row
# with a NaN in any checked column is flagged False.
# The combined comparison is already a boolean mask, so it is assigned
# directly instead of being wrapped in np.where(mask, True, False).
df["data_ok_flag"] = (
    (df["trip_distance"] > 0)
    & (df["trip_time"] > 0)
    & (df["fare_amount"] >= 0)
    & (df["extra"] >= 0)
    & (df["mta_tax"] >= 0)
    & (df["tip_amount"] >= 0)
    & (df["tolls_amount"] >= 0)
    & (df["improvement_surcharge"] >= 0)
    & (df["total_amount"] > 0)
)

In [6]:
# Preview the first five rows, including the new data_ok_flag column.
df.head(n=5)

Unnamed: 0,id,VendorID,date,tpep_pickup_datetime,pickup_hour,passenger_count,trip_distance,trip_time,RatecodeID,store_and_fwd_flag,...,pickup_neighborhood_id,dropoff_neighborhood_id,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,data_ok_flag
0,1,2,2016-01-01,2016-01-01,0,2,1.1,0.0,1,N,...,1.0,1.0,7.5,0.5,0.5,0.0,0.0,0.3,8.8,False
1,2,2,2016-01-01,2016-01-01,0,5,4.9,0.0,1,N,...,2.0,26.0,18.0,0.5,0.5,0.0,0.0,0.3,19.3,False
2,3,2,2016-01-01,2016-01-01,0,1,10.54,0.0,1,N,...,3.0,9.0,33.0,0.5,0.5,0.0,0.0,0.3,34.3,False
3,4,2,2016-01-01,2016-01-01,0,1,4.75,0.0,1,N,...,4.0,8973.0,16.5,0.0,0.5,0.0,0.0,0.3,17.3,False
4,5,2,2016-01-01,2016-01-01,0,3,1.76,0.0,1,N,...,5.0,29.0,8.0,0.0,0.5,0.0,0.0,0.3,8.8,False


## 2- Identification des outliers sur trip_distance et trip_time

In [7]:
# Regression setup: target column (Y) and explanatory columns (X).
Y_column = "fare_amount"
X_columns = [
    "trip_time",
    "trip_distance",
]

In [8]:
# Keep only the rows flagged as logically valid before fitting the model.
# The flag column is already boolean, so it is used directly as the mask
# (no "== True" comparison). The explicit .copy() makes df_trie an
# independent frame: columns are added to it in later cells, and doing that
# on a bare selection of df is the pattern pandas reports as
# SettingWithCopyWarning.
df_trie = df.loc[df["data_ok_flag"]].copy()

In [None]:
# Discard rows with a missing value in the target or in any explanatory
# column (the original note says statsmodels requires this).
model_columns = [Y_column, *X_columns]
df_trie = df_trie.dropna(subset=model_columns)

In [10]:
# Target vector for the regression.
y = df_trie[Y_column]
# Explanatory variables, with a constant column added so the regression
# has an intercept.
X = sm.add_constant(data=df_trie[X_columns])

In [11]:
# Quantile regression of the target on the explanatory variables.
# q = 0.5 fits the conditional median; the original note suggests 0.25 or
# 0.75 as alternatives for stricter bounds.
quantile = 0.5
model = sm.QuantReg(endog=y, exog=X).fit(q=quantile)

In [12]:
# Fitted value of the target for every row, stored as a working column
# (dropped again once the outlier flag has been computed).
df_trie["Y_Pred"] = model.predict(exog=X)

In [13]:
# Absolute residual: distance between the observed target and the fitted value.
df_trie["Residual"] = np.abs(df_trie[Y_column] - df_trie["Y_Pred"])

# Quartiles and interquartile range of the absolute residuals.
Q1 = df_trie["Residual"].quantile(0.25)
Q3 = df_trie["Residual"].quantile(0.75)
IQR = Q3 - Q1

# Tukey fences at 1.5 * IQR. Only the upper fence is applied below: the
# residuals are absolute values (never negative), so the flag tests the
# upper side only. lower_bound is kept for reference but is not used.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Flag the outliers: True -> "Outlier", False -> "Normal".
# The comparison already yields a boolean mask, so it is stored directly
# instead of being wrapped in np.where(mask, True, False).
df_trie["Outlier_Flag"] = df_trie["Residual"] > upper_bound

In [14]:

# Summary (count / unique / top / freq) of the outlier flag.
df_trie.loc[:, "Outlier_Flag"].describe()

count     34285260
unique           2
top          False
freq      29542943
Name: Outlier_Flag, dtype: object

In [15]:
# Remove the intermediate columns that were only needed to build the flag.
working_columns = ["Y_Pred", "Residual"]
df_trie.drop(columns=working_columns, inplace=True)

In [16]:
# Preview the first five rows of the filtered table with its outlier flag.
df_trie.head(n=5)

Unnamed: 0,id,VendorID,date,tpep_pickup_datetime,pickup_hour,passenger_count,trip_distance,trip_time,RatecodeID,store_and_fwd_flag,...,dropoff_neighborhood_id,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,data_ok_flag,Outlier_Flag
5,6,2,2016-01-01,2016-01-01,0,2,5.52,1110.0,1,N,...,549.0,19.0,0.5,0.5,0.0,0.0,0.3,20.3,True,True
6,7,2,2016-01-01,2016-01-01,0,2,7.45,1605.0,1,N,...,16.0,26.0,0.5,0.5,0.0,0.0,0.3,27.3,True,True
7,8,1,2016-01-01,2016-01-01,0,1,1.2,714.0,1,N,...,29.0,9.0,0.5,0.5,0.0,0.0,0.3,10.3,True,False
8,9,1,2016-01-01,2016-01-01,0,1,6.0,672.0,1,N,...,12149.0,18.0,0.5,0.5,0.0,0.0,0.3,19.3,True,True
9,10,2,2016-01-01,2016-01-01,0,1,3.21,666.0,1,N,...,209.0,11.5,0.5,0.5,0.0,0.0,0.3,12.8,True,False


In [17]:
# Bring the outlier flag computed on df_trie back onto the full table,
# joining on the "id" column.
df = df.merge(df_trie[["id", "Outlier_Flag"]], how="left", on="id")

# Rows of df that are absent from df_trie have no match in the left join and
# end up with a missing flag; they are marked as outliers (True).
# A column holding booleans plus missing values is object dtype, and calling
# fillna on it relied on pandas' implicit object -> bool downcasting, which
# is deprecated. Go through the nullable "boolean" dtype instead, then cast
# to plain bool once no missing value remains.
df["Outlier_Flag"] = (
    df["Outlier_Flag"].astype("boolean").fillna(True).astype(bool)
)

  df["Outlier_Flag"] = df["Outlier_Flag"].fillna(True)


In [18]:
# Preview the first twenty rows of the full table after the merge.
df.head(n=20)

Unnamed: 0,id,VendorID,date,tpep_pickup_datetime,pickup_hour,passenger_count,trip_distance,trip_time,RatecodeID,store_and_fwd_flag,...,dropoff_neighborhood_id,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,data_ok_flag,Outlier_Flag
0,1,2,2016-01-01,2016-01-01,0,2,1.1,0.0,1,N,...,1.0,7.5,0.5,0.5,0.0,0.0,0.3,8.8,False,True
1,2,2,2016-01-01,2016-01-01,0,5,4.9,0.0,1,N,...,26.0,18.0,0.5,0.5,0.0,0.0,0.3,19.3,False,True
2,3,2,2016-01-01,2016-01-01,0,1,10.54,0.0,1,N,...,9.0,33.0,0.5,0.5,0.0,0.0,0.3,34.3,False,True
3,4,2,2016-01-01,2016-01-01,0,1,4.75,0.0,1,N,...,8973.0,16.5,0.0,0.5,0.0,0.0,0.3,17.3,False,True
4,5,2,2016-01-01,2016-01-01,0,3,1.76,0.0,1,N,...,29.0,8.0,0.0,0.5,0.0,0.0,0.3,8.8,False,True
5,6,2,2016-01-01,2016-01-01,0,2,5.52,1110.0,1,N,...,549.0,19.0,0.5,0.5,0.0,0.0,0.3,20.3,True,True
6,7,2,2016-01-01,2016-01-01,0,2,7.45,1605.0,1,N,...,16.0,26.0,0.5,0.5,0.0,0.0,0.3,27.3,True,True
7,8,1,2016-01-01,2016-01-01,0,1,1.2,714.0,1,N,...,29.0,9.0,0.5,0.5,0.0,0.0,0.3,10.3,True,False
8,9,1,2016-01-01,2016-01-01,0,1,6.0,672.0,1,N,...,12149.0,18.0,0.5,0.5,0.0,0.0,0.3,19.3,True,True
9,10,2,2016-01-01,2016-01-01,0,1,3.21,666.0,1,N,...,209.0,11.5,0.5,0.5,0.0,0.0,0.3,12.8,True,False


## 3- Enregistrement des données en parquet

In [None]:
# Write the flagged table to parquet without storing the DataFrame index.
df.to_parquet(
    path="../data_clean/taxi_trip_2016-clean.parquet",
    index=False,
)