In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Read the complete flight-delay CSV into a DataFrame, report its
# (rows, columns) dimensions, then show the first ten rows as the cell output.
df = pd.read_csv(filepath_or_buffer='full_data_flightdelay.csv')
print('Nombre de lignes et colonnes', df.shape)
df.head(n=10)

Nombre de lignes et colonnes (6489062, 26)


Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,7,0,0800-0859,2,1,25,143,Southwest Airlines Co.,13056,...,8,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
1,1,7,0,0700-0759,7,1,29,191,Delta Air Lines Inc.,13056,...,3,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
2,1,7,0,0600-0659,7,1,27,199,Delta Air Lines Inc.,13056,...,18,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
3,1,7,0,0600-0659,9,1,27,180,Delta Air Lines Inc.,13056,...,2,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
4,1,7,0,0001-0559,7,1,10,182,Spirit Air Lines,13056,...,1,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
5,1,7,0,0001-0559,3,1,10,180,Frontier Airlines Inc.,13056,...,5,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
6,1,7,0,0700-0759,6,1,29,186,Frontier Airlines Inc.,13056,...,2,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
7,1,7,1,0001-0559,7,1,10,186,Frontier Airlines Inc.,13056,...,3,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
8,1,7,0,0001-0559,7,1,10,180,Frontier Airlines Inc.,13056,...,3,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
9,1,7,0,0600-0659,8,1,27,186,Frontier Airlines Inc.,13056,...,1,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91


In [5]:
# Random sample of 100,000 rows (fixed seed, so the draw is repeatable)
# used by the prediction steps that follow.
df_sample = df.sample(random_state=42, n=100_000)

In [6]:
# Columns of the sample that are used as predictors for the model.
Variables_importantes = [
    'MONTH',
    'DAY_OF_WEEK',
    'DISTANCE_GROUP',
    'CONCURRENT_FLIGHTS',
    'NUMBER_OF_SEATS',
    'CARRIER_NAME',
    'PLANE_AGE',
    'DEPARTING_AIRPORT',
    'PREVIOUS_AIRPORT',
    'PRCP',
    'SNOW',
    'SNWD',
    'TMAX',
    'AWND',
    'DEP_TIME_BLK',
]

# Plain-text preview of the selected predictor columns.
print(df_sample.loc[:, Variables_importantes])


         MONTH  DAY_OF_WEEK  DISTANCE_GROUP  CONCURRENT_FLIGHTS  \
984735       3            5               3                  27   
4740332      9            7               5                   7   
6361351     12            1               3                  15   
4012189      8            3               6                  22   
4789024      9            5               2                   3   
...        ...          ...             ...                 ...   
1076182      3            5               2                  84   
2242964      5            5               1                   3   
2202270      5            2               4                  34   
4340268      9            2               4                  11   
343878       1            7               7                  21   

         NUMBER_OF_SEATS            CARRIER_NAME  PLANE_AGE  \
984735                69   Midwest Airline, Inc.         11   
4740332              128  American Airlines Inc.          6   
636135

In [14]:
# Features are the selected predictor columns; the target is DEP_DEL15.
X = df_sample[Variables_importantes]
y = df_sample['DEP_DEL15']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Work on explicit copies: the column assignments below must modify these two
# frames only, and must not trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# --- Encoding of the categorical variables ---

cat_col = ['CARRIER_NAME', 'DEPARTING_AIRPORT', 'PREVIOUS_AIRPORT', 'DEP_TIME_BLK']
label_encoder = {}  # column name -> LabelEncoder fitted on that column's training values

for col in cat_col:
    # A fresh encoder per column: each entry of `label_encoder` must keep the
    # classes of its own column, and the cell must not rely on an `le`
    # variable left over from a previous kernel session.
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))

    # Encode the test set with the training classes only. Categories that
    # were never seen during fit receive the sentinel code -1. A dict lookup
    # yields the same codes as le.transform without one call per row.
    code_by_class = {label: code for code, label in enumerate(le.classes_)}
    X_test[col] = X_test[col].astype(str).map(code_by_class).fillna(-1).astype(int)

    label_encoder[col] = le

# --- Standardisation of the numeric variables ---
# SNWD is included so that every numeric predictor is scaled consistently.
num_cols = ['MONTH', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'CONCURRENT_FLIGHTS', 'NUMBER_OF_SEATS', 'PLANE_AGE', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'AWND']

# Fit the scaler on the training split only, then apply it to the test split,
# so that no statistic from the test data leaks into preprocessing.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# --- Random forest training ---
# NOTE(review): the recorded run shows very low recall on class 1; consider
# class_weight='balanced' and/or a stratified split. Left unchanged here so
# results stay comparable with the recorded output.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

rf.fit(X_train, y_train)

# Prediction on the held-out split and evaluation.
y_pred = rf.predict(X_test)

print("Accuracy :", accuracy_score(y_test, y_pred))
print("\nClassification report :\n", classification_report(y_test, y_pred))


Accuracy : 0.8151

Classification report :
               precision    recall  f1-score   support

           0       0.82      0.99      0.90     16248
           1       0.57      0.06      0.11      3752

    accuracy                           0.82     20000
   macro avg       0.69      0.52      0.50     20000
weighted avg       0.77      0.82      0.75     20000



In [15]:
# Feature importances of the fitted random forest, sorted from most to least
# important. The fitted attribute is `feature_importances_`.
# The result gets its own name on purpose: `Variables_importantes` holds the
# list of feature column names used to build X, and rebinding it to this
# Series would break any re-run of the training cell.
importances_variables = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print("\nTop 10 features par importance :\n", importances_variables.head(10))


Top 10 features par importance :
 AWND                  0.103221
TMAX                  0.102631
CONCURRENT_FLIGHTS    0.102064
PREVIOUS_AIRPORT      0.093967
DEP_TIME_BLK          0.091471
PLANE_AGE             0.083732
DEPARTING_AIRPORT     0.079158
NUMBER_OF_SEATS       0.063333
MONTH                 0.058663
DISTANCE_GROUP        0.057852
dtype: float64
