In [21]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
import pandas as pd

In [93]:
def RMSE(x,y):
  return np.sqrt(mean_squared_error(x,y))

In [94]:
set = pd.read_csv("waiting_times_train.csv")
set.describe()
set.head()

Unnamed: 0,DATETIME,ENTITY_DESCRIPTION_SHORT,ADJUST_CAPACITY,DOWNTIME,CURRENT_WAIT_TIME,TIME_TO_PARADE_1,TIME_TO_PARADE_2,TIME_TO_NIGHT_SHOW,WAIT_TIME_IN_2H
0,2022-02-05 11:45:00,Water Ride,247.0,0,20,,,,30.0
1,2019-02-24 10:45:00,Water Ride,247.0,0,30,375.0,,495.0,25.0
2,2021-07-17 15:45:00,Pirate Ship,280.5,0,35,,,,35.0
3,2022-04-03 19:45:00,Pirate Ship,230.35,0,15,-135.0,,195.0,10.0
4,2021-10-20 10:30:00,Pirate Ship,153.0,0,15,,,,10.0


In [95]:
meteo = pd.read_csv("weather_data.csv")
train_set = set.merge(meteo, on= "DATETIME", how = "left")


In [96]:

null_rows = ["TIME_TO_PARADE_1","TIME_TO_PARADE_2","TIME_TO_NIGHT_SHOW"]

for j in null_rows:
    train_set[j] = train_set[j].fillna(600.0)

train_set["ENTITY_DESCRIPTION_SHORT"] = train_set["ENTITY_DESCRIPTION_SHORT"].replace({
    "Water Ride" : 0,
    "Pirate Ship": 0.5,
    "Flying Coaster": 1
})

train_set['DATETIME'] = pd.to_datetime(train_set['DATETIME'])
train_set['ANNEE'] = train_set['DATETIME'].dt.year
train_set['MOIS'] = train_set['DATETIME'].dt.month
train_set['MINUTES_JOUR'] = train_set['DATETIME'].dt.hour * 60 + train_set['DATETIME'].dt.minute



train_set.head()

  train_set["ENTITY_DESCRIPTION_SHORT"] = train_set["ENTITY_DESCRIPTION_SHORT"].replace({


Unnamed: 0,DATETIME,ENTITY_DESCRIPTION_SHORT,ADJUST_CAPACITY,DOWNTIME,CURRENT_WAIT_TIME,TIME_TO_PARADE_1,TIME_TO_PARADE_2,TIME_TO_NIGHT_SHOW,WAIT_TIME_IN_2H,temp,...,feels_like,pressure,humidity,wind_speed,rain_1h,snow_1h,clouds_all,ANNEE,MOIS,MINUTES_JOUR
0,2022-02-05 11:45:00,0.0,247.0,0,20,600.0,600.0,600.0,30.0,6.0175,...,3.63,1026.75,93.0,3.15,0.38375,0.139065,64.0,2022,2,705
1,2019-02-24 10:45:00,0.0,247.0,0,30,375.0,600.0,495.0,25.0,7.92,...,6.01,1035.75,41.75,3.0025,0.162423,0.347362,0.0,2019,2,645
2,2021-07-17 15:45:00,0.5,280.5,0,35,600.0,600.0,600.0,35.0,21.77,...,22.04,1025.0,78.0,4.5325,0.243329,0.184573,100.0,2021,7,945
3,2022-04-03 19:45:00,0.5,230.35,0,15,-135.0,600.0,195.0,10.0,6.865,...,5.275,1023.0,45.5,2.3425,0.242228,0.461012,86.5,2022,4,1185
4,2021-10-20 10:30:00,0.5,153.0,0,15,600.0,600.0,600.0,10.0,15.14,...,15.02,1010.5,88.5,6.885,1.0,0.181307,79.5,2021,10,630


In [97]:
Y = train_set['WAIT_TIME_IN_2H']
print(Y)
categories = train_set.columns.tolist()
print(categories)

0        30.0
1        25.0
2        35.0
3        10.0
4        10.0
         ... 
37013    10.0
37014    20.0
37015    10.0
37016    45.0
37017    20.0
Name: WAIT_TIME_IN_2H, Length: 37018, dtype: float64
['DATETIME', 'ENTITY_DESCRIPTION_SHORT', 'ADJUST_CAPACITY', 'DOWNTIME', 'CURRENT_WAIT_TIME', 'TIME_TO_PARADE_1', 'TIME_TO_PARADE_2', 'TIME_TO_NIGHT_SHOW', 'WAIT_TIME_IN_2H', 'temp', 'dew_point', 'feels_like', 'pressure', 'humidity', 'wind_speed', 'rain_1h', 'snow_1h', 'clouds_all', 'ANNEE', 'MOIS', 'MINUTES_JOUR']


In [98]:
train_set["snow_1h"] = train_set["snow_1h"].fillna(0)

print("Nombre de valeurs NaN par colonne :")
print(train_set.isnull().sum())


Nombre de valeurs NaN par colonne :
DATETIME                    0
ENTITY_DESCRIPTION_SHORT    0
ADJUST_CAPACITY             0
DOWNTIME                    0
CURRENT_WAIT_TIME           0
TIME_TO_PARADE_1            0
TIME_TO_PARADE_2            0
TIME_TO_NIGHT_SHOW          0
WAIT_TIME_IN_2H             0
temp                        0
dew_point                   0
feels_like                  0
pressure                    0
humidity                    0
wind_speed                  0
rain_1h                     0
snow_1h                     0
clouds_all                  0
ANNEE                       0
MOIS                        0
MINUTES_JOUR                0
dtype: int64


In [99]:

max, min = train_set["WAIT_TIME_IN_2H"].max(), train_set["WAIT_TIME_IN_2H"].min()
for col in train_set.columns:
    train_set[col] = (train_set[col] - train_set[col].min()) / (train_set[col].max() - train_set[col].min())
    
train_set.head()
    

Unnamed: 0,DATETIME,ENTITY_DESCRIPTION_SHORT,ADJUST_CAPACITY,DOWNTIME,CURRENT_WAIT_TIME,TIME_TO_PARADE_1,TIME_TO_PARADE_2,TIME_TO_NIGHT_SHOW,WAIT_TIME_IN_2H,temp,...,feels_like,pressure,humidity,wind_speed,rain_1h,snow_1h,clouds_all,ANNEE,MOIS,MINUTES_JOUR
0,0.864131,0.0,0.323678,0.0,0.129032,1.0,1.0,0.761194,0.193548,0.198898,...,0.216751,0.688462,0.915663,0.25495,0.040274,0.087016,0.64,1.0,0.090909,0.234043
1,0.103116,0.0,0.323678,0.0,0.193548,0.791667,1.0,0.656716,0.16129,0.242144,...,0.266835,0.826923,0.298193,0.242781,0.00886,0.217353,0.0,0.25,0.090909,0.148936
2,0.720813,0.5,0.36819,0.0,0.225806,1.0,1.0,0.761194,0.225806,0.55697,...,0.604167,0.661538,0.73494,0.369018,0.020343,0.115492,1.0,0.75,0.545455,0.574468
3,0.904641,0.5,0.301555,0.0,0.096774,0.319444,1.0,0.358209,0.064516,0.218162,...,0.251368,0.630769,0.343373,0.188325,0.020187,0.288466,0.865,1.0,0.272727,0.914894
4,0.787784,0.5,0.198778,0.0,0.096774,1.0,1.0,0.761194,0.064516,0.406262,...,0.456439,0.438462,0.861446,0.563119,0.127742,0.113448,0.795,0.75,0.818182,0.12766


In [100]:
X_train, X_test, Y_train, Y_test = train_test_split(
    train_set[ ['ENTITY_DESCRIPTION_SHORT', 'ADJUST_CAPACITY', 'DOWNTIME', 'CURRENT_WAIT_TIME', 'TIME_TO_PARADE_1', 'TIME_TO_PARADE_2', 'TIME_TO_NIGHT_SHOW','temp', 'dew_point', 'feels_like', 'pressure', 'humidity', 'wind_speed', 'rain_1h', 'snow_1h', 'clouds_all','ANNEE', 'MINUTES_JOUR', 'MOIS']], train_set['WAIT_TIME_IN_2H'], 
    test_size=0.2,      # 20% pour le test
    random_state=42,    # Seed pour la reproductibilité
    shuffle=True        # Mélanger avant de séparer (par défaut)
)

In [101]:
def poly_fit(X, Y, deg):
            
    #prend X et renvoit un tableau avec les X, X**2 ect jusqu'à X**deg
    X_poly = PolynomialFeatures(degree=deg).fit_transform(X)
    
    lin_reg = LinearRegression()
    lin_reg.fit(X_poly,Y)
    return lin_reg

#pour appliquer la regression sur de nouvelles valeures
def poly_apply(lin_reg, degree,X):
    
    
    X_poly = PolynomialFeatures(degree=degree).fit_transform(X)
    return lin_reg.predict(X_poly)


In [105]:
#calcul du rmse + affichage des prédictions à coté des vraies valeurs
deg = 2

lin_reg = poly_fit(X_train,Y_train, deg = deg)

RMSE_train = RMSE(poly_apply(lin_reg,deg,X_train),Y_train)


print(RMSE_train)

0.06165375290987041


In [85]:

def arrondir_multiple_5(valeurs):
    """Arrondit les valeurs au multiple de 5 le plus proche"""
    return np.round(valeurs / 5) * 5

In [106]:
RMSE_test = RMSE(poly_apply(lin_reg,deg,X_test),Y_test)
print(RMSE_test)

RMSE_test_arr = RMSE(arrondir_multiple_5(poly_apply(lin_reg,deg,X_test)),Y_test)
print(RMSE_test_arr)

0.062647235832449
0.179245769127818


In [112]:
set = pd.read_csv("waiting_times_X_test_val.csv")

meteo = pd.read_csv("weather_data.csv")
train_set = set.merge(meteo, on= "DATETIME", how = "left")
null_rows = ["TIME_TO_PARADE_1","TIME_TO_PARADE_2","TIME_TO_NIGHT_SHOW"]

for j in null_rows:
    train_set[j] = train_set[j].fillna(600.0)

train_set["ENTITY_DESCRIPTION_SHORT"] = train_set["ENTITY_DESCRIPTION_SHORT"].replace({
    "Water Ride" : 0,
    "Pirate Ship": 0.5,
    "Flying Coaster": 1
})

train_set['DATETIME'] = pd.to_datetime(train_set['DATETIME'])
train_set['ANNEE'] = train_set['DATETIME'].dt.year
train_set['MOIS'] = train_set['DATETIME'].dt.month
train_set['MINUTES_JOUR'] = train_set['DATETIME'].dt.hour * 60 + train_set['DATETIME'].dt.minute



categories = train_set.columns.tolist()
train_set["snow_1h"] = train_set["snow_1h"].fillna(0)

for col in train_set.columns:
    train_set[col] = (train_set[col] - train_set[col].min()) / (train_set[col].max() - train_set[col].min())
    
    
X_val = train_set[ ['ENTITY_DESCRIPTION_SHORT', 'ADJUST_CAPACITY', 'DOWNTIME', 'CURRENT_WAIT_TIME', 'TIME_TO_PARADE_1', 'TIME_TO_PARADE_2', 'TIME_TO_NIGHT_SHOW','temp', 'dew_point', 'feels_like', 'pressure', 'humidity', 'wind_speed', 'rain_1h', 'snow_1h', 'clouds_all','ANNEE', 'MINUTES_JOUR', 'MOIS']]


Y_val = poly_apply(lin_reg,deg,X_val)

Y_val = Y_val*max + (1 - Y_val)*min
print(Y_val)

  train_set["ENTITY_DESCRIPTION_SHORT"] = train_set["ENTITY_DESCRIPTION_SHORT"].replace({


[24.72475394 28.84715928 35.97860055 ... 10.11259399  0.7662653
 24.05379852]


In [116]:
set = pd.read_csv("waiting_times_X_test_val.csv")
new_set = set [["DATETIME","ENTITY_DESCRIPTION_SHORT"]]
new_set["y_pred"] = Y_val
new_set["KEY"]="Validation"


new_set.to_csv("val.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_set["y_pred"] = Y_val
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_set["KEY"]="Validation"


In [None]:
# Evaluate RMSE for polynomial degrees from 1 to 8
degrees = range(1, 9)  # Define the range of polynomial degrees to evaluate
RMSE_train_list = []  # List to store RMSE for training data
RMSE_test_list = []   # List to store RMSE for test data

# Loop through each degree, fit the model, and calculate RMSE
for deg in degrees:
        
    lin_reg = poly_fit(X_train, Y_train,deg)
    
    RMSE_train = RMSE(poly_apply(lin_reg, deg, X_train), Y_train)
    RMSE_test = RMSE(poly_apply(lin_reg, deg, X_test),Y_test)


    RMSE_train_list.append(RMSE_train)
    RMSE_test_list.append(RMSE_test)

    print(f"Degree = {deg}, RMSE_train = {RMSE_train:.3f}, RMSE_test = {RMSE_test:.3f}")

# Plot RMSE for training and test sets across different polynomial degrees
plt.plot(degrees, RMSE_train_list, label='Train RMSE', marker='o')
plt.plot(degrees, RMSE_test_list , label='Test RMSE', marker='o')
plt.xlabel('Polynomial Degree')
plt.ylabel('RMSE')
plt.title('RMSE for Training and Test Sets')
plt.legend()
plt.show()

Degree = 1, RMSE_train = 0.066, RMSE_test = 0.067
Degree = 2, RMSE_train = 0.062, RMSE_test = 0.063
Degree = 3, RMSE_train = 0.052, RMSE_test = 0.057
