In [None]:
!pip install xgboost
import xgboost as xgb
import pandas as pd
import numpy as np

# visualisations
import matplotlib.pyplot as plt
import seaborn as sns

# ML preprocessing
from sklearn.preprocessing import StandardScaler

# ML models, validation and metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Read the two semicolon-separated extracts (incident records and mobilisation
# records); the first row of each file supplies the column names.
incident = pd.read_csv(
    "LFB Incident data Last 3 years.csv",
    sep=";",
    header=0,
)
mobilisation = pd.read_csv(
    "LFB Mobilisation data Last 3 years.csv",
    sep=";",
    header=0,
)


In [None]:
# Join the incident and mobilisation tables on their shared IncidentNumber
# column (pandas' default join type, "inner", is used).
total = incident.merge(mobilisation, on="IncidentNumber")

In [None]:
# Discard every column that contains at least one missing value.
total = total.dropna(axis="columns")

In [None]:
# Attendance time in whole minutes: seconds / 60, with the fractional part
# truncated by the int64 cast. Built in one assignment; `astype` is called
# without the `copy` keyword because a float -> int conversion always
# produces a new array anyway.
total["minute"] = (total["AttendanceTimeSeconds"] / 60).astype("int64")


In [None]:
# Names of the columns stored as float64 or int64, in their original order.
numerical_cols = [
    col_name
    for col_name, col_dtype in total.dtypes.items()
    if col_dtype in ("float64", "int64")
]
# Subset of the merged table restricted to those numeric columns.
total_numerical = total[numerical_cols]

In [None]:
# Names of the columns stored with the generic `object` dtype, in their
# original order.
qualitative_cols = [
    col_name
    for col_name, col_dtype in total.dtypes.items()
    if col_dtype == "object"
]
# Subset of the merged table restricted to those object-typed columns.
total_qualitative = total[qualitative_cols]

In [None]:
# Remove the listed object-typed columns so that they are not one-hot encoded
# in the next step.
# NOTE(review): presumably these are identifiers / timestamps / codes with too
# many distinct values to encode usefully — confirm.
total_quali = total_qualitative.drop(
    columns=[
        "IncidentNumber",
        "Postcode_district",
        "DateOfCall",
        "TimeOfCall",
        "UPRN",
        "FRS",
        "DateAndTimeMobilised",
        "DateAndTimeArrived",
        "PlusCode_Code",
        "PlusCode_Description",
    ]
)


In [None]:
# One-hot encode the remaining qualitative columns into indicator columns.
test = pd.get_dummies(data=total_quali)

In [None]:
# Put the numeric columns and the indicator columns side by side, aligned on
# the row index, to form the modelling table.
df = pd.concat(objs=[total_numerical, test], axis="columns")

In [None]:
# Features: every column except the raw attendance time and the minute column
# derived from it.
X = df.drop(columns=["AttendanceTimeSeconds", "minute"])
# Target: the `minute` column.
y = df["minute"]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
# Gradient-boosted regressor left at the library's default hyperparameters;
# only the verbosity level is set (to 0).
xgbr = xgb.XGBRegressor(verbosity=0)

In [None]:
# Train the regressor on the training split.
xgbr.fit(X=X_train, y=y_train)

In [None]:
# Score of the fitted model on the same data it was trained on (reported
# below as R²).
score = xgbr.score(X=X_train, y=y_train)
print("Training score R²: ", score)

In [None]:
# 10-fold cross-validation on the training split; one score per fold, then
# the average is reported.
scores = cross_val_score(estimator=xgbr, X=X_train, y=y_train, cv=10)
print("Mean cross-validation score: %.2f" % scores.mean())


In [None]:
# 10-fold cross-validation with the rows shuffled before being split into
# folds. The shuffle is seeded (same seed as the train/test split) so the
# reported average is reproducible from one run to the next.
kfold = KFold(n_splits=10, shuffle=True, random_state=101)
kf_cv_scores = cross_val_score(xgbr, X_train, y_train, cv=kfold)
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())


In [None]:
# Predict on the held-out test split and measure the mean squared error
# against the true targets. Arguments are passed by keyword so the true and
# predicted values cannot be swapped by accident.
y_pred = xgbr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("MSE: %.2f" % mse)

In [None]:
# Root of the mean squared error, i.e. the error in the target's own units.
print("RMSE: %.2f" % (mse ** 0.5))

In [None]:
# Overlay the true and predicted targets for the test split, one point per
# test row in the order the rows appear in the split. An explicit Figure/Axes
# pair is used and both axes are labelled so the figure can be read on its own.
fig, ax = plt.subplots()

x_ax = range(len(y_test))
ax.plot(x_ax, y_test, label="original")
ax.plot(x_ax, y_pred, label="predicted")

ax.set_title("Predicted data")
ax.set_xlabel("Test sample")
ax.set_ylabel("Attendance time (minutes)")

ax.legend()
plt.show()