In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import root_mean_squared_error

from sklearn.svm import SVR

%matplotlib inline


In [2]:
# Load the processed dataset; the first CSV column becomes the DataFrame index.
DATA_PATH = "../../data/processed/compteur_name_dataset.csv"
df = pd.read_csv(DATA_PATH, index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1797907 entries, 12886 to 708012
Data columns (total 9 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   Nom du compteur     object
 1   Comptage horaire    int64 
 2   Jour                int64 
 3   Mois                int64 
 4   Année               int64 
 5   Heure               int64 
 6   Jour_semaine        int64 
 7   Jour férié          int64 
 8   Vacances scolaires  int64 
dtypes: int64(8), object(1)
memory usage: 137.2+ MB


In [4]:
# NOTE(review): `df_t` is never assigned at notebook level in the visible cells
# (it only exists as a local variable inside the helper functions), so this cell
# raises NameError on a fresh kernel. Presumably it held the rows of a single
# counter and was built in a cell that has since been removed -- TODO restore it.
# Target is the "Comptage horaire" column; every other column is a feature.
y = df_t["Comptage horaire"]
X = df_t.drop(columns=["Comptage horaire"])

# One-hot encode every feature column. The encoder is fitted on the whole
# feature table, i.e. before the train/test split.
enc = OneHotEncoder()
X = enc.fit_transform(X)

In [5]:
# Hold out 20 % of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Standardisation
# col_norm = ["Jour", "Mois", "Année", "Heure", "Jour_semaine", "Jour férié", "Vacances scolaires"]

# scaler = StandardScaler()
# df[col_norm] = scaler.fit_transform(df[col_norm])

In [7]:
X_train.shape

(14795, 81)

In [8]:
# Hyper-parameter grid for the SVR grid search.
# `gamma` is only used by the 'rbf', 'poly' and 'sigmoid' kernels, so the linear
# kernel gets its own sub-grid without it: crossing 'linear' with the five gamma
# values would fit the exact same model five times for every value of C.
params = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf', 'poly'],
        'C': [0.1, 1, 10],
        'gamma': [0.001, 0.1, 0.5, 'scale', 'auto'],
    },
]

In [9]:
# SVR hyper-parameters explored by the grid search: kernel, C, gamma.
svr = SVR()
clf = GridSearchCV(estimator=svr, param_grid=params, scoring='r2', n_jobs=-1)
clf.fit(X_train, y_train)
# Evaluate with clf.score(X_test, y_test) rather than svr.score(...):
# GridSearchCV fits clones of `svr`, so `svr` itself is never fitted.

In [10]:
clf.best_params_

{'C': 1, 'gamma': 0.5, 'kernel': 'poly'}

In [13]:
clf.best_score_

np.float64(0.9048462438110443)

In [14]:
clf.score(X_test, y_test)

0.9139023939330039

In [12]:
# Standalone SVR trained with fixed hyper-parameters (the values reported as
# best_params_ by the grid search), so the search does not have to be re-run.
svr_t = SVR(kernel='poly', gamma=0.5, C=1)
svr_t.fit(X_train, y_train)

In [12]:
# Every distinct counter name found in the dataset.
all_compteurs = df["Nom du compteur"].unique().tolist()

# Counters on which the per-counter grid searches are run.
compteurs_to_search = [
    "Totem Cours la Reine O-E",
    "Totem 64 Rue de Rivoli O-E",
    "Totem 73 boulevard de Sébastopol S-N",
    "Totem 64 Rue de Rivoli E-O",
    "Totem 73 boulevard de Sébastopol N-S",
    "Totem Cours la Reine E-O",
    "Totem 85 quai d'Austerlitz SE-NO",
    "Totem 85 quai d'Austerlitz NO-SE",
    "90 Rue De Sèvres SO-NE",
    "90 Rue De Sèvres NE-SO",
    "21 boulevard Auguste Blanqui SO-NE",
    "10 boulevard Auguste Blanqui NE-SO",
    "Pont de la Concorde S-N",
    "Pont de la Concorde N-S",
]

In [4]:
# Fitted GridSearchCV objects keyed by counter name; filled by grid_search_compteur.
# Re-running this cell discards every search stored so far.
compteurs_models = {}

In [None]:
# Parameter grid (C, kernel, gamma) for the per-counter searches: only the
# 'rbf' and 'poly' kernels, and no gamma='auto'.
params = dict(
    C=[0.1, 1, 10],
    kernel=['rbf', 'poly'],
    gamma=[0.001, 0.1, 0.5, 'scale'],
)

In [10]:
def grid_search_compteur(name, params):
    """Grid-search an SVR for one counter and register the fitted search.

    The rows of ``df`` whose "Nom du compteur" equals ``name`` are selected,
    "Comptage horaire" is used as the target and every remaining column is
    one-hot encoded as a feature. After an 80/20 train/test split (seed 42),
    a ``GridSearchCV`` scored with R² is fitted on the training part.

    Args:
        name: Counter name, matched against the "Nom du compteur" column.
        params: Parameter grid forwarded to ``GridSearchCV``.

    Returns:
        None. The fitted search is stored in the notebook-level dict
        ``compteurs_models`` under ``name``; the best parameters, the best
        cross-validation score and the test-set score are printed.
    """
    counter_rows = df[df["Nom du compteur"] == name].drop(columns=["Nom du compteur"])
    target = counter_rows["Comptage horaire"]
    features = counter_rows.drop(columns=["Comptage horaire"])

    # The encoder is fitted on all rows of this counter, before the split.
    encoded = OneHotEncoder().fit_transform(features)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded, target, test_size=0.2, random_state=42
    )

    clf = GridSearchCV(SVR(cache_size=300), params, scoring='r2', n_jobs=-1)
    print(f"Grid searching for {name}")
    clf.fit(X_train, y_train)
    compteurs_models[name] = clf

    print(f"   Best params: {clf.best_params_}")
    print(f"   Best score (internal CV): {clf.best_score_}")
    print(f"   Score on test set: {clf.score(X_test, y_test)}")
    print()

In [11]:
def grid_search_compteur2(name, params):
    """Grid-search a polynomial-kernel SVR (gamma fixed at 0.5) for one counter.

    Data preparation: the rows of ``df`` whose "Nom du compteur" equals
    ``name`` are selected, "Comptage horaire" is the target, all other columns
    are one-hot encoded, and the result is split 80/20 with seed 42. The base
    estimator is ``SVR(cache_size=300, kernel='poly', gamma=0.5)``; ``params``
    supplies the values that are searched.

    Args:
        name: Counter name, matched against the "Nom du compteur" column.
        params: Parameter grid forwarded to ``GridSearchCV``.

    Returns:
        None. The fitted search is stored in the notebook-level dict
        ``compteurs_models2`` under ``name``; the best parameters, the best
        cross-validation score and the test-set score are printed.
    """
    counter_rows = df[df["Nom du compteur"] == name].drop(columns=["Nom du compteur"])
    target = counter_rows["Comptage horaire"]
    features = counter_rows.drop(columns=["Comptage horaire"])

    # The encoder is fitted on all rows of this counter, before the split.
    encoded = OneHotEncoder().fit_transform(features)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded, target, test_size=0.2, random_state=42
    )

    base_svr = SVR(cache_size=300, kernel='poly', gamma=0.5)
    clf = GridSearchCV(base_svr, params, scoring='r2', n_jobs=-1)
    print(f"Grid searching for {name}")
    clf.fit(X_train, y_train)
    compteurs_models2[name] = clf

    print(f"   Best params: {clf.best_params_}")
    print(f"   Best score (internal CV): {clf.best_score_}")
    print(f"   Score on test set: {clf.score(X_test, y_test)}")
    print()

In [6]:
# Run the grid search over kernel, C and gamma for each selected counter.
# `params` has to be passed explicitly: grid_search_compteur(name, params) has
# no default for it, so calling it with the name alone raises TypeError.
for name in compteurs_to_search:
    grid_search_compteur(name, params)

Grid searching for Totem Cours la Reine O-E
   Best params: {'C': 10, 'gamma': 0.5, 'kernel': 'poly'}
   Best score (internal CV): 0.8260804903977604
   Score on test set: 0.697056837835685

Grid searching for Totem 64 Rue de Rivoli O-E
   Best params: {'C': 10, 'gamma': 0.5, 'kernel': 'poly'}
   Best score (internal CV): 0.9141353643754991
   Score on test set: 0.9135948975748883

Grid searching for Totem 73 boulevard de Sébastopol S-N
   Best params: {'C': 10, 'gamma': 0.5, 'kernel': 'poly'}
   Best score (internal CV): 0.9328332135562467
   Score on test set: 0.9342065592457661

Grid searching for Totem 64 Rue de Rivoli E-O
   Best params: {'C': 10, 'gamma': 0.5, 'kernel': 'poly'}
   Best score (internal CV): 0.9053624051056655
   Score on test set: 0.9076871841343599

Grid searching for Totem 73 boulevard de Sébastopol N-S
   Best params: {'C': 10, 'gamma': 0.5, 'kernel': 'poly'}
   Best score (internal CV): 0.9119828559686376
   Score on test set: 0.9118651274503099

Grid searchin

In [7]:
# Fitted GridSearchCV objects keyed by counter name; filled by grid_search_compteur2.
# Re-running this cell discards every search stored so far.
compteurs_models2 = {}

In [12]:
# Grid for the second search round: only C and the polynomial degree vary.
params2 = dict(
    C=[1, 3, 5, 8, 10],
    degree=[3, 5],
)

In [14]:
# Second search round: run grid_search_compteur2 with the `params2` grid
# on every selected counter.
for compteur in compteurs_to_search:
    grid_search_compteur2(compteur, params2)

Grid searching for Totem Cours la Reine O-E
   Best params: {'C': 1, 'degree': 5}
   Best score (internal CV): 0.8398403211878342
   Score on test set: 0.7114447555183359

Grid searching for Totem 64 Rue de Rivoli O-E
   Best params: {'C': 8, 'degree': 3}
   Best score (internal CV): 0.9142527612151813
   Score on test set: 0.9140793495229224

Grid searching for Totem 73 boulevard de Sébastopol S-N
   Best params: {'C': 8, 'degree': 3}
   Best score (internal CV): 0.9333792304911643
   Score on test set: 0.9348844224244343

Grid searching for Totem 64 Rue de Rivoli E-O
   Best params: {'C': 8, 'degree': 3}
   Best score (internal CV): 0.9055130337610958
   Score on test set: 0.9082244295636862

Grid searching for Totem 73 boulevard de Sébastopol N-S
   Best params: {'C': 8, 'degree': 3}
   Best score (internal CV): 0.9127164872286162
   Score on test set: 0.9126110146577768

Grid searching for Totem Cours la Reine E-O
   Best params: {'C': 1, 'degree': 5}
   Best score (internal CV): 0

In [23]:
# Mean of the best C found by the second grid search. The divisor is the number
# of searches actually stored, not a hard-coded 14, so the mean stays correct
# when the counter list changes or the search loop was interrupted.
x = sum(search.best_params_["C"] for search in compteurs_models2.values())
print(x / len(compteurs_models2))

3.5714285714285716


In [6]:
# Fitted SVR models keyed by counter name; filled by svr_compteur.
# Re-running this cell discards every model stored so far.
compteurs_models3 = {}

In [9]:
def svr_compteur(name):
    """Fit a fixed-hyper-parameter SVR for one counter and report its scores.

    The rows of ``df`` whose "Nom du compteur" equals ``name`` are selected,
    "Comptage horaire" is the target and every other column is one-hot
    encoded. The data is split 80/20 (seed 42) and an
    ``SVR(kernel='poly', gamma=0.5, C=1, degree=3)`` is fitted on the
    training part.

    Args:
        name: Counter name, matched against the "Nom du compteur" column.

    Returns:
        Tuple ``(score_train, score_test, rmse)``: ``SVR.score`` (R²) on the
        training and test parts, and the root mean squared error on the test
        part. The fitted model is also stored in the notebook-level dict
        ``compteurs_models3`` under ``name``, and the three metrics are printed.
    """
    counter_rows = df[df["Nom du compteur"] == name].drop(columns=["Nom du compteur"])
    target = counter_rows["Comptage horaire"]
    features = counter_rows.drop(columns=["Comptage horaire"])

    # The encoder is fitted on all rows of this counter, before the split.
    encoded = OneHotEncoder().fit_transform(features)
    X_train, X_test, y_train, y_test = train_test_split(
        encoded, target, test_size=0.2, random_state=42
    )

    print(name)
    clf = SVR(cache_size=300, kernel='poly', gamma=0.5, C=1, degree=3)
    clf.fit(X_train, y_train)
    compteurs_models3[name] = clf

    score_train = clf.score(X_train, y_train)
    score_test = clf.score(X_test, y_test)
    # score() already predicts on X_test internally; the explicit predict call
    # is still needed to obtain the values for the RMSE.
    test_predictions = clf.predict(X_test)
    rmse = root_mean_squared_error(y_test, test_predictions)

    print(f"   Score on train set: {score_train}")
    print(f"   Score on test set: {score_test}")
    print(f"   RMSE on test set: {rmse}")
    print()
    return score_train, score_test, rmse

In [13]:
# Fit one SVR per counter in the dataset and accumulate its metrics. Despite the
# `avg_` prefix these variables hold running sums; they only become averages
# once divided by the number of counters.
avg_train = avg_test = avg_rmse = 0
for name in all_compteurs:
    train, test, rmse = svr_compteur(name)
    avg_train, avg_test, avg_rmse = avg_train + train, avg_test + test, avg_rmse + rmse


106 avenue Denfert Rochereau NE-SO
   Score on train set: 0.38843868049434793
   Score on test set: 0.305332989989491
   RMSE on test set: 86.56660271832696

Quai d'Orsay O-E
   Score on train set: 0.8508632903004018
   Score on test set: 0.7307539173319146
   RMSE on test set: 72.1786202907005

Totem Cours la Reine O-E
   Score on train set: 0.8140205452585797
   Score on test set: 0.6497508054154302
   RMSE on test set: 70.82494343217911

132 rue Lecourbe NE-SO
   Score on train set: 0.5769211492454969
   Score on test set: 0.47658368042150234
   RMSE on test set: 42.46046731362929

Totem 64 Rue de Rivoli O-E
   Score on train set: 0.9022342532121537
   Score on test set: 0.8817130671639145
   RMSE on test set: 71.2371414583742

Totem 73 boulevard de Sébastopol S-N
   Score on train set: 0.9242664538521804
   Score on test set: 0.9108417353361946
   RMSE on test set: 78.77334247573907

Quai d'Orsay E-O
   Score on train set: 0.8407097870129467
   Score on test set: 0.8037077533867082

In [14]:
# Mean train score, mean test score and mean test RMSE over all counters,
# printed in that order.
n_compteurs = len(all_compteurs)
for metric_total in (avg_train, avg_test, avg_rmse):
    print(metric_total / n_compteurs)

0.9079092316286067
0.8659536559975083
26.587977034154402
