# Import Python Librairies

In [16]:
import os
import pickle
import joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import expon

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

#Regression Models
from sklearn.linear_model import LinearRegression, Lasso, Ridge, RANSACRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

#Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier



# Import Data

In [3]:
def open_file(data_filename):
    current_directory = !pwd
    filepath = os.path.join(current_directory[0],data_filename)
    with open (filepath, "rb") as f:
        data = pickle.load(f)
    return data

In [4]:
X_test = open_file("data/X_test.pkl")
X_train = open_file("data/X_train.pkl")
y_test = open_file("data/y_test.pkl")
y_train = open_file("data/y_train.pkl")

In [5]:
print(X_test.shape)
print(X_train.shape)
print(y_test.shape)
print(y_train.shape)

(3794, 4086)
(15173, 4086)
(3794,)
(15173,)


# MACHINE LEARNING

## Regression

### Quick ML Regression

In [6]:
#Metrics = MSE

In [7]:
def reg_score(model):
    cv_score = cross_val_score(estimator=model,
                               X=X_train,
                               y=y_train,
                               cv=5,
                               scoring='neg_mean_squared_error')
    
    pipe = Pipeline(steps=[("estimator", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    
    print(f"Model used = {model}")
    print(f"Validation_score (mean, cv=5): {-1*cv_score.mean()}")
    print(f"Test_score : {mse}")
    print("-"*100)
    
    return cv_score, mse
    

In [8]:
model_0 = LinearRegression()
model_1 = Ridge(alpha=1)
model_2 = Ridge(alpha=10)
model_3 = SVR()
model_4 = KNeighborsRegressor(n_neighbors=5)
model_5 = RandomForestRegressor()
model_6 = lgb.LGBMRegressor(silent=True)


In [9]:
models = [model_0, model_1, model_2, model_3, model_4, model_5, model_6]
models_score = list()

for model in models:
    models_score.append(reg_score(model))

Model used = LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
Validation_score (mean, cv=5): 5.360465458871664
Test_score : 5.164022394969747
----------------------------------------------------------------------------------------------------
Model used = Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)
Validation_score (mean, cv=5): 4.722982062831336
Test_score : 4.7149205558509335
----------------------------------------------------------------------------------------------------
Model used = Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)
Validation_score (mean, cv=5): 4.953022065232442
Test_score : 4.823815310786237
----------------------------------------------------------------------------------------------------
Model used = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, 

KeyboardInterrupt: 

### Fine Tunning

In [10]:
def optimization_param_rand(models, param_distributions):

    for model in models:
        pipe = Pipeline(steps=[("estimator", model)])
        grid = RandomizedSearchCV(pipe, 
                                  param_distributions, 
                                  n_iter=20,
                                  cv=5, 
                                  scoring='neg_mean_squared_error',
                                  random_state=42)
        grid.fit(X_train, y_train)
        best_score = grid.best_score_
        best_params = grid.best_params_
        
        print(f"Model : {model}")
        print(best_score)
        print(best_params)
        print("-"*100)
        
        return -1*best_score, best_params

In [11]:
def optimization_param(models, param_grid):

    for model in models:
        pipe = Pipeline(steps=[("estimator", model)])
        grid = GridSearchCV(estimator=pipe, 
                            param_grid=param_grid, 
                            cv=5,
                            scoring='neg_mean_squared_error')
        
        grid.fit(X_train, y_train)
        best_score = grid.best_score_
        best_params = grid.best_params_
        
        print(f"Model : {model}")
        print(best_score)
        print(best_params)
        print("-"*100)
        
        return -1*best_score, best_params

In [12]:
param_distributions_svr = {"estimator__C": expon(scale=100),
                           "estimator__epsilon": expon(scale=0.1)}

# 35min pour runner cellule
svr_optim_score, svr_optim_params = optimization_param_rand(models=[model_3],
                                                            param_distributions=param_distributions_svr)

Model : SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
-4.269999902733349
{'estimator__C': 2.0799307999138623, 'estimator__epsilon': 0.3503557475158312}
----------------------------------------------------------------------------------------------------


In [13]:
param_grid_svr = [{"estimator__C": [1.75, 2.0, 2.25],
                   "estimator__epsilon": [0.30, 0.35, 0.40]}]

svr_optim_score_2, svr_optim_params_2 = optimization_param([model_3], 
                                                           param_grid_svr)




Model : SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
-4.265480873003834
{'estimator__C': 2.25, 'estimator__epsilon': 0.4}
----------------------------------------------------------------------------------------------------


In [14]:
model = SVR(C=1.0, 
            cache_size=200, 
            coef0=0.0, 
            degree=3, 
            epsilon=0.1, 
            gamma='scale',
            kernel='rbf', 
            max_iter=-1, 
            shrinking=True, 
            tol=0.001, 
            verbose=False)

In [21]:
model.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

### Save Model (SVR)

In [22]:
with open("models/svr.jbl", "wb") as f:
    joblib.dump(model,f)

# Prediction Test / Draft

In [18]:
def good_data(features, list_values):
	df = pd.DataFrame(columns=features)
	df.loc[0] = list_values
	return df

In [24]:
with open("test_file.pkl", "rb") as f:
    Test = pickle.load(f) 

In [26]:
model.predict(Test)[0]

90.02294114639355