In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import joblib

In [2]:
# TRAIN / TEST SPLIT VERSION
# Fit the preprocessing and a linear regression on a 70 % training split,
# then report R2 / MSE / RMSE on the held-out 30 %.

# Load the pricing data; the first CSV column is discarded by the iloc slice.
pricing = pd.read_csv('src/get_around_pricing_project.csv').iloc[:, 1:]

y = pricing["rental_price_per_day"]

bool_columns = [
    'private_parking_available',
    'has_gps',
    'has_air_conditioning',
    'automatic_car',
    'has_getaround_connect',
    'has_speed_regulator',
    'winter_tires',
]

# Features = every column except the target, with the flag columns cast to int.
X = pricing.drop("rental_price_per_day", axis=1).astype(
    {flag: int for flag in bool_columns}
)

numeric_features = ["mileage", "engine_power"]
categorical_features = ["model_key", "fuel", "paint_color", "car_type", *bool_columns]

# Numeric columns are standardised; every other feature (flags included) is
# one-hot encoded with the first level of each column dropped.
scaling_step = ("num", StandardScaler(), numeric_features)
encoding_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", drop="first"),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
# The preprocessor is fitted on the training split only, then applied to the test split.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred_model = model.predict(X_test)
mse_regressor = mean_squared_error(y_test, y_pred_model)

print('Score R2 training set :', model.score(X_train, y_train))
print('Score R2 test set :', model.score(X_test, y_test))
print()
print('MSE test :', mse_regressor)
print('RMSE test :', np.sqrt(mse_regressor))

# RANDOMFOREST (OVERFITTING)
# param_grid = {
#     'n_estimators': [50, 100, 150],
#     'max_depth': [None, 10, 20],
#     'min_samples_leaf': [1, 5, 10],
#     'min_samples_split': [2, 5, 10]
# }
# scorer = make_scorer(r2_score)
# model = GridSearchCV(estimator=RandomForestRegressor(random_state=0), param_grid=param_grid, scoring=scorer, cv=5, verbose=1, n_jobs=-1)
# model.fit(X_train, y_train)
# best_params = model.best_params_
# print('Best Hyperparameters:', best_params)
# best_model = model.best_estimator_
# y_pred_best = best_model.predict(X_test)
# r2_best_model = r2_score(y_test, y_pred_best)
# print('R-squared with Best Model:', r2_best_model)

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [7]:
# GLOBAL VERSION FOR THE SCRIPT
# Fit the preprocessing and the linear regression on the whole dataset (no
# hold-out split) and persist both artefacts with joblib.

# Load the pricing data; the first CSV column is discarded by the iloc slice.
pricing = pd.read_csv('src/get_around_pricing_project.csv').iloc[:, 1:]

y = pricing["rental_price_per_day"]

bool_columns = [
    'private_parking_available',
    'has_gps',
    'has_air_conditioning',
    'automatic_car',
    'has_getaround_connect',
    'has_speed_regulator',
    'winter_tires',
]

# Features = every column except the target, with the flag columns cast to int.
X = pricing.drop("rental_price_per_day", axis=1).astype(
    {flag: int for flag in bool_columns}
)

numeric_features = ["mileage", "engine_power"]
categorical_features = ["model_key", "fuel", "paint_color", "car_type", *bool_columns]

# Numeric columns are standardised; every other feature (flags included) is
# one-hot encoded with the first level of each column dropped.
scaling_step = ("num", StandardScaler(), numeric_features)
encoding_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", drop="first"),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

# Despite its name, X_train holds the transformed version of the full dataset.
X_train = preprocessor.fit_transform(X)

# The fitted preprocessor is written twice: once under src/ and once under fastapi/src/.
for artifact_path in ('src/preprocessor.pkl', 'fastapi/src/preprocessor.pkl'):
    joblib.dump(preprocessor, artifact_path)

model = LinearRegression()
model.fit(X_train, y)

# Same two destinations for the fitted model.
for artifact_path in ('src/model.pkl', 'fastapi/src/model.pkl'):
    joblib.dump(model, artifact_path)

print('Score R2 training set :', model.score(X_train, y))

Score R2 training set : 0.7119525296878819


In [8]:
# Smoke test: push one hand-written car through the pickled preprocessor and model.
# NOTE: the name `list` shadows the Python builtin; it is kept because later
# cells reference this variable under that name.
list = ['Alfa Romeo', 0, 66, 'diesel', 'beige', 'convertible', 1, 1, 1, 1, 1, 1, 1]

# One-row frame whose columns are the dataset's columns minus the target.
feature_columns = pricing.columns.drop("rental_price_per_day")
data = pd.DataFrame([list], columns=feature_columns)

# Reload the artefacts from disk rather than reusing the in-memory objects.
loaded_preprocessor = joblib.load('src/preprocessor.pkl')
loaded_model = joblib.load('src/model.pkl')

data = loaded_preprocessor.transform(data)
prediction = loaded_model.predict(data)

prediction

array([140.31218846])

In [18]:
import requests

# POST the sample feature list to the locally running prediction API and print
# the prediction it returns.
api_url = "http://localhost:4000/predict"
data = {"filters_list": list}  # `list` is the sample feature list built in an earlier cell

# Without a timeout, requests waits indefinitely when the server accepts the
# connection but never answers, which would freeze the notebook cell.
response = requests.post(api_url, json=data, timeout=10)

if response.status_code == 200:
    result = response.json()
    print(result['prediction'])
else:
    print("Erreur lors de la requête à l'API.")
    # Show the status code and body so the failure can actually be diagnosed.
    print(f"HTTP {response.status_code}: {response.text}")

140.31218845857995


In [17]:
# Display the sample feature list (note: this variable name shadows the builtin `list`).
list

['Alfa Romeo', 0, 66, 'diesel', 'beige', 'convertible', 1, 1, 1, 1, 1, 1, 1]