In [None]:
# Importation des bibliothèques

import re
from math import ceil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Class DataAnalysis

class DataAnalysis:
   """Entry point that dispatches exploratory checks on a single DataFrame.

   The constructor keeps a copy of the frame and two dtype-based views of it:
   ``categorical_data`` (``object`` columns) and ``numeric_data`` (``number``
   columns). The plotting / summary methods delegate to ``NumericAnalysis``
   and ``CategoricalAnalysis``, which must be defined in the notebook before
   those methods are called.
   """

   def __init__(self, df) -> None:
      """Copy ``df`` and split its columns by dtype.

      Args:
         df: pandas DataFrame to analyse.
      """
      self.df = df.copy()
      self.categorical_data = self.df.select_dtypes(include='object')
      self.numeric_data = self.df.select_dtypes("number")

   # Numeric variable analysis
   def check_null(self) -> None:
      """Print the total number of missing values, then the per-column counts.

      The per-column breakdown is printed as soon as at least one value is
      missing (the previous ``> 1`` test skipped it for exactly one missing
      value).
      """
      missing_per_column = self.df.isnull().sum()
      total_missing = missing_per_column.sum()
      print(f'Nbre de valeur manquante dans la dataframe :\n {total_missing}')
      if total_missing > 0:
         print(f"Nbre de valeur manquante par colonne de la dataframe :\n {missing_per_column} ")

   def num_boxplot(self) -> None:
      """Draw one boxplot per numeric column via ``NumericAnalysis.boxplot``."""
      NumericAnalysis(self.numeric_data).boxplot()

   def corr_heatmap(self) -> None:
      """Draw the correlation heatmap of the numeric columns via ``NumericAnalysis.corr_matrix``."""
      NumericAnalysis(self.numeric_data).corr_matrix()

   # Categorical variable analysis
   def cat_summary(self) -> None:
      """Print the categorical summary via ``CategoricalAnalysis.summary``."""
      CategoricalAnalysis(self.categorical_data).summary()

   def cat_encode(self) -> pd.DataFrame:
      """Encode the categorical columns and return the encoded frame.

      Returns:
         The DataFrame produced by ``CategoricalAnalysis.encode`` for
         ``self.categorical_data``.
      """
      # The frame is already held by the CategoricalAnalysis instance; passing
      # it again would bind it to encode()'s unrelated ``p`` parameter.
      return CategoricalAnalysis(self.categorical_data).encode()


In [None]:
      
# Class Categorical Analysis

class CategoricalAnalysis:
   """Summary and one-hot encoding helpers for a DataFrame's categorical columns."""

   def __init__(self, df) -> None:
      """Keep a private copy of ``df``.

      Args:
         df: pandas DataFrame to work on.
      """
      self.df = df.copy()

   def summary(self) -> None:
      """Print descriptive statistics and per-column value counts."""
      # Unique-value counts and description of the qualitative variables
      print(self.df.describe(include='object'))

      # Count of each modality, one column at a time
      for cat in self.df:
         print(self.df[cat].value_counts())

   def encode(self, p=0.05) -> pd.DataFrame:
      """One-hot encode every ``object`` column of the stored frame.

      Args:
         p: Unused by this implementation; kept so existing calls that pass
            it keep working.

      Returns:
         A new DataFrame holding the non-``object`` columns in their original
         order, followed by the one-hot columns of each ``object`` column
         (named by ``OneHotEncoder.get_feature_names_out``). The row index of
         the stored frame is preserved.
      """
      categorical_columns = list(self.df.select_dtypes(include='object').columns)
      if not categorical_columns:
         return self.df.copy()

      # Collect the pieces and concatenate once instead of growing the frame
      # inside the loop.
      encoded_parts = [self.df.drop(columns=categorical_columns)]
      for cat in categorical_columns:
         column_values = self.df[cat].to_numpy().reshape(-1, 1)
         one_hot_encoder = OneHotEncoder(sparse_output=False).fit(column_values)
         encoded_parts.append(
            pd.DataFrame(
               one_hot_encoder.transform(column_values),
               columns=one_hot_encoder.get_feature_names_out([cat]),
               # Reuse the source index: a default RangeIndex would be aligned
               # label-by-label by pd.concat and yield NaN-filled extra rows
               # whenever the frame's index is not 0..n-1.
               index=self.df.index,
            )
         )

      return pd.concat(encoded_parts, axis=1)


In [None]:
      
# Class Numerical Analysis

class NumericAnalysis:
   """Plotting helpers for the numeric columns of a DataFrame."""

   def __init__(self, numeric_data) -> None:
      """Keep a private copy of ``numeric_data``.

      Args:
         numeric_data: pandas DataFrame whose columns are plotted as-is.
      """
      self.numeric_data = numeric_data.copy()

   def boxplot(self) -> None:
      """Draw one boxplot per column on a grid that is three plots wide.

      Prints a message and returns without creating a figure when the frame
      has no columns.
      """
      columns = list(self.numeric_data.columns)
      if not columns:
         # A zero-row grid cannot be created, so there is nothing to draw.
         print("Aucune variable numérique à afficher.")
         return

      n_rows = ceil(len(columns) / 3)

      fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5 * n_rows))
      axes = axes.flatten()  # Flatten the grid so it can be walked linearly

      for ax, col in zip(axes, columns):
         sns.boxplot(y=self.numeric_data[col], ax=ax)
         ax.set_title(f"{col}")
         ax.set_ylabel("Valeurs")
         ax.set_xlabel("")  # No label on the X axis

      # Remove the grid cells left over when the column count is not a multiple of 3
      for unused_ax in axes[len(columns):]:
         fig.delaxes(unused_ax)

      plt.tight_layout()
      plt.show()

   def corr_matrix(self) -> None:
      """Draw a seaborn heatmap of the pairwise correlation matrix."""
      sns.heatmap(data=self.numeric_data.corr())

In [None]:
def convert_to_hp(max_torq_series):
    """Derive a horsepower figure from torque strings such as ``"60Nm@ 3500rpm"``.

    Args:
        max_torq_series: pandas Series whose string entries contain
            ``<torque>Nm@ <rpm>rpm``; the torque may be an integer or a decimal.

    Returns:
        The result of applying the conversion to each element: ``torque * rpm /
        7027`` rounded to two decimals, or ``None`` for an element that is not
        a string or does not contain the expected pattern.
    """
    # Accept decimal torque: with a bare ``\d+`` a value like "82.1Nm@ 3400rpm"
    # matched only the trailing "1Nm@ 3400rpm" and yielded a torque of 1.
    torque_pattern = re.compile(r"(\d+(?:\.\d+)?)Nm@\s*(\d+)rpm")

    def calculate_hp(value):
        # Only strings can carry the torque specification
        if not isinstance(value, str):
            return None
        match = torque_pattern.search(value)
        if match is None:
            return None  # Unrecognised format
        torque = float(match.group(1))  # Torque (Nm)
        rpm = int(match.group(2))       # Engine speed (rpm)
        # NOTE(review): divisor kept from the original. The exact Nm*rpm
        # divisors are about 7023.5 (metric hp) and 7120.9 (mechanical hp) -
        # confirm which unit is intended.
        hp = (torque * rpm) / 7027
        return round(hp, 2)

    return max_torq_series.apply(calculate_hp)