In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from tqdm.notebook import *

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and estimator
# warnings from the libraries imported above -- consider narrowing it.
warnings.filterwarnings("ignore")
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [15]:
# Load the raw dataset: semicolon-delimited CSV encoded as ISO-8859-1.
# NOTE(review): df.info() reports columns such as conso_urb / conso_exurb /
# conso_mixte / puiss_max as `object`; this may indicate comma decimal
# separators in the file -- confirm and, if so, consider `decimal=","`.
df = pd.read_csv(
    "mars-2014-complete.csv",
    sep=";",
    encoding="ISO-8859-1",
)


In [16]:
# Report the DataFrame size as a (row count, column count) tuple.
print("Dimensions (lignes, colonnes) :", (len(df.index), len(df.columns)))

Dimensions (lignes, colonnes) : (55044, 30)


In [17]:
# Display configuration (the three options are independent of one another).
pd.set_option("display.width", 1000)            # wide console width to avoid horizontal wrapping
pd.set_option("display.max_columns", None)      # no cap on the number of columns shown
pd.set_option("display.max_info_columns", 100)  # raise the per-column limit used by df.info()

# Per-column summary: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55044 entries, 0 to 55043
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   lib_mrq            55044 non-null  object 
 1   lib_mod_doss       55044 non-null  object 
 2   lib_mod            55044 non-null  object 
 3   dscom              55044 non-null  object 
 4   cnit               55044 non-null  object 
 5   tvv                55044 non-null  object 
 6   cod_cbr            55044 non-null  object 
 7   hybride            55044 non-null  object 
 8   puiss_admin_98     55044 non-null  int64  
 9   puiss_max          55044 non-null  object 
 10  typ_boite_nb_rapp  55044 non-null  object 
 11  conso_urb          55001 non-null  object 
 12  conso_exurb        55001 non-null  object 
 13  conso_mixte        55010 non-null  object 
 14  co2                55010 non-null  float64
 15  co_typ_1           54886 non-null  object 
 16  hc                 977

In [36]:
# Per-column overview: dtype, number of missing values, and the share of
# missing values in percent. The percentage column is derived from the
# count column so the null scan is only performed once.
résumé = pd.DataFrame(
    {
        "Type": df.dtypes,
        "Valeurs Manquantes": df.isna().sum(),
    }
).assign(
    **{
        "Pourcentage Manquant (%)": lambda frame: (
            frame["Valeurs Manquantes"] / len(df)
        ) * 100
    }
)
print(résumé)

                      Type  Valeurs Manquantes  Pourcentage Manquant (%)
lib_mrq             object                   0                  0.000000
lib_mod_doss        object                   0                  0.000000
lib_mod             object                   0                  0.000000
dscom               object                   0                  0.000000
cnit                object                   0                  0.000000
tvv                 object                   0                  0.000000
cod_cbr             object                   0                  0.000000
hybride             object                   0                  0.000000
puiss_admin_98       int64                   0                  0.000000
puiss_max           object                   0                  0.000000
typ_boite_nb_rapp   object                   0                  0.000000
conso_urb           object                  43                  0.078119
conso_exurb         object                  43     

In [40]:
# Percentage of missing entries in the `gamme` column, followed by its dtype.
print(df["gamme"].isna().sum() / df.shape[0] * 100)
print(df["gamme"].dtype)

0.0
object


In [47]:
# Keep only the object-typed columns ...
colonnes_object = df.select_dtypes(include=["object"])
# ... and count the distinct non-null values in each of them.
categories_uniques = colonnes_object.nunique(axis=0, dropna=True)
print(categories_uniques)


lib_mrq                 46
lib_mod_doss           483
lib_mod                434
dscom                 3837
cnit                 54982
tvv                  35430
cod_cbr                 13
hybride                  2
puiss_max              232
typ_boite_nb_rapp       19
conso_urb              198
conso_exurb             89
conso_mixte            137
co_typ_1               606
hc                      74
nox                    222
hcnox                  209
ptcl                     9
champ_v9                35
date_maj                 5
Carrosserie             11
gamme                    7
dtype: int64


In [46]:
# Subset of the columns whose dtype is `object` or `category`.
colonnes_cat = df.select_dtypes(include=["object", "category"])

# For each of those columns, print a header line, the array of distinct
# values, and a trailing blank line (iterating a DataFrame yields its labels).
for col in colonnes_cat:
    print(f"Valeurs uniques pour {col}:", df[col].unique(), "", sep="\n")



Valeurs uniques pour lib_mrq:
['ALFA-ROMEO' 'ASTON MARTIN' 'AUDI' 'BENTLEY' 'BMW' 'CADILLAC' 'CHEVROLET'
 'CITROEN' 'DACIA' 'FERRARI' 'FIAT' 'FORD' 'HONDA' 'HYUNDAI' 'INFINITI'
 'JAGUAR' 'JEEP' 'KIA' 'LADA' 'LAMBORGHINI' 'LANCIA' 'LAND ROVER' 'LEXUS'
 'LOTUS' 'MASERATI' 'MAZDA' 'MERCEDES' 'MIA' 'MINI' 'MITSUBISHI' 'NISSAN'
 'OPEL' 'PEUGEOT' 'PORSCHE' 'RENAULT' 'ROLLS-ROYCE' 'SEAT' 'SKODA' 'SMART'
 'SSANGYONG' 'SUBARU' 'SUZUKI' 'TESLA' 'TOYOTA' 'VOLKSWAGEN' 'VOLVO']

Valeurs uniques pour lib_mod_doss:
['159' '4C' 'AR8C SPIDER' 'BRERA' 'GIULIETTA' 'MITO' 'SPIDER' 'CYGNET'
 'DB9' 'DB9 VOLANTE' 'ONE-77' 'RAPIDE S' 'V12 VANTAGE'
 'V12 VANTAGE ROADSTER' 'V12 ZAGATO' 'V8 VANTAGE' 'V8 VANTAGE ROADSTER'
 'VANQUISH' 'VANTAGE' 'A1' 'A1 QUATTRO' 'A1 SPORTBACK' 'A3' 'A3 CABRIOLET'
 'A3 LIMOUSINE' 'A3 SPORTBACK' 'A4 ALLROAD QUATTRO' 'A4 AVANT'
 'A4 LIMOUSINE' 'A5 CABRIOLET' 'A5 COUPE' 'A5 SPORTBACK'
 'A6 ALLROAD QUATTRO' 'A6 AVANT' 'A6 LIMOUSINE' 'A6 LIMOUSINE HYBRID'
 'A7 SPORTBACK' 'A8' 'A8L' 'AUD