In [28]:
# Importation des bibliothèques Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Importation des données
data = pd.read_csv('data/Social_Network_Ads.csv')
# Dimension du dataset
data.shape

(400, 5)

In [29]:
# Preview du dataset
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [30]:
# Sommaire du dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [31]:
# Recherche des variables catégorielles du dataset
categorical = [var for var in data.columns if data[var].dtype=='O']
print('Il y a {} variable(s) catégorielles.'.format(len(categorical)))
print('Les variables catégorielles sont : {}'.format(categorical))

Il y a 1 variable(s) catégorielles.
Les variables catégorielles sont : ['Gender']


In [32]:
# Affichage des variables catégorielles
data[categorical].head

<bound method NDFrame.head of      Gender
0      Male
1      Male
2    Female
3    Female
4      Male
..      ...
395  Female
396    Male
397  Female
398    Male
399  Female

[400 rows x 1 columns]>

In [33]:
# vérification des valeurs manquantes dans les variables catégorielles
data[categorical].isnull().sum()

Gender    0
dtype: int64

In [34]:
# affichage du nombre de fréquences des valeurs dans les variables catégorielles
for var in categorical:    
    print(data[var].value_counts())

Female    204
Male      196
Name: Gender, dtype: int64


In [35]:
# affichage de la distribution de fréquence des variables catégorielles
for var in categorical:    
    print(data[var].value_counts()/np.float(len(data)))

Female    0.51
Male      0.49
Name: Gender, dtype: float64


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  print(data[var].value_counts()/np.float(len(data)))


In [36]:
# Recherche des variables numériques
numerical = [var for var in data.columns if data[var].dtype!='O']
print('Il y a {} variables numériques.'.format(len(numerical)))
print('Les variables numériques sont : {}'.format(numerical))


Il y a 4 variables numériques.
Les variables numériques sont : ['User ID', 'Age', 'EstimatedSalary', 'Purchased']


In [37]:
# affichage des variables numériques
data[numerical].head()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
0,15624510,19,19000,0
1,15810944,35,20000,0
2,15668575,26,43000,0
3,15603246,27,57000,0
4,15804002,19,76000,0


In [38]:
# vérification des variables numériques pour voir si certaines sont nulles
data[numerical].isnull().sum()

User ID            0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [39]:
# division des données 
X = data.drop(["Purchased"], axis = 1)
y = data["Purchased"]

In [40]:
# Séparation de X et y en training et testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [41]:
# check the shape of X_train and X_test
X_train.shape, X_test.shape

((280, 4), (120, 4))

In [42]:
# check data types in X_train
X_train.dtypes

User ID             int64
Gender             object
Age                 int64
EstimatedSalary     int64
dtype: object

In [43]:
# Affichage des variables catégorielles
categorical = [col for col in X_train.columns if X_train[col].dtypes == 'O']
categorical

['Gender']

In [44]:
# Affichage des variables numériques
numerical = [col for col in X_train.columns if X_train[col].dtypes != 'O']
numerical

['User ID', 'Age', 'EstimatedSalary']

In [45]:
# affichage du pourcentage de valeurs manquantes dans les variables catégorielles de l'ensemble d'apprentissage
X_train[categorical].isnull().mean()

Gender    0.0
dtype: float64

In [46]:
# affichage des variables catégorielles avec des données manquantes
for col in categorical:
    if X_train[col].isnull().mean()>0:
        print(col, (X_train[col].isnull().mean()))
    else:
        print("{} ne contient pas de valeur nulle.".format(col))

Gender ne contient pas de valeur nulle.


In [47]:
# vérification des valeurs manquantes dans les variables catégorielles dans X_train
X_train[categorical].isnull().sum()

Gender    0
dtype: int64

In [48]:
# vérification des valeurs manquantes dans les variables catégorielles dans X_test
X_test[categorical].isnull().sum()

Gender    0
dtype: int64

In [49]:
# vérification d'absence de données dans X_train
X_train.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
dtype: int64

In [50]:
# vérification d'absence de données dans X_test
X_test.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
dtype: int64

In [51]:
# Affichage des variables catégorielles
categorical

['Gender']

In [52]:
X_train[categorical].head()

Unnamed: 0,Gender
92,Male
223,Male
234,Female
232,Male
377,Female


In [57]:
# Encodage des variables
import category_encoders as ce

encoder = ce.OneHotEncoder(cols=['User ID', 'Gender', 'Age', 'EstimatedSalary'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [58]:
X_train.head()

Unnamed: 0,User ID_1,User ID_2,User ID_3,User ID_4,User ID_5,User ID_6,User ID_7,User ID_8,User ID_9,User ID_10,...,EstimatedSalary_100,EstimatedSalary_101,EstimatedSalary_102,EstimatedSalary_103,EstimatedSalary_104,EstimatedSalary_105,EstimatedSalary_106,EstimatedSalary_107,EstimatedSalary_108,EstimatedSalary_109
92,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
234,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
232,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
377,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
X_train.shape


(280, 434)

In [60]:
X_test.head()

Unnamed: 0,User ID_1,User ID_2,User ID_3,User ID_4,User ID_5,User ID_6,User ID_7,User ID_8,User ID_9,User ID_10,...,EstimatedSalary_100,EstimatedSalary_101,EstimatedSalary_102,EstimatedSalary_103,EstimatedSalary_104,EstimatedSalary_105,EstimatedSalary_106,EstimatedSalary_107,EstimatedSalary_108,EstimatedSalary_109
132,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
309,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
341,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [61]:
X_test.shape

(120, 434)

In [62]:
cols = X_train.columns

In [63]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)
X_train = pd.DataFrame(X_train, columns=[cols])
X_test = pd.DataFrame(X_test, columns=[cols])
X_train.head()

Unnamed: 0,User ID_1,User ID_2,User ID_3,User ID_4,User ID_5,User ID_6,User ID_7,User ID_8,User ID_9,User ID_10,...,EstimatedSalary_100,EstimatedSalary_101,EstimatedSalary_102,EstimatedSalary_103,EstimatedSalary_104,EstimatedSalary_105,EstimatedSalary_106,EstimatedSalary_107,EstimatedSalary_108,EstimatedSalary_109
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
# train a Gaussian Naive Bayes classifier on the training set
from sklearn.naive_bayes import GaussianNB

# instantiate the model
gnb = GaussianNB()

# fit the model
gnb.fit(X_train, y_train)



GaussianNB()