# Projet ML

In [2]:
import pandas as pd
import numpy as np
import matplotlib as mp

# Dataset classification

Immunotherapy Dataset Data Set

Link:
https://archive.ics.uci.edu/ml/datasets/Immunotherapy+Dataset

In [3]:
# Load the dataset from the Excel workbook with read_excel().
DATASET_PATH = "Immunotherapy.xlsx"
df = pd.read_excel(DATASET_PATH)

In [4]:
# Rename the attributes so every column uses the same capitalised naming.
# rename() returns a new frame; re-assigning it avoids inplace=True, which
# mutates the object across cells and cannot be chained.
df = df.rename(
    columns={
        "sex": "Sex",
        "age": "Age",
        "Number_of_Warts": "Nbr_Warts",
        "induration_diameter": "Induration_diameter",
        "Result_of_Treatment": "Result",
    }
)

In [5]:
# Display the DataFrame after the columns were renamed.
df

Unnamed: 0,Sex,Age,Time,Nbr_Warts,Type,Area,Induration_diameter,Result
0,1,22,2.25,14,3,51,50,1
1,1,15,3.00,2,3,900,70,1
2,1,16,10.50,2,1,100,25,1
3,1,27,4.50,9,3,80,30,1
4,1,20,8.00,6,1,45,8,1
...,...,...,...,...,...,...,...,...
85,1,40,5.50,8,3,69,5,1
86,1,38,7.50,8,2,56,45,1
87,1,46,11.50,4,1,91,25,0
88,1,32,12.00,9,1,43,50,0


# I- Preprocessing


# I.1- Missing values 

La description du dataset de classification mentionne qu'il n'y a pas de valeurs manquantes (missing values).
Il existe néanmoins une commande qui permet de vérifier s'il y en a ou non :

df.isna().sum()

Ici, df représente notre dataset de classification.

In [6]:
# Count the missing values per column. Every count in the output is zero,
# therefore the dataset has no missing values.
df.isna().sum()

Sex                    0
Age                    0
Time                   0
Nbr_Warts              0
Type                   0
Area                   0
Induration_diameter    0
Result                 0
dtype: int64

In [7]:
# Inspect the dtype of every column.
df.dtypes

Sex                      int64
Age                      int64
Time                   float64
Nbr_Warts                int64
Type                     int64
Area                     int64
Induration_diameter      int64
Result                   int64
dtype: object

# I.2- Removing Categorical values 

Les attributs 'Sex' et 'Type' sont catégoriels (ils sont stockés sous forme de codes entiers) ; il faut les transformer en variables indicatrices numériques (dummies) :


In [8]:
# List the column names before encoding the categorical attributes.
df.columns

Index(['Sex', 'Age', 'Time', 'Nbr_Warts', 'Type', 'Area',
       'Induration_diameter', 'Result'],
      dtype='object')

In [9]:
# One-hot encode the categorical attributes with the pandas dummies method.
# dtype=int keeps the indicator columns as 0/1 integers on every pandas
# version (pandas >= 2.0 returns booleans by default).
dummies_Type = pd.get_dummies(df["Type"], prefix="Type", dtype=int)
dummies_Sex = pd.get_dummies(df["Sex"], prefix="Sex", dtype=int)

In [10]:
# Preview the first five rows of the Type indicator columns.
print(dummies_Type.head(5))

   Type_1  Type_2  Type_3
0       0       0       1
1       0       0       1
2       1       0       0
3       0       0       1
4       1       0       0


In [11]:
# Give the Sex indicator columns readable names.
# NOTE(review): assumes code 1 = male and code 2 = female — confirm against
# the dataset description.
# The renamed frame is re-assigned instead of using inplace=True.
dummies_Sex = dummies_Sex.rename(
    columns={
        "Sex_1": "Sex_M",
        "Sex_2": "Sex_F",
    }
)

In [12]:
# Preview the first five rows of the renamed Sex indicator columns.
print(dummies_Sex.head(5))

   Sex_M  Sex_F
0      1      0
1      1      0
2      1      0
3      1      0
4      1      0


In [13]:
# Remove the original 'Sex' column; its information now lives in dummies_Sex.
# The axis is selected with the `columns` keyword because passing it
# positionally (df.drop('Sex', 1)) raises TypeError on pandas >= 2.0.
df = df.drop(columns="Sex")

In [14]:
# Check the first five rows after dropping 'Sex'.
df.head(5)

Unnamed: 0,Age,Time,Nbr_Warts,Type,Area,Induration_diameter,Result
0,22,2.25,14,3,51,50,1
1,15,3.0,2,3,900,70,1
2,16,10.5,2,1,100,25,1
3,27,4.5,9,3,80,30,1
4,20,8.0,6,1,45,8,1


In [15]:
# Remove the original 'Type' column; its information now lives in dummies_Type.
# The axis is selected with the `columns` keyword because passing it
# positionally (df.drop('Type', 1)) raises TypeError on pandas >= 2.0.
df = df.drop(columns="Type")

In [16]:
# Check the first five rows after dropping 'Type'.
df.head(5)

Unnamed: 0,Age,Time,Nbr_Warts,Area,Induration_diameter,Result
0,22,2.25,14,51,50,1
1,15,3.0,2,900,70,1
2,16,10.5,2,100,25,1
3,27,4.5,9,80,30,1
4,20,8.0,6,45,8,1


In [17]:
# Reassemble the frame column-wise: Sex indicators, the first numeric
# attributes, Type indicators, then the remaining attributes ending with Result.
df = pd.concat(
    [
        dummies_Sex,
        df[["Age", "Time", "Nbr_Warts"]],
        dummies_Type,
        df[["Area", "Induration_diameter", "Result"]],
    ],
    axis=1,
)

In [18]:
# Display the frame with the indicator columns merged in.
df

Unnamed: 0,Sex_M,Sex_F,Age,Time,Nbr_Warts,Type_1,Type_2,Type_3,Area,Induration_diameter,Result
0,1,0,22,2.25,14,0,0,1,51,50,1
1,1,0,15,3.00,2,0,0,1,900,70,1
2,1,0,16,10.50,2,1,0,0,100,25,1
3,1,0,27,4.50,9,0,0,1,80,30,1
4,1,0,20,8.00,6,1,0,0,45,8,1
...,...,...,...,...,...,...,...,...,...,...,...
85,1,0,40,5.50,8,0,0,1,69,5,1
86,1,0,38,7.50,8,0,1,0,56,45,1
87,1,0,46,11.50,4,1,0,0,91,25,0
88,1,0,32,12.00,9,1,0,0,43,50,0


# I.3- Detecting Outliers


In [19]:
# Detect outliers with the z-score method.
def detect_outliers(data, threshold=3):
    """Return the values of ``data`` whose absolute z-score exceeds ``threshold``.

    The z-score of a value is ``(value - mean) / std`` where ``mean`` and
    ``std`` are computed over the whole of ``data``.

    Parameters
    ----------
    data : sequence, numpy array or pandas Series of numbers
        The observations to inspect.
    threshold : float, default 3
        A value is reported when ``abs(z_score) > threshold``.

    Returns
    -------
    list
        The outlying values in their original order. The list is empty when
        ``data`` is empty or when its standard deviation is zero or undefined,
        since no z-score can be computed in those cases.
    """
    values = np.asarray(data)
    if values.size == 0:
        return []

    mean = np.mean(data)
    std = np.std(data)  # numpy default (ddof=0): population standard deviation
    if not np.isfinite(std) or std == 0:
        # All values identical (or std undefined): nothing can be an outlier,
        # and dividing would only produce NaN plus a runtime warning.
        return []

    z_scores = (values - mean) / std
    return values[np.abs(z_scores) > threshold].tolist()

In [20]:
# Z-score outliers of the Nbr_Warts attribute.
detect_outliers(df['Nbr_Warts'])

[19]

In [21]:
# Z-score outliers of the Area attribute.
detect_outliers(df['Area'])

[900, 504, 507]

In [22]:
# Z-score outliers of the Induration_diameter attribute.
detect_outliers(df['Induration_diameter'])

[70, 70, 70]

In [23]:
# The other attributes were tested and contain no outliers.
# Select every row that holds one of the outlier values found for Nbr_Warts,
# Area and Induration_diameter.
# A single boolean mask replaces the chained DataFrame.append calls (append
# was removed in pandas 2.0). The mask keeps the original row order and lists
# each row once, so no drop_duplicates / sort_index step is required.
outlier_mask = (
    (df["Nbr_Warts"] == 19)
    | df["Area"].isin([900, 504, 507])
    | (df["Induration_diameter"] == 70)
)
df_outliers = df[outlier_mask]
df_outliers

Unnamed: 0,Sex_M,Sex_F,Age,Time,Nbr_Warts,Type_1,Type_2,Type_3,Area,Induration_diameter,Result
1,1,0,15,3.0,2,0,0,1,900,70,1
18,0,1,15,6.5,19,1,0,0,56,7,1
31,1,0,23,3.0,2,0,0,1,87,70,1
37,1,0,29,8.75,3,1,0,0,504,2,1
61,0,1,19,2.25,8,0,1,0,42,70,1
78,1,0,43,11.0,7,1,0,0,507,7,1


In [24]:
# Remove the outlier rows from df.
# The labels come from df_outliers instead of a hand-copied list, so the two
# cells cannot drift apart; reset_index(drop=True) renumbers the rows from 0.
# NOTE: run this cell once per kernel session — after the index is reset the
# labels stored in df_outliers no longer refer to the same rows.
df = df.drop(index=df_outliers.index).reset_index(drop=True)
df

Unnamed: 0,Sex_M,Sex_F,Age,Time,Nbr_Warts,Type_1,Type_2,Type_3,Area,Induration_diameter,Result
0,1,0,22,2.25,14,0,0,1,51,50,1
1,1,0,16,10.50,2,1,0,0,100,25,1
2,1,0,27,4.50,9,0,0,1,80,30,1
3,1,0,20,8.00,6,1,0,0,45,8,1
4,1,0,15,5.00,3,0,0,1,84,7,1
...,...,...,...,...,...,...,...,...,...,...,...
79,1,0,40,5.50,8,0,0,1,69,5,1
80,1,0,38,7.50,8,0,1,0,56,45,1
81,1,0,46,11.50,4,1,0,0,91,25,0
82,1,0,32,12.00,9,1,0,0,43,50,0


# I.4- Feature scaling (normalisation or mean normalisation) la mise à l'échelle


In [25]:
from sklearn.preprocessing import MinMaxScaler  # rescales values into the [0, 1] range

# Only the continuous attributes are rescaled. The indicator columns and the
# Result label already hold 0/1, and sending them through the scaler would
# silently turn a constant all-ones column into zeros.
continuous_cols = ["Age", "Time", "Nbr_Warts", "Area", "Induration_diameter"]
binary_cols = ["Sex_M", "Sex_F", "Type_1", "Type_2", "Type_3", "Result"]

S = MinMaxScaler()
dfScale = df.copy()
dfScale[continuous_cols] = S.fit_transform(df[continuous_cols])
# Keep the 0/1 columns as plain integers, as before.
dfScale = dfScale.astype({col: int for col in binary_cols})
df = dfScale
df

Unnamed: 0,Sex_M,Sex_F,Age,Time,Nbr_Warts,Type_1,Type_2,Type_3,Area,Induration_diameter,Result
0,1,0,0.170732,0.113636,0.764706,0,0,1,0.090726,1.000000,1
1,1,0,0.024390,0.863636,0.058824,1,0,0,0.189516,0.479167,1
2,1,0,0.292683,0.318182,0.470588,0,0,1,0.149194,0.583333,1
3,1,0,0.121951,0.636364,0.294118,1,0,0,0.078629,0.125000,1
4,1,0,0.000000,0.363636,0.117647,0,0,1,0.157258,0.104167,1
...,...,...,...,...,...,...,...,...,...,...,...
79,1,0,0.609756,0.409091,0.411765,0,0,1,0.127016,0.062500,1
80,1,0,0.560976,0.590909,0.411765,0,1,0,0.100806,0.895833,1
81,1,0,0.756098,0.954545,0.176471,1,0,0,0.171371,0.479167,0
82,1,0,0.414634,1.000000,0.470588,1,0,0,0.074597,1.000000,0
