# Projet IA

## Régression logistique

### Chargement des données

In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


DATA_MERGE_PATH = "data"


def DataMerge(data_merge_path=DATA_MERGE_PATH):
    """Load the merged dataset as a DataFrame.

    Parameters
    ----------
    data_merge_path : str, optional
        Directory that contains ``DataMerge.csv``. Defaults to
        ``DATA_MERGE_PATH``.

    Returns
    -------
    pandas.DataFrame
        The content of ``DataMerge.csv`` as parsed by ``pd.read_csv``.
    """
    merged_csv = os.path.join(data_merge_path, "DataMerge.csv")
    frame = pd.read_csv(merged_csv)
    return frame

# Load the merged dataset, then separate the target column from the features.
X = DataMerge()

X_labels = X["Attrition"].copy()
X = X.drop(columns="Attrition")

# Encode Gender as Male=1 / Female=0.
# The result is assigned back to the column instead of calling
# `X.Gender.replace(..., inplace=True)`: an in-place call on the Series returned
# by attribute access is a chained assignment, which pandas warns about and
# which leaves `X` unchanged once copy-on-write is enabled.
# `infer_objects()` gives the replaced column a numeric dtype when every value
# was mapped, so that `select_dtypes(include=[np.number])` picks it up.
X["Gender"] = X["Gender"].replace({"Male": 1, "Female": 0}).infer_objects()
 
# Column groups: every numeric column of X, plus a fixed list of categorical ones.
X_num = X.select_dtypes(include=[np.number])
num_attribs = X_num.columns.tolist()
cat_attribs = [
    "MaritalStatus",
    "JobRole",
    "BusinessTravel",
    "Department",
    "EducationField",
]

# Numeric columns: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)

# Categorical columns are mapped to integer codes. Columns that belong to
# neither group are dropped (ColumnTransformer's default `remainder`).
# NOTE(review): OrdinalEncoder imposes an arbitrary order on these categories;
# if they are nominal, OneHotEncoder (already imported) is usually a better fit
# for a linear model — confirm before changing, it alters the feature matrix.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OrdinalEncoder(), cat_attribs),
    ]
)

# NOTE(review): the transformer is fitted on the whole dataset, before the
# train/test split, so the medians and scaling statistics include the rows
# later used for evaluation — consider fitting on the training rows only.
X_prepared = full_pipeline.fit_transform(X)


# Display setting: show every column when pandas objects are rendered.
pd.set_option("display.max_columns", None)

# train_test_split returns (train, test) pairs, and test_size=0.8 sends 80% of
# the rows to the second element of each pair. X_test_set / Y_test_set therefore
# hold the larger 80% share and X_full_set / Y_full_set the remaining 20%.
X_full_set, X_test_set, Y_full_set, Y_test_set = train_test_split(
    X_prepared,
    X_labels,
    test_size=0.8,
    random_state=42,
)
X_test_set

array([[-0.31108846, -0.66194603,  0.24970918, ...,  2.        ,
         1.        ,  1.        ],
       [ 0.71645169, -0.66194603, -1.56559001, ...,  1.        ,
         2.        ,  2.        ],
       [-0.1813684 ,  0.25240281, -1.56559001, ...,  2.        ,
         0.        ,  3.        ],
       ...,
       [-0.01862868, -1.57629486,  0.24970918, ...,  2.        ,
         1.        ,  5.        ],
       [-1.26787221, -1.57629486,  0.24970918, ...,  2.        ,
         1.        ,  3.        ],
       [ 0.60638618,  1.16675164, -0.65794042, ...,  2.        ,
         1.        ,  1.        ]])

### Préparation de la régression logistique

In [5]:
import math
from sklearn.linear_model import LogisticRegression

# The model is fitted on the *_test_set arrays and predicts on the *_full_set
# arrays. With the test_size=0.8 split that produced them, *_test_set is the
# larger 80% share, so training uses 80% of the rows and the predictions cover
# the other 20%.
clf = LogisticRegression(solver="lbfgs", max_iter=100, random_state=0).fit(
    X_test_set, Y_test_set
)
y_pred = clf.predict(X_full_set)

### Evaluation du modèle

Calcul du score d'exactitude (*accuracy*) de notre régression logistique : proportion de prédictions qui coïncident avec les valeurs connues du jeu de test.

In [6]:
from sklearn.metrics import accuracy_score

# Share of rows whose predicted class equals the known label, as a percentage.
accuracy = accuracy_score(y_true=Y_full_set, y_pred=y_pred)
accuracy_percentage = 100 * accuracy
accuracy_percentage

85.46511627906976

On obtient une exactitude (*accuracy*) de **85.47 %** sur notre jeu de test.

On calcule ensuite la racine de l'erreur quadratique moyenne (RMSE) pour comparer avec les autres algorithmes.

In [7]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error between the known labels and the predicted classes.
# NOTE(review): if the labels are coded 0/1, every squared error is 0 or 1, so
# the MSE equals the misclassification rate and this value is sqrt(1 - accuracy).
lr_mse = mean_squared_error(y_true=Y_full_set, y_pred=y_pred)
lr_rmse = np.sqrt(lr_mse)
lr_rmse

0.3812464258315117

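On obtient une erreur (RMSE) de 0.38, ce qui est plutôt élevé par rapport à d'autres modèles d'entraînement. Pour des étiquettes binaires 0/1, cette valeur vaut √(1 − exactitude) = √(1 − 0.8547) ≈ 0.38 : elle reformule donc le taux d'erreur de classification.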