# Projet IA.
## Livrable code


|Auteur|Modification|
|---|---|
|Majorel Pierre|2023/03/13|
|Alami Ouali Othmane|2023/03/13|
|Lopez Daniel|2023/03/13|

### Préparation de l'environnement

Ci-dessous quelques imports et précautions préalables à notre travail.

In [1]:
!pip install numpy
!pip install pandas
!pip install sklearn



In [2]:
# import
import numpy as np
import os

from numpy.random import default_rng
# stabilité du notebook d'une exécution à l'autre
random=default_rng(42) 

# jolies figures directement dans le notebook
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# où sauver les figures
PROJECT_ROOT_DIR = r"." #changez ce chemin en fonction de votre dossier
CHAPTER_ID = "Images_Projet"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID) # le dossier doit exister

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

### Import des données

Il convient d'automatiser l'import des données. On va implémenter une fonction qui se charge de :
* télécharger l'archive
* extraire les fichiers

In [3]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://github.com/Daniel-Lopez65/Projet_IA_Cesi/blob/main/"
DONNEES_PATH = PROJECT_ROOT_DIR
DONNEES_URL = DOWNLOAD_ROOT + "Donnees_triees.tgz" 

def fetch_donnees_data(donnees_url=DONNEES_URL, donnees_path=DONNEES_PATH): 
    if not os.path.isdir(donnees_path): 
        os.makedirs(donnees_path) 
    
    tgz_path = os.path.join(donnees_path, "Donnees_triees.tgz") 
    urllib.request.urlretrieve(donnees_url, tgz_path) 
    donnees_tgz = tarfile.open(tgz_path) 
    donnees_tgz.extractall(path=donnees_path) 
    donnees_tgz.close() 

On peut maintenant importer les données :

In [4]:
fetch_donnees_data()

ReadError: file could not be opened successfully

### Chargement des données en mémoire

In [None]:
###########TOUT A MODIFIER###########
import pandas as pd

def load_donnees_data(donnees_path=DONNEES_PATH):
    csv_path = os.path.join(donnees_path, "donnees_triees.csv")
    return pd.read_csv(csv_path)

donnees = load_donnees_data()
###########TOUT A MODIFIER###########

### Visualization des données

Nous affichons un histogramme par attribut numérique

In [None]:
donnees.hist(bins=50, figsize=(20,15))

### Séparation du jeu de donnée

Ici nous allons couper notre jeu de données en deux 
* 1) Le train set
* 2) Le test set

In [None]:
###########TOUT A MODIFIER###########
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

donnees["income_cat"] = np.ceil(housing["median_income"]/1.5)
donnees["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True) # replace where false

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

housing = strat_train_set.copy()
###########TOUT A MODIFIER###########

### Création de la pipeline

In [None]:
###########TOUT A MODIFIER###########
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num = housing.select_dtypes(include=[np.number]) 
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)
###########TOUT A MODIFIER###########

### Utilisation du modèle X

### Indice de qualité du modèle X

#### Précision et recall

In [None]:
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores) # A changer

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds): # A changer
    plt.plot(thresholds, precisions[:-1], "b-", label="Precision", linewidth=2) # A changer
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2) # A changer
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds) # A changer
plt.xlim([-700000, 700000])
save_fig("precision_recall_vs_threshold_plot")
plt.show()

def plot_precision_vs_recall(precisions, recalls): # A changer
    plt.plot(recalls, precisions, "k-", linewidth=2) # A changer
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls) # A changer
save_fig("precision_vs_recall_plot")
plt.show()

#### Courbe ROC

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores) # A changer

def plot_roc_curve(fpr, tpr, label=None): # A changer
    plt.plot(fpr, tpr, linewidth=2, label=label) # A changer
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr) # A changer
save_fig("roc_curve_plot")
plt.show()

### Cross validation pour tester le modèle X

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels, # A changer
                         scoring="neg_mean_squared_error", cv=10) # A changer
tree_rmse_scores = np.sqrt(-scores) # A changer

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

### Utilisation du modèle Y

### Indice de qualité du modèle Y

#### Précision et recall

In [None]:
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores) # A changer

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds): # A changer
    plt.plot(thresholds, precisions[:-1], "b-", label="Precision", linewidth=2) # A changer
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2) # A changer
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds) # A changer
plt.xlim([-700000, 700000])
save_fig("precision_recall_vs_threshold_plot")
plt.show()

def plot_precision_vs_recall(precisions, recalls): # A changer
    plt.plot(recalls, precisions, "k-", linewidth=2) # A changer
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls) # A changer
save_fig("precision_vs_recall_plot")
plt.show()

#### Courbe ROC

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores) # A changer

def plot_roc_curve(fpr, tpr, label=None): # A changer
    plt.plot(fpr, tpr, linewidth=2, label=label) # A changer
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr) # A changer
save_fig("roc_curve_plot")
plt.show()

### Cross validation pour tester le modèle Y

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels, # A changer
                         scoring="neg_mean_squared_error", cv=10) # A changer
tree_rmse_scores = np.sqrt(-scores) # A changer

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

### Utilisation du modèle N

### Indice de qualité du modèle N

#### Précision et recall

In [None]:
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores) # A changer

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds): # A changer
    plt.plot(thresholds, precisions[:-1], "b-", label="Precision", linewidth=2) # A changer
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2) # A changer
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds) # A changer
plt.xlim([-700000, 700000])
save_fig("precision_recall_vs_threshold_plot")
plt.show()

def plot_precision_vs_recall(precisions, recalls): # A changer
    plt.plot(recalls, precisions, "k-", linewidth=2) # A changer
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls) # A changer
save_fig("precision_vs_recall_plot")
plt.show()

#### Courbe ROC

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores) # A changer

def plot_roc_curve(fpr, tpr, label=None): # A changer
    plt.plot(fpr, tpr, linewidth=2, label=label) # A changer
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr) # A changer
save_fig("roc_curve_plot")
plt.show()

### Cross validation pour tester le modèle N

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels, # A changer
                         scoring="neg_mean_squared_error", cv=10) # A changer
tree_rmse_scores = np.sqrt(-scores) # A changer

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

### Séléction du meilleur modèle

#### Courbe ROC de comparaison

Afin de choisir un modèle, nous allons utiliser les différentes courbes ROC réalisées précédement et les comparer.
Le modèle ayant la meilleure courbe sera celui que nous séléctionnerons pour la suite du projet.

In [None]:
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD") #A modifier
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest") #A modifier
plt.legend(loc="lower right", fontsize=16)
save_fig("roc_curve_comparison_plot")
plt.show()

### Entrainement sur le test set

In [None]:
###########TOUT A MODIFIER###########
final_model = random_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
###########TOUT A MODIFIER###########

### Utilistation du modèle pour les prédictions

In [None]:
###########TOUT A MODIFIER###########
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_rmse = mean_squared_error(y_test, final_predictions, squared = False)
final_rmse
###########TOUT A MODIFIER###########

#### Conclusion