# PRE-PROCESSING et MACHINE LEARNING

## Importer les librairies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data=pd.read_csv("data_clean.csv", index_col=0)
data.head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,test_note_JVC,avis_count_JVC,avis_note_JVC,Classification_Age_JVC,Support_JVC
0,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,14.0,687.0,13.8,7.0,DVD
1,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,19.0,3.0,19.0,3.0,eShop Console Virtuelle Cartouche
2,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,16.0,3023.0,17.4,3.0,DVD
3,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,15.0,581.0,16.5,3.0,DVD
6,New Super Mario Bros.,DS,2006,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01,17.0,1370.0,17.4,3.0,Cartouche


In [3]:
def extraire_premier_mot(chaine):
    mots = chaine.split()
    return mots[0]

# Appliquer la fonction à chaque élément de la colonne 'chaine' en utilisant la méthode apply()
data['Support_JVC']=data['Support_JVC'].apply(extraire_premier_mot)

In [4]:
df1 = data.copy()
pd.set_option('display.max_row',111) # permet d'afficher les infos en entier sur les lignes
pd.set_option('display.max_column',111) # permet d'afficher les infos en entier sur les colonnes

## regression linéaire

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X=data.drop('Global_Sales', axis=1)
target=data['Global_Sales']

X_train, X_test, y_train, y_test = train_test_split(data, target,  test_size=0.2, random_state=42)


In [None]:
# Sélectionne toutes les colonnes numériques
X_train_num = X_train.select_dtypes(include=['int64', 'float64'])
X_test_num = X_test.select_dtypes(include=['int64', 'float64'])

# Sélectionne toutes les colonnes catégorielles
X_train_cat = X_train.select_dtypes(include=['object'])
X_test_cat = X_test.select_dtypes(include=['object'])

In [8]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

cat=['Name', 'Platform', 'Genre', 'Publisher', 'Support_JVC']

X_train.loc[:,cat]=ohe.fit_transform(X_train[cat])
X_test.loc[:,cat]=ohe.transform(X_test[cat])

IndexError: tuple index out of range

In [None]:
model=LinearRegression()
model.fit(X_train, y_train)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder()

oe.fit(X_train_cat)
oe.fit_transform(X_test_cat)


In [None]:
X_train=np.concatenate((X_train_num, X_train_cat))
X_test=np.concatenate((X_test_num, X_test_cat))

## Modellisation

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC # utile sur dataset de 1000 à 2000 données
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA

In [None]:
model=DecisionTreeClassifier(random_state=0)

model_1 = RandomForestClassifier(random_state=0)


In [None]:
model_2 = make_pipeline(PolynomialFeatures(2), SelectKBest(f_classif, k=10),
                      RandomForestClassifier(random_state=0))

preprocessor = make_pipeline(PolynomialFeatures(2, include_bias=False), SelectKBest(f_classif, k=10))

In [None]:
RandomForest = make_pipeline(preprocessor, RandomForestClassifier(random_state=0))
AdaBoost= make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
SVM = make_pipeline(preprocessor,StandardScaler(), SVC(random_state=0))
KNN = make_pipeline(preprocessor, StandardScaler(), KNeighborsClassifier())

In [None]:
dict_of_models={'RandomForest':RandomForest, 'AdaBoost':AdaBoost, 'SVM':SVM, 'KNN':KNN}

## Procédure d'évaluation

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

In [None]:
def evaluation(model):
    
    model.fit(X_train, y_train)
    ypred = model.predict(X_test)
    
    print(confusion_matrix(y_test, ypred))
    print(classification_report(y_test, ypred))
    
    N, train_score, val_score = learning_curve(model, X_train, y_train,
                                              cv=4, scoring='f1',
                                               train_sizes=np.linspace(0.1, 1, 10))
    
    
    plt.figure(figsize=(12, 8))
    plt.plot(N, train_score.mean(axis=1), label='train score')
    plt.plot(N, val_score.mean(axis=1), label='validation score')
    plt.legend()

In [None]:
for name,model in dict_of_models.items():
    print(name)
    evaluation(model)