# Entraînement 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
import json

# Location of the training records: a text file holding one JSON document per line
# (not a CSV file).
file_path = "O:/Desktop/LSI/LSI_S5/BigDATA/projet/out_file.txt"

# Decode every line of the file with json.loads and collect the results in a list.
with open(file_path, 'r') as file:
    data_json = list(map(json.loads, file))

# Assemble the decoded records into the training DataFrame.
data = pd.DataFrame(data_json)


In [2]:
# Preview the first rows of the freshly loaded training frame.
data.head()

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn
0,Cameron Williams,42.0,11066.8,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1
1,Kevin Mueller,41.0,11916.22,0,6.5,11.0,2013-08-13 00:38:46,"6157 Frank Gardens Suite 019 Carloshaven, RI 1...",Wilson PLC,1
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,2016-06-29 06:20:07,"1331 Keith Court Alyssahaven, DE 90114","Miller, Johnson and Wallace",1
3,Phillip White,42.0,8010.76,0,6.71,10.0,2014-04-22 12:43:12,"13120 Daniel Mount Angelabury, WY 30645-4695",Smith Inc,1
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,2016-01-19 15:31:15,"765 Tricia Row Karenshire, MH 71730",Love-Jones,1


In [3]:
from sklearn.preprocessing import LabelEncoder

# Text columns that are replaced by integer codes before scaling and model fitting.
categorical_columns = ['Location', 'Company', 'Names', 'Onboard_date']

# Fit a separate encoder for each column and keep all of them. Re-fitting a
# single shared encoder would overwrite the string -> code mapping of every
# column except the last one, making it impossible to apply the same mapping
# to other data or to decode the integers later (inverse_transform).
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# After the loop `label_encoder` is the encoder fitted on 'Onboard_date',
# the same final state the name had when one encoder was re-fitted in turn.

In [4]:
# Preview the training frame again, now that the text columns hold integer codes.
data.head()

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn
0,127,42.0,11066.8,0,7.22,8.0,634,81,325,1
1,519,41.0,11916.22,0,6.5,11.0,631,487,849,1
2,285,38.0,12884.75,0,6.67,12.0,871,114,499,1
3,698,42.0,8010.76,0,6.71,10.0,691,110,704,1
4,193,37.0,9191.58,0,5.56,9.0,838,607,458,1


In [5]:
# Separate the target from the predictors: y is the 'Churn' column,
# X is every other column of the frame.
y = data['Churn']
X = data.drop(columns='Churn')

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Standardise the features. The scaler is fitted on the training rows only,
# then the same fitted transformation is applied to both subsets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Candidate classifiers, all with default hyper-parameters. The random forest
# relies on random sampling while it is built, so it is seeded (same seed as
# the train/test split) to make the accuracies below, and therefore the model
# selected from them, reproducible from one run to the next.
rf_classifier = RandomForestClassifier(random_state=42)
svm_classifier = SVC()
lr_classifier = LogisticRegression()

# Fit every classifier on the standardised training features
rf_classifier.fit(X_train_scaled, y_train)
svm_classifier.fit(X_train_scaled, y_train)
lr_classifier.fit(X_train_scaled, y_train)

# Predict the labels of the held-out test rows
rf_predictions = rf_classifier.predict(X_test_scaled)
svm_predictions = svm_classifier.predict(X_test_scaled)
lr_predictions = lr_classifier.predict(X_test_scaled)

# Accuracy of each classifier on the test set
rf_accuracy = accuracy_score(y_test, rf_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)
lr_accuracy = accuracy_score(y_test, lr_predictions)

# Printed in the order: random forest, SVM, logistic regression
print(rf_accuracy)
print(svm_accuracy)
print(lr_accuracy)


0.8555555555555555
0.8611111111111112
0.9055555555555556


In [9]:

# Pick the winner: max() over (accuracy, name) tuples returns the pair with the
# highest accuracy, comparing the names only if two accuracies are equal.
best_classifier = max(
    (rf_accuracy, 'Random Forest'),
    (svm_accuracy, 'SVM'),
    (lr_accuracy, 'Logistic Regression'),
)

print(f"Le meilleur classificateur est {best_classifier[1]} avec une précision de {best_classifier[0]:.2f}")

# Persist the winning estimator. Any name other than the two listed here
# resolves to the logistic regression model.
# NOTE(review): only the classifier is written to disk; it was fitted on
# standardised features, and the fitted scaler is not saved alongside it.
named_models = {'Random Forest': rf_classifier, 'SVM': svm_classifier}
model_to_save = named_models.get(best_classifier[1], lr_classifier)
joblib.dump(model_to_save, 'O:/Desktop/LSI/LSI_S5/BigDATA/projet/meilleur_modele.pkl')

Le meilleur classificateur est Logistic Regression avec une précision de 0.91


# Prédiction

In [16]:
# Location of the records to score: a text file holding one JSON document per line
# (not a CSV file).
file_path = "O:/Desktop/LSI/LSI_S5/BigDATA/projet/test_data.txt"

# Decode every line of the file with json.loads and collect the results in a list.
with open(file_path, 'r') as file:
    data_json = list(map(json.loads, file))

# Assemble the decoded records into the prediction DataFrame.
data_test = pd.DataFrame(data_json)

# Display the whole frame as the cell output.
data_test

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company
0,Andrew Mccall,37,9935.53,1,7.71,8,2011-08-29 18:37:54,"38612 Johnny Stravenue Nataliebury, WI 15717-8316",King Ltd
1,Michele Wright,23,7526.94,1,9.28,15,2013-07-22 18:19:54,"21083 Nicole Junction Suite 332, Youngport, ME...",Cannon-Benson


In [13]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Replace each text column of the prediction frame by integer codes, re-fitting
# the same encoder on one column after another.
# NOTE(review): the encoder is fitted on the prediction rows themselves, so the
# codes produced here are unrelated to the codes the training rows received for
# the same columns. Confirm this is intended before trusting the predictions.
for column in ('Location', 'Company', 'Names', 'Onboard_date'):
    data_test[column] = label_encoder.fit_transform(data_test[column])

In [14]:
# Preview the prediction frame after its text columns were replaced by integer codes.
data_test.head()

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company
0,0,37,9935.53,1,7.71,8,0,1,1
1,1,23,7526.94,1,9.28,15,1,0,0


In [15]:
# Score the new rows with the logistic regression model fitted earlier.
# That model was trained on standardised features (X_train_scaled), so the new
# rows must go through the same fitted scaler first; feeding raw values puts
# every feature on a scale the model never saw and makes the output meaningless.
# Selecting X.columns also guarantees the same features, in the training order.
features_new = data_test[X.columns]
features_new_scaled = scaler.transform(features_new)
predictions_new = lr_classifier.predict(features_new_scaled)

# NOTE(review): the text columns of data_test were label-encoded independently
# of the training data, so their integer codes are not comparable with the
# ones used for training; only the numeric columns are on a consistent footing.

# Show the predicted Churn label of each new row
print(predictions_new)

[1 1]


