# Settings

In [1]:
import csv
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

import load_data
import utils

# Load the labelled dataset exactly once (the previous extra call read the
# file a second time and threw the result away).
X, Y = load_data.load_data('dataset1.csv')

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit the scaler on the training part only, then apply the same transform to
# the test part so no test statistics leak into training.
# NOTE: from here on X_train / X_test hold the *scaled* features, while X stays unscaled.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X: (50000, 100)
Y: (50000,)


In [2]:
# Accumulator for the final comparison table. Every metric column is a list
# that the model cells append to, in the same order as the names under 'Algorithm'.
metric_columns = (
    'Acc. without CV',
    'Acc. with CV',
    'T. without CV [s]',
    'T. with CV [s]',
    'CPU usage without CV [%]',
    'CPU usage with CV [%]',
)

results = {
    'Algorithm': [
        'Logistic Regression',
        'Decision Tree',
        'Random Forest',
        'SVM',
        'K-Nearest Neighbors',
        "XGBoost",
    ],
}
# Each column gets its own fresh list (no shared list object between columns).
results.update({column_name: [] for column_name in metric_columns})

# Unbalanced dataset verification

In [3]:
import pandas as pd

# Smallest per-class sample count that is still accepted without a warning.
# This is an absolute floor, not a ratio between classes.
MIN_SAMPLES_PER_CLASS = 1000

# Count the samples of every target label; np.unique returns the labels sorted.
class_labels, class_counts = np.unique(Y, return_counts=True)

df = pd.DataFrame({"Number": class_labels, "Count": class_counts})

# Warn as soon as any class falls below the floor.
count_alert = bool((class_counts < MIN_SAMPLES_PER_CLASS).any())

if count_alert:
    print("WARNING! Unbalanced Dataset")
else:
    print("Balanced Dataset")

df.head(10)

Balanced Dataset


Unnamed: 0,Number,Count
0,0,5000
1,1,5000
2,2,5000
3,3,5000
4,4,5000
5,5,5000
6,6,5000
7,7,5000
8,8,5000
9,9,5000


# Logistic Regression

In [None]:
# --- Hold-out evaluation on the scaled train/test split ---
# psutil.cpu_percent() reports usage since its previous call, so call it once
# here to start a fresh measurement window for the timed section.
psutil.cpu_percent()
start_time_LR = time.time()
log_reg = LogisticRegression(max_iter=10000)  # default value 100
log_reg.fit(X_train, y_train)
log_reg_predictions = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)
end_time_LR = time.time()
cpu_without_cv_LR = psutil.cpu_percent()  # CPU usage over the timed section only
time_without_cv_LR = end_time_LR - start_time_LR
print(f"Logistic Regression Accuracy: {round(log_reg_accuracy*100,2)} %")

results['Acc. without CV'].append(round(log_reg_accuracy * 100, 2))
results['T. without CV [s]'].append(round(time_without_cv_LR, 2))
results['CPU usage without CV [%]'].append(cpu_without_cv_LR)

# --- 5-fold cross-validation on the full labelled set ---
# The scaler lives inside the pipeline so each fold is standardised from its
# own training part; this matches the preprocessing of the hold-out run above
# (previously CV ran on the unscaled X, making the two scores incomparable).
psutil.cpu_percent()
start_time_LR_CV = time.time()
log_reg_cv_scores = cross_val_score(
    make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000)), X, Y, cv=5
)
end_time_LR_CV = time.time()
cpu_with_cv_LR = psutil.cpu_percent()
time_with_cv_LR = end_time_LR_CV - start_time_LR_CV
print(f"Logistic Regression Cross-Validation Accuracy: {np.mean(log_reg_cv_scores)*100:.2f} %")

results['Acc. with CV'].append(round(np.mean(log_reg_cv_scores) * 100, 2))
results['T. with CV [s]'].append(round(time_with_cv_LR, 2))
results['CPU usage with CV [%]'].append(cpu_with_cv_LR)

Logistic Regression Accuracy: 98.65 %


### Logistic Regression - Confusion Matrix

In [None]:
# Confusion matrix of the hold-out predictions, drawn as an annotated heatmap
# and saved next to the notebook as SVG.
log_reg_cm = confusion_matrix(y_test, log_reg_predictions)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(log_reg_cm, annot=True, cmap='Blues', fmt='d', cbar=False, ax=ax)
ax.set_title('Logistic Regression Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
fig.savefig('confusion_matrix_LR.svg', format='svg')
plt.show()

# Decision Tree

In [None]:
# --- Hold-out evaluation on the scaled train/test split ---
# psutil.cpu_percent() reports usage since its previous call, so call it once
# here to start a fresh measurement window for the timed section.
psutil.cpu_percent()
start_time_DT = time.time()
decision_tree = DecisionTreeClassifier(random_state=42)  # fixed seed -> reproducible tree
decision_tree.fit(X_train, y_train)
dt_predictions = decision_tree.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
end_time_DT = time.time()
cpu_without_cv_DT = psutil.cpu_percent()  # CPU usage over the timed section only
time_without_cv_DT = end_time_DT - start_time_DT
print(f"Decision Tree Accuracy: {round(dt_accuracy*100,2)} %")

results['Acc. without CV'].append(round(dt_accuracy * 100, 2))
results['T. without CV [s]'].append(round(time_without_cv_DT, 2))
results['CPU usage without CV [%]'].append(cpu_without_cv_DT)

# --- 5-fold cross-validation on the full labelled set ---
# NOTE: CV uses the unscaled X. Tree splits depend on feature ordering only, so
# standardisation is not expected to change the scores (up to floating-point ties).
psutil.cpu_percent()
start_time_DT_CV = time.time()
dt_cv_scores = cross_val_score(DecisionTreeClassifier(random_state=42), X, Y, cv=5)
end_time_DT_CV = time.time()
cpu_with_cv_DT = psutil.cpu_percent()
time_with_cv_DT = end_time_DT_CV - start_time_DT_CV
print(f"Decision Tree Cross-Validation Accuracy: {np.mean(dt_cv_scores)*100:.2f} %")

results['Acc. with CV'].append(round(np.mean(dt_cv_scores) * 100, 2))
results['T. with CV [s]'].append(round(time_with_cv_DT, 2))
results['CPU usage with CV [%]'].append(cpu_with_cv_DT)

### Decision Tree - Confusion Matrix

In [None]:
# Confusion matrix of the hold-out predictions, drawn as an annotated heatmap
# and saved next to the notebook as SVG.
dt_cm = confusion_matrix(y_test, dt_predictions)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(dt_cm, annot=True, cmap='Greens', fmt='d', cbar=False, ax=ax)
ax.set_title('Decision Tree Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
fig.savefig('confusion_matrix_DT.svg', format='svg')
plt.show()

# Random Forest

In [None]:
# --- Hold-out evaluation on the scaled train/test split ---
# psutil.cpu_percent() reports usage since its previous call, so call it once
# here to start a fresh measurement window for the timed section.
psutil.cpu_percent()
start_time_RF = time.time()
random_forest = RandomForestClassifier(random_state=42)  # fixed seed -> reproducible forest
random_forest.fit(X_train, y_train)
rf_predictions = random_forest.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
end_time_RF = time.time()
cpu_without_cv_RF = psutil.cpu_percent()  # CPU usage over the timed section only
time_without_cv_RF = end_time_RF - start_time_RF
print(f"Random Forest Accuracy: {round(rf_accuracy*100,2)} %")

results['Acc. without CV'].append(round(rf_accuracy * 100, 2))
results['T. without CV [s]'].append(round(time_without_cv_RF, 2))
results['CPU usage without CV [%]'].append(cpu_without_cv_RF)

# --- 5-fold cross-validation on the full labelled set ---
# NOTE: CV uses the unscaled X. Tree splits depend on feature ordering only, so
# standardisation is not expected to change the scores (up to floating-point ties).
psutil.cpu_percent()
start_time_RF_CV = time.time()
rf_cv_scores = cross_val_score(RandomForestClassifier(random_state=42), X, Y, cv=5)
end_time_RF_CV = time.time()
cpu_with_cv_RF = psutil.cpu_percent()
time_with_cv_RF = end_time_RF_CV - start_time_RF_CV
print(f"Random Forest Cross-Validation Accuracy: {np.mean(rf_cv_scores)*100:.2f} %")

results['Acc. with CV'].append(round(np.mean(rf_cv_scores) * 100, 2))
results['T. with CV [s]'].append(round(time_with_cv_RF, 2))
results['CPU usage with CV [%]'].append(cpu_with_cv_RF)

### Random Forest - Confusion Matrix

In [None]:
# Confusion matrix of the hold-out predictions, drawn as an annotated heatmap
# and saved next to the notebook as SVG.
rf_cm = confusion_matrix(y_test, rf_predictions)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(rf_cm, annot=True, cmap='Reds', fmt='d', cbar=False, ax=ax)
ax.set_title('Random Forest Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
fig.savefig('confusion_matrix_RF.svg', format='svg')
plt.show()

# Support Vector Machine

In [None]:
# --- Hold-out evaluation on the scaled train/test split ---
# psutil.cpu_percent() reports usage since its previous call, so call it once
# here to start a fresh measurement window for the timed section.
psutil.cpu_percent()
start_time_SVM = time.time()
svm = SVC()
svm.fit(X_train, y_train)
svm_predictions = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
end_time_SVM = time.time()
cpu_without_cv_SVM = psutil.cpu_percent()  # CPU usage over the timed section only
time_without_cv_SVM = end_time_SVM - start_time_SVM
print(f"SVM Accuracy: {round(svm_accuracy*100,2)} %")

results['Acc. without CV'].append(round(svm_accuracy * 100, 2))
results['T. without CV [s]'].append(round(time_without_cv_SVM, 2))
results['CPU usage without CV [%]'].append(cpu_without_cv_SVM)

# --- 5-fold cross-validation on the full labelled set ---
# The scaler lives inside the pipeline so each fold is standardised from its
# own training part; this matches the preprocessing of the hold-out run above
# (previously CV ran on the unscaled X, making the two scores incomparable).
psutil.cpu_percent()
start_time_SVM_CV = time.time()
svm_cv_scores = cross_val_score(make_pipeline(StandardScaler(), SVC()), X, Y, cv=5)
end_time_SVM_CV = time.time()
cpu_with_cv_SVM = psutil.cpu_percent()
time_with_cv_SVM = end_time_SVM_CV - start_time_SVM_CV
print(f"SVM Cross-Validation Accuracy: {np.mean(svm_cv_scores)*100:.2f} %")

results['Acc. with CV'].append(round(np.mean(svm_cv_scores) * 100, 2))
results['T. with CV [s]'].append(round(time_with_cv_SVM, 2))
results['CPU usage with CV [%]'].append(cpu_with_cv_SVM)

### Support Vector Machine - Confusion Matrix

In [None]:
# Confusion matrix of the hold-out predictions, drawn as an annotated heatmap
# and saved next to the notebook as SVG.
svm_cm = confusion_matrix(y_test, svm_predictions)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(svm_cm, annot=True, cmap='Purples', fmt='d', cbar=False, ax=ax)
ax.set_title('SVM Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
fig.savefig('confusion_matrix_SVM.svg', format='svg')
plt.show()

# K-Nearest Neighbors

In [None]:
# Hyper-parameters shared by every KNN model in this notebook.
best_knn_params = {'n_neighbors': 13, 'weights': 'uniform'}

# --- Hold-out evaluation on the scaled train/test split ---
# psutil.cpu_percent() reports usage since its previous call, so call it once
# here to start a fresh measurement window for the timed section.
psutil.cpu_percent()
start_time_KNN = time.time()
knn = KNeighborsClassifier(**best_knn_params)
knn.fit(X_train, y_train)
knn_predictions = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
end_time_KNN = time.time()
cpu_without_cv_KNN = psutil.cpu_percent()  # CPU usage over the timed section only
time_without_cv_KNN = end_time_KNN - start_time_KNN
print(f"K-Nearest Neighbors Accuracy: {round(knn_accuracy*100,2)} %")

results['Acc. without CV'].append(round(knn_accuracy * 100, 2))
results['T. without CV [s]'].append(round(time_without_cv_KNN, 2))
results['CPU usage without CV [%]'].append(cpu_without_cv_KNN)

# --- 5-fold cross-validation on the full labelled set ---
# The scaler lives inside the pipeline so each fold is standardised from its
# own training part; this matches the preprocessing of the hold-out run above
# (previously CV ran on the unscaled X, making the two scores incomparable).
psutil.cpu_percent()
start_time_KNN_CV = time.time()
knn_cv_scores = cross_val_score(
    make_pipeline(StandardScaler(), KNeighborsClassifier(**best_knn_params)), X, Y, cv=5
)
end_time_KNN_CV = time.time()
cpu_with_cv_KNN = psutil.cpu_percent()
time_with_cv_KNN = end_time_KNN_CV - start_time_KNN_CV
print(f"K-Nearest Neighbors Cross-Validation Accuracy: {np.mean(knn_cv_scores)*100:.2f} %")

results['Acc. with CV'].append(round(np.mean(knn_cv_scores) * 100, 2))
results['T. with CV [s]'].append(round(time_with_cv_KNN, 2))
results['CPU usage with CV [%]'].append(cpu_with_cv_KNN)

### K-Nearest Neighbors - Confusion Matrix

In [None]:
# Confusion matrix of the hold-out predictions, drawn as an annotated heatmap
# and saved next to the notebook as SVG.
knn_cm = confusion_matrix(y_test, knn_predictions)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(knn_cm, annot=True, cmap='Oranges', fmt='d', cbar=False, ax=ax)
ax.set_title('K-Nearest Neighbors Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
fig.savefig('confusion_matrix_KNN.svg', format='svg')
plt.show()

# XGBoost

In [None]:
# --- Hold-out evaluation on the scaled train/test split ---
# psutil.cpu_percent() reports usage since its previous call, so call it once
# here to start a fresh measurement window for the timed section.
psutil.cpu_percent()
start_time_XGB = time.time()
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_predictions = xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
end_time_XGB = time.time()
cpu_without_cv_XGB = psutil.cpu_percent()  # CPU usage over the timed section only
time_without_cv_XGB = end_time_XGB - start_time_XGB
print(f"XGBoost Accuracy: {round(xgb_accuracy*100,2)} %")

results['Acc. without CV'].append(round(xgb_accuracy * 100, 2))
results['T. without CV [s]'].append(round(time_without_cv_XGB, 2))
results['CPU usage without CV [%]'].append(cpu_without_cv_XGB)

# --- 5-fold cross-validation on the full labelled set ---
# NOTE(review): CV uses the unscaled X; the model is tree-based, so
# standardisation is not expected to matter here - confirm if scores diverge.
psutil.cpu_percent()
start_time_XGB_CV = time.time()
xgb_cv_scores = cross_val_score(XGBClassifier(), X, Y, cv=5)
end_time_XGB_CV = time.time()
cpu_with_cv_XGB = psutil.cpu_percent()
time_with_cv_XGB = end_time_XGB_CV - start_time_XGB_CV
print(f"XGBoost Cross-Validation Accuracy: {np.mean(xgb_cv_scores)*100:.2f} %")

results['Acc. with CV'].append(round(np.mean(xgb_cv_scores) * 100, 2))
results['T. with CV [s]'].append(round(time_with_cv_XGB, 2))
results['CPU usage with CV [%]'].append(cpu_with_cv_XGB)

### XGBoost - Confusion Matrix 

In [None]:
# Confusion matrix of the hold-out predictions, drawn as an annotated heatmap
# and saved next to the notebook as SVG.
xgb_cm = confusion_matrix(y_test, xgb_predictions)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(xgb_cm, annot=True, cmap='YlOrBr', fmt='d', cbar=False, ax=ax)
ax.set_title('XGBoost Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
fig.savefig('confusion_matrix_XGB.svg', format='svg')
plt.show()

# Results Table

In [None]:
# Turn the metric lists collected by the model cells into one comparison table
# (one row per algorithm) and show it.
df_results = pd.DataFrame.from_dict(results)
df_results.head(10)

# Prediction - Logistic Regression

### Logistic Regression without Cross - Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_LR = log_reg.predict(X_blind_transformed)
flat_list_LR = list(pred_list_LR)

data = {
    'Output predicted': flat_list_LR,
}

df_pre_LR = pd.DataFrame(data)
df_pre_LR.head(10)  # preview only; the full predictions stay in df_pre_LR

### Logistic Regression with Cross - Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# cross_val_score returns scores only, not a fitted model, so a fresh model is
# trained on the full labelled set. It has to be trained in the same scaled
# feature space as X_blind_transformed; fitting on the raw X (as before) and
# predicting on scaled rows mixes two different feature spaces.
best_log_reg_model = LogisticRegression(max_iter=10000)
best_log_reg_model.fit(scaler.transform(X), Y)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_LRCV = best_log_reg_model.predict(X_blind_transformed)
flat_list_LRCV = list(pred_list_LRCV)

data = {
    'Output predicted': flat_list_LRCV,
}

df_pre_LRCV = pd.DataFrame(data)
df_pre_LRCV.head(10)  # preview only; the full predictions stay in df_pre_LRCV

# Prediction - Decision Tree

### Decision Tree without Cross - Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_DT = decision_tree.predict(X_blind_transformed)
flat_list_DT = list(pred_list_DT)

data = {
    'Output predicted': flat_list_DT,
}

df_pre_DT = pd.DataFrame(data)
df_pre_DT.head(10)  # preview only; the full predictions stay in df_pre_DT

### Decision Tree with Cross - Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# cross_val_score returns scores only, not a fitted model, so a fresh model is
# trained on the full labelled set. It has to be trained in the same scaled
# feature space as X_blind_transformed; split thresholds learned on the raw X
# (as before) are meaningless when applied to scaled rows.
best_dt_model = DecisionTreeClassifier(random_state=42)  # fixed seed -> reproducible tree
best_dt_model.fit(scaler.transform(X), Y)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_DTCV = best_dt_model.predict(X_blind_transformed)
flat_list_DTCV = list(pred_list_DTCV)

data = {
    'Output predicted': flat_list_DTCV,
}

df_pre_DTCV = pd.DataFrame(data)
df_pre_DTCV.head(10)  # preview only; the full predictions stay in df_pre_DTCV

# Prediction - Random Forest

### Random Forest without Cross - Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_RF = random_forest.predict(X_blind_transformed)
flat_list_RF = list(pred_list_RF)

data = {
    'Output predicted': flat_list_RF,
}

df_pre_RF = pd.DataFrame(data)
df_pre_RF.head(10)  # preview only; the full predictions stay in df_pre_RF

### Random Forest with Cross - Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# cross_val_score returns scores only, not a fitted model, so a fresh model is
# trained on the full labelled set. It has to be trained in the same scaled
# feature space as X_blind_transformed; split thresholds learned on the raw X
# (as before) are meaningless when applied to scaled rows.
best_rf_model = RandomForestClassifier(random_state=42)  # fixed seed -> reproducible forest
best_rf_model.fit(scaler.transform(X), Y)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_RFCV = best_rf_model.predict(X_blind_transformed)
flat_list_RFCV = list(pred_list_RFCV)

data = {
    'Output predicted': flat_list_RFCV,
}

df_pre_RFCV = pd.DataFrame(data)
df_pre_RFCV.head(10)  # preview only; the full predictions stay in df_pre_RFCV

# Prediction - Support Vector Machine

### Support Vector Machine without Cross-Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_SVM = svm.predict(X_blind_transformed)
flat_list_SVM = list(pred_list_SVM)

data = {
    'Output predicted': flat_list_SVM,
}

df_pre_SVM = pd.DataFrame(data)
df_pre_SVM.head(10)  # preview only; the full predictions stay in df_pre_SVM

### Support Vector Machine with Cross-Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# cross_val_score returns scores only, not a fitted model, so a fresh model is
# trained on the full labelled set. It has to be trained in the same scaled
# feature space as X_blind_transformed; fitting on the raw X (as before) and
# predicting on scaled rows mixes two different feature spaces.
best_svm_model = SVC()
best_svm_model.fit(scaler.transform(X), Y)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_SVMCV = best_svm_model.predict(X_blind_transformed)
flat_list_SVMCV = list(pred_list_SVMCV)

data = {
    'Output predicted': flat_list_SVMCV,
}

df_pre_SVMCV = pd.DataFrame(data)
df_pre_SVMCV.head(10)  # preview only; the full predictions stay in df_pre_SVMCV

# Prediction - K-Nearest Neighbors

### K-Nearest Neighbors without Cross-Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_KNN = knn.predict(X_blind_transformed)
flat_list_KNN = list(pred_list_KNN)

data = {
    'Output predicted': flat_list_KNN,
}

df_pre_KNN = pd.DataFrame(data)
df_pre_KNN.head(10)  # preview only; the full predictions stay in df_pre_KNN

### K-Nearest Neighbors with Cross-Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# cross_val_score returns scores only, not a fitted model, so a fresh model is
# trained on the full labelled set. It has to be trained in the same scaled
# feature space as X_blind_transformed; neighbour distances between raw
# training rows (as before) and scaled query rows are not comparable.
best_knn_model = KNeighborsClassifier(**best_knn_params)
best_knn_model.fit(scaler.transform(X), Y)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_KNNCV = best_knn_model.predict(X_blind_transformed)
flat_list_KNNCV = list(pred_list_KNNCV)

data = {
    'Output predicted': flat_list_KNNCV,
}

df_pre_KNNCV = pd.DataFrame(data)
df_pre_KNNCV.head(10)  # preview only; the full predictions stay in df_pre_KNNCV

# Prediction - XGB

### XGB without Cross-Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_XGB = xgb.predict(X_blind_transformed)
flat_list_XGB = list(pred_list_XGB)

data = {
    'Output predicted': flat_list_XGB,
}

df_pre_XGB = pd.DataFrame(data)
df_pre_XGB.head(10)  # preview only; the full predictions stay in df_pre_XGB

### XGB with Cross-Validation

In [None]:
# Blind-set features, scaled with the scaler fitted on the training split.
X_blind = load_data.load_data("blind_test1.csv")[0]
X_blind_transformed = scaler.transform(X_blind)

# cross_val_score returns scores only, not a fitted model, so a fresh model is
# trained on the full labelled set. It has to be trained in the same scaled
# feature space as X_blind_transformed; split thresholds learned on the raw X
# (as before) are meaningless when applied to scaled rows.
best_xgb_model = XGBClassifier()
best_xgb_model.fit(scaler.transform(X), Y)

# One vectorised predict call replaces the per-row loop, which also rebuilt the
# whole flattened list on every iteration (quadratic in the number of rows).
pred_list_XGBCV = best_xgb_model.predict(X_blind_transformed)
flat_list_XGBCV = list(pred_list_XGBCV)

data = {
    'Output predicted': flat_list_XGBCV,
}

df_pre_XGBCV = pd.DataFrame(data)
df_pre_XGBCV.head(10)  # preview only; the full predictions stay in df_pre_XGBCV

# Export results to Excel - blind_test1.csv

### Official Prediction - MLEx1 Blind_test.csv

### Predictions Table

In [None]:
# Side-by-side table of every model's blind-set predictions: the six
# hold-out-trained models first, then the six models refit on the full data.
prediction_columns = {}
prediction_columns['Logistic Regression'] = flat_list_LR
prediction_columns['Decision Tree'] = flat_list_DT
prediction_columns['Random Forest'] = flat_list_RF
prediction_columns['SVM'] = flat_list_SVM
prediction_columns['K-Nearest Neighbors'] = flat_list_KNN
prediction_columns['XGB'] = flat_list_XGB
prediction_columns['Logistic Regression CV'] = flat_list_LRCV
prediction_columns['Decision Tree CV'] = flat_list_DTCV
prediction_columns['Random Forest CV'] = flat_list_RFCV
prediction_columns['SVM CV'] = flat_list_SVMCV
prediction_columns['K-Nearest Neighbors CV'] = flat_list_KNNCV
prediction_columns['XGB CV'] = flat_list_XGBCV

predictions_df = pd.DataFrame(prediction_columns)

# Write the table, including the row index, to an Excel workbook.
predictions_df.to_excel('predictions_blind_test1.xlsx', index=True)

### Comparison of algorithms with and without Cross - Validation

In [None]:
def agreement_percent(first, second):
    """Return the percentage of positions at which two prediction lists agree.

    Args:
        first: Sequence of predicted labels.
        second: Sequence of predicted labels for the same samples, same order.

    Returns:
        Share of equal entries as a float in [0, 100]; NaN for empty input.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    total = len(first)
    if total != len(second):
        raise ValueError("prediction lists must have the same length")
    if total == 0:
        return float("nan")
    matches = sum(1 for left, right in zip(first, second) if left == right)
    # Divide by the real sample count; the previous fixed "/100" was only a
    # percentage when the blind set had exactly 10,000 rows.
    return 100 * matches / total


# Hold-out-trained predictions paired with those of the model refit on all data.
prediction_pairs = {
    'LR': (flat_list_LR, flat_list_LRCV),
    'DT': (flat_list_DT, flat_list_DTCV),
    'RF': (flat_list_RF, flat_list_RFCV),
    'SVM': (flat_list_SVM, flat_list_SVMCV),
    'KNN': (flat_list_KNN, flat_list_KNNCV),
    'XGB': (flat_list_XGB, flat_list_XGBCV),
}

data = {
    'Modello': list(prediction_pairs),
    'Equal results [%]': [
        agreement_percent(plain, refit) for plain, refit in prediction_pairs.values()
    ],
}

df_exp_acc = pd.DataFrame(data)
df_exp_acc.head(6)