In [1]:
from platform import python_version

In [2]:
python_version()

'3.9.13'

Učitavanje neophodnih biblioteka

In [3]:
# pip install imblearn

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
data_path = "C:/Users/Andjela/Documents/Master rad/Skripte kod/"
dataset_name = "WA_Fn-UseC_-HR-Employee-Attrition.csv"

In [7]:
try:
    data = pd.read_csv(data_path+dataset_name)
    original_data = data.copy()
    print('Podaci ucitani')
except:
    print('Greska u ucitavanju podataka')

Podaci ucitani


In [8]:
data_backup=data.copy()

### ***PREPROCESSING***

Preprocesiranje u mašinskom učenju za klasifikaciju odnosi se na skup koraka transformacije i manipulacije podacima pre nego što se ti podaci unesu u algoritam za klasifikaciju. Cilj preprocesiranja je poboljšanje kvaliteta podataka, čineći ih pogodnim za odabrani algoritam za klasifikaciju, i na kraju poboljšanje performansi modela.

Većina algoritama mašinskog učenja obično zahteva numeričke vrednosti kao svoje prediktorske varijable. Zbog toga postaje neophodno primeniti Label Encoding jer on kodira kategoričke oznake numeričkim vrednostima. Kako bismo izbegli unos značajnosti karakterističnih za kategoričke osobine sa velikim brojem jedinstvenih vrednosti, 

Izbacivanje kolona koje nam nisu od interesa i koje nemaju smisla za dalju analizu

In [9]:
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [10]:
# naredne kolone nisu potrebne
data.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis="columns", inplace=True)

In [11]:
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,...,3,4,0,17,3,2,9,6,0,8


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EnvironmentSatisfaction   1470 non-null   int64 
 9   Gender                    1470 non-null   object
 10  HourlyRate                1470 non-null   int64 
 11  JobInvolvement            1470 non-null   int64 
 12  JobLevel                  1470 non-null   int64 
 13  JobRole                   1470 non-null   object
 14  JobSatisfaction         

### Izvedene kolone
 - u nastavku će bit dat predlog dodatnih kolona koje mogu da se izvedu na osnovu već postojećih kolona

In [13]:
# Predstavlja odnos između YearsSinceLastPromotion i YearsAtCompany. Ova varijabla može da bude korisna 
# u slučajevima kada je potrebn
data['PromotionPotential'] = data['YearsSinceLastPromotion'] / (data['YearsAtCompany'] + 1)

In [14]:
# Predstavlja odnos između TotalWorkinYears i NumCompaniesWorked. Ovo može da pruži uvid u stabilnost
#radne istorije zaposlenog
data['ExperienceRatio'] = data['TotalWorkingYears'] / (data['NumCompaniesWorked'] + 1)

Vidimo da neke kolone koje su zapravo kategoričke imaju tip podataka koji je numerički i zato ćemo izvršiti analizu jedinstvenih vrednosti u svakoj koloni i stoga cemo to kasnije uključiti kao kriterijum po kome uzimamo kategoričke kolone u našem skupu podataka.

In [15]:
data.nunique()

Age                           43
Attrition                      2
BusinessTravel                 3
DailyRate                    886
Department                     3
DistanceFromHome              29
Education                      5
EducationField                 6
EnvironmentSatisfaction        4
Gender                         2
HourlyRate                    71
JobInvolvement                 4
JobLevel                       5
JobRole                        9
JobSatisfaction                4
MaritalStatus                  3
MonthlyIncome               1349
MonthlyRate                 1427
NumCompaniesWorked            10
OverTime                       2
PercentSalaryHike             15
PerformanceRating              2
RelationshipSatisfaction       4
StockOptionLevel               4
TotalWorkingYears             40
TrainingTimesLastYear          7
WorkLifeBalance                4
YearsAtCompany                37
YearsInCurrentRole            19
YearsSinceLastPromotion       16
YearsWithC

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1470 non-null   int64  
 1   Attrition                 1470 non-null   object 
 2   BusinessTravel            1470 non-null   object 
 3   DailyRate                 1470 non-null   int64  
 4   Department                1470 non-null   object 
 5   DistanceFromHome          1470 non-null   int64  
 6   Education                 1470 non-null   int64  
 7   EducationField            1470 non-null   object 
 8   EnvironmentSatisfaction   1470 non-null   int64  
 9   Gender                    1470 non-null   object 
 10  HourlyRate                1470 non-null   int64  
 11  JobInvolvement            1470 non-null   int64  
 12  JobLevel                  1470 non-null   int64  
 13  JobRole                   1470 non-null   object 
 14  JobSatis

In [17]:
# u kodu ispod određujemo kategorijske kolone gde kao uslov uzimamo broj jedinstvenih vrednosti
# koje se nalaze unutar te kolone. Kao kriterijum uzimamo broj 20
kategoricke_kolone = []
for kolona in data.columns:
    if len(data[kolona].unique()) <= 20:
        kategoricke_kolone.append(kolona)


In [18]:
kategoricke_kolone

['Attrition',
 'BusinessTravel',
 'Department',
 'Education',
 'EducationField',
 'EnvironmentSatisfaction',
 'Gender',
 'JobInvolvement',
 'JobLevel',
 'JobRole',
 'JobSatisfaction',
 'MaritalStatus',
 'NumCompaniesWorked',
 'OverTime',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [19]:
numericke_kolone= [kolona for kolona in data.columns if kolona not in kategoricke_kolone]

In [20]:
numericke_kolone

['Age',
 'DailyRate',
 'DistanceFromHome',
 'HourlyRate',
 'MonthlyIncome',
 'MonthlyRate',
 'TotalWorkingYears',
 'YearsAtCompany',
 'PromotionPotential',
 'ExperienceRatio']

In [21]:
z_scores = np.abs(data[numericke_kolone] - data[numericke_kolone].mean()) / data[numericke_kolone].std()


In [22]:
z_scores

Unnamed: 0,Age,DailyRate,DistanceFromHome,HourlyRate,MonthlyIncome,MonthlyRate,TotalWorkingYears,YearsAtCompany,PromotionPotential,ExperienceRatio
0,0.446199,0.742274,1.010565,1.382668,0.108313,0.725773,0.421499,0.164557,0.877855,0.818879
1,1.321915,1.297333,0.147100,0.240595,0.291619,1.488370,0.164455,0.488342,0.540352,0.199857
2,0.008340,1.413882,0.887213,1.284288,0.937335,1.674271,0.550021,1.143905,0.877855,0.791346
3,0.429518,1.460969,0.763861,0.486544,0.763374,1.242788,0.421499,0.161892,0.359654,0.047944
4,1.086306,0.524116,0.887213,1.273580,0.644639,0.325789,0.678543,0.817456,1.597162,0.890466
...,...,...,...,...,...,...,...,...,...,...
1465,0.101124,0.202014,1.703184,1.224390,0.835167,0.284232,0.735197,0.327782,0.877855,0.196624
1466,0.227269,0.469595,0.393804,1.175201,0.740888,1.003668,0.292977,0.001332,0.413789,0.593105
1467,1.086306,1.604637,0.640509,1.038339,0.076664,1.283981,0.678543,0.164557,0.877855,0.295744
1468,1.321915,0.546491,0.887213,0.142215,0.236394,0.150342,0.735197,0.325117,0.877855,0.365057


In [23]:
data.shape

(1470, 33)

In [24]:
data = data[(z_scores < 3).all(axis=1)] 

In [25]:
data.shape

(1414, 33)

In [26]:
X = data.drop('Attrition', axis=1)
y = data['Attrition']

In [27]:
kategoricke_kolone = []
for kolona in X.columns:
    if len(X[kolona].unique()) <= 20:
        kategoricke_kolone.append(kolona)

In [28]:
numericke_kolone= [kolona for kolona in X.columns if kolona not in kategoricke_kolone]

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=38)

In [28]:
kategoricke_kolone = []
for kolona in X.columns:
    if len(X[kolona].unique()) <= 20:
        kategoricke_kolone.append(kolona)

In [29]:
numericke_kolone = []
for kolona in X.columns:
    if kolona not in kategoricke_kolone:
        numericke_kolone.append(kolona)
        

In [31]:
numericka_transformacija=StandardScaler()
kategoricka_transformacija = OneHotEncoder(drop='first', sparse=False)


In [33]:
X_numeric = numericka_transformacija.fit_transform(X[numericke_kolone])
X_categorical = kategoricka_transformacija.fit_transform(X[kategoricke_kolone])

In [34]:
transformisane_kategoricke_kolone = kategoricka_transformacija.get_feature_names_out(input_features=kategoricke_kolone)


In [35]:
transformisane_kategoricke_kolone = list(transformisane_kategoricke_kolone)

In [36]:
transformisane_kategoricke_kolone

['BusinessTravel_Travel_Frequently',
 'BusinessTravel_Travel_Rarely',
 'Department_Research & Development',
 'Department_Sales',
 'Education_2',
 'Education_3',
 'Education_4',
 'Education_5',
 'EducationField_Life Sciences',
 'EducationField_Marketing',
 'EducationField_Medical',
 'EducationField_Other',
 'EducationField_Technical Degree',
 'EnvironmentSatisfaction_2',
 'EnvironmentSatisfaction_3',
 'EnvironmentSatisfaction_4',
 'Gender_Male',
 'JobInvolvement_2',
 'JobInvolvement_3',
 'JobInvolvement_4',
 'JobLevel_2',
 'JobLevel_3',
 'JobLevel_4',
 'JobLevel_5',
 'JobRole_Human Resources',
 'JobRole_Laboratory Technician',
 'JobRole_Manager',
 'JobRole_Manufacturing Director',
 'JobRole_Research Director',
 'JobRole_Research Scientist',
 'JobRole_Sales Executive',
 'JobRole_Sales Representative',
 'JobSatisfaction_2',
 'JobSatisfaction_3',
 'JobSatisfaction_4',
 'MaritalStatus_Married',
 'MaritalStatus_Single',
 'NumCompaniesWorked_1',
 'NumCompaniesWorked_2',
 'NumCompaniesWorked_3

In [38]:
len(transformisane_kategoricke_kolone)

127

In [42]:
X_numeric

array([[ 0.5095667 ,  0.73413029, -1.01361443, ..., -0.07414226,
        -0.87112411, -0.99477462],
       [ 1.4078398 , -1.31028803, -0.14849797, ...,  0.72158571,
        -0.53400952,  0.45190583],
       [ 0.06043015,  1.40732272, -0.89002637, ..., -1.26773422,
        -0.87112411, -0.95567515],
       ...,
       [-1.06241123, -1.61831703, -0.64285023, ..., -0.07414226,
        -0.87112411, -0.25188466],
       [ 1.4078398 ,  0.53788601, -0.89002637, ...,  0.52265372,
        -0.87112411,  0.68650266],
       [-0.27642227, -0.44333542, -0.14849797, ..., -0.47200625,
        -0.129472  , -0.60377991]])

In [43]:
X_categorical

array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.]])

In [44]:
X_transformed = np.concatenate((X_numeric, X_categorical), axis=1)


In [45]:
X_transformed

array([[ 0.5095667 ,  0.73413029, -1.01361443, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.4078398 , -1.31028803, -0.14849797, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.06043015,  1.40732272, -0.89002637, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.06241123, -1.61831703, -0.64285023, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.4078398 ,  0.53788601, -0.89002637, ...,  0.        ,
         0.        ,  0.        ],
       [-0.27642227, -0.44333542, -0.14849797, ...,  0.        ,
         0.        ,  0.        ]])

In [47]:
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X_transformed, y)

In [48]:
X_resampled = pd.DataFrame(X_resampled, columns=numericke_kolone + transformisane_kategoricke_kolone)


In [49]:
X_resampled

Unnamed: 0,Age,DailyRate,DistanceFromHome,HourlyRate,MonthlyIncome,MonthlyRate,TotalWorkingYears,YearsAtCompany,PromotionPotential,ExperienceRatio,...,YearsWithCurrManager_8,YearsWithCurrManager_9,YearsWithCurrManager_10,YearsWithCurrManager_11,YearsWithCurrManager_12,YearsWithCurrManager_13,YearsWithCurrManager_14,YearsWithCurrManager_15,YearsWithCurrManager_16,YearsWithCurrManager_17
0,0.509567,0.734130,-1.013614,1.380322,-0.037751,0.735689,-0.371748,-0.074142,-0.871124,-0.994775,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.407840,-1.310288,-0.148498,-0.244393,-0.234418,1.502825,-0.083246,0.721586,-0.534010,0.451906,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.060430,1.407323,-0.890026,1.281855,-0.927196,-1.678641,-0.516000,-1.267734,-0.871124,-0.955675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.388706,1.454521,-0.766438,-0.490562,-0.740556,1.255781,-0.371748,0.323722,0.364963,0.100011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.062411,-0.535247,-0.890026,-1.278303,-0.613167,0.333325,-0.660251,-0.869870,1.601050,-1.096433,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2359,2.418397,-0.510406,-0.272086,-0.638264,0.877217,-0.318063,2.946034,0.721586,0.814449,0.056024,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2360,-1.399264,-1.407168,1.581735,-0.392095,-1.049115,-0.379965,-1.381508,-1.068802,-0.871124,-1.131623,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2361,-0.613275,-0.130338,2.076087,-0.145926,-0.123437,0.961251,-0.083246,0.721586,-0.871124,0.451906,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2362,-0.276422,-1.268058,-0.395674,-1.622940,-0.867717,-0.285557,-1.093005,-0.869870,0.364963,-0.251885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
y_resampled

0       Yes
1        No
2       Yes
3        No
4        No
       ... 
2359    Yes
2360    Yes
2361    Yes
2362    Yes
2363    Yes
Name: Attrition, Length: 2364, dtype: object

In [52]:
y_resampled = y_resampled.replace({'Yes': 1, 'No': 0})


In [54]:
y_resampled =pd.DataFrame(y_resampled, columns=['Attrition'])

In [57]:
y_resampled

Unnamed: 0,Attrition
0,1
1,0
2,1
3,0
4,0
...,...
2359,1
2360,1
2361,1
2362,1


In [None]:
X_test, y_test = train_test_split(X_resampled,y_resampled,test_size=0.2,random_state=36)

##### Model drva odlučivanja (Decision Tree)

###### Definisanje modela koji će se ispitivati 

In [59]:
modeli = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Naive Bayes': GaussianNB(),
    'Neural Network': MLPClassifier()
}

In [60]:
from imblearn.pipeline import 

In [61]:
rezultati = {}

In [63]:
y

0       Yes
1        No
2       Yes
3        No
4        No
       ... 
1465     No
1466     No
1467     No
1468     No
1469     No
Name: Attrition, Length: 1414, dtype: object

In [64]:
y = y.replace({'Yes': 1, 'No': 0})

In [65]:
y=pd.DataFrame(y,columns=['Attrition'])

In [67]:
X_train_o,X_test_o,y_train_o, y_test_o = train_test_split(X_transformed,y,test_size=0.2,random_state=36)

In [71]:
for ime_modela, model in modeli.items():
   
    pipeline = Pipeline(steps=[('model', model)])
    pipeline.fit(X_train_o, y_train_o)
   
    y_pred = pipeline.predict(X_test_o)
    
    accuracy = accuracy_score(y_test_o, y_pred)
    precision = precision_score(y_test_o, y_pred, pos_label=1)
    recall = recall_score(y_test_o, y_pred, pos_label=1)
    f1 = f1_score(y_test_o, y_pred, pos_label=1)
    
    rezultati[ime_modela] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        #'Confusion Matrix': confusion_matrix(y_test, y_pred)
        
    }

print(rezultati)

{'Logistic Regression': {'Accuracy': 0.8798586572438163, 'Precision': 0.7027027027027027, 'Recall': 0.5306122448979592, 'F1-Score': 0.6046511627906977}, 'Random Forest': {'Accuracy': 0.8374558303886925, 'Precision': 0.6153846153846154, 'Recall': 0.16326530612244897, 'F1-Score': 0.25806451612903225}, 'XGBoost': {'Accuracy': 0.8551236749116607, 'Precision': 0.6333333333333333, 'Recall': 0.3877551020408163, 'F1-Score': 0.4810126582278481}, 'SVM': {'Accuracy': 0.8515901060070671, 'Precision': 0.8181818181818182, 'Recall': 0.1836734693877551, 'F1-Score': 0.3}, 'K-Nearest Neighbors': {'Accuracy': 0.8021201413427562, 'Precision': 0.3157894736842105, 'Recall': 0.12244897959183673, 'F1-Score': 0.1764705882352941}, 'Decision Tree': {'Accuracy': 0.784452296819788, 'Precision': 0.38, 'Recall': 0.3877551020408163, 'F1-Score': 0.3838383838383838}, 'Naive Bayes': {'Accuracy': 0.31095406360424027, 'Precision': 0.19834710743801653, 'Recall': 0.9795918367346939, 'F1-Score': 0.32989690721649484}, 'Neural

In [68]:
for ime_modela, model in modeli.items():
   
    pipeline = Pipeline(steps=[('model', model)])
    pipeline.fit(X_train, y_train)
   
    y_pred = pipeline.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    
    rezultati[ime_modela] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Confusion Matrix': confusion_matrix(y_test, y_pred)
        
    }

print(rezultati)

{'Logistic Regression': {'Accuracy': 0.8329809725158562, 'Precision': 0.8249027237354085, 'Recall': 0.8617886178861789, 'F1-Score': 0.8429423459244533, 'Confusion Matrix': array([[182,  45],
       [ 34, 212]], dtype=int64)}, 'Random Forest': {'Accuracy': 0.9894291754756871, 'Precision': 0.9800796812749004, 'Recall': 1.0, 'F1-Score': 0.9899396378269618, 'Confusion Matrix': array([[222,   5],
       [  0, 246]], dtype=int64)}, 'XGBoost': {'Accuracy': 0.9556025369978859, 'Precision': 0.9213483146067416, 'Recall': 1.0, 'F1-Score': 0.9590643274853802, 'Confusion Matrix': array([[206,  21],
       [  0, 246]], dtype=int64)}, 'SVM': {'Accuracy': 0.9492600422832981, 'Precision': 0.944, 'Recall': 0.959349593495935, 'F1-Score': 0.9516129032258064, 'Confusion Matrix': array([[213,  14],
       [ 10, 236]], dtype=int64)}, 'K-Nearest Neighbors': {'Accuracy': 0.7315010570824524, 'Precision': 0.6765578635014837, 'Recall': 0.926829268292683, 'F1-Score': 0.7821612349914236, 'Confusion Matrix': array([