<h2 style="color: red;"><strong>Imports:</strong></h2>

In [148]:
import pandas as pd

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

<h2 style="color: red;"><strong>Data Analysis:</strong></h2>

<h4 style="color: gold;">Funzioni:</h4>

In [149]:
def stampaPercentDF(df):
    for name, group in df:
    # La funzione value_counts() ritorna la percentuale di apparizione per ogni elemento univoco,
    # è come un groupby però ci aggiunge le percentuali (se aggiungo il parametro normalize) altrimenti ritornebbe solo il numero di volte in cui i valori appaiono
        group = round(group['Severity'].value_counts(normalize=True) * 100, 2)
        print(name)
        print(group)
        print('-'*10)

<h4 style="color: gold;">DataFrames:</h4>

In [150]:
df_completo = pd.read_csv('CSVCompleto.csv')

<p style = "color: white;">Elimino tutti i valori Nan</p>

In [151]:
df_completo = df_completo[~(pd.isna(df_completo.iloc[:,0]))]

<p style = "color: white;">Drop di colonne / righe non utili allo studio dei dati o che rendono più complesso lo studio</p>

In [152]:
df_completo = df_completo.drop(df_completo[df_completo['Type'] == 'Moving average'].index, axis=0)
df_completo = df_completo.drop(['Series_reference', 'Validation', 'Indicator', 'Data_value', 'Lower_CI', 'Upper_CI', 'Type'], axis=1)

<p style = "color: white;">Raggruppo e faccio il mapping di dati, dato che scikit-learn non ammette stringhe nei suoi modelli</p>

In [153]:
units_map = {
    'Injuries': 1,
    'Per 100,000 FTEs': 2,
    'Per 100,000 people': 3,
    'Per billion km': 4,
    'Per thousand registered vehicles': 5
}
df_completo['Units'] = df_completo['Units'].map(units_map)

# Abbiamo tirato via Indicator perchè le percentuali erano molto simili alla colonna Units e un analisi più approfondita ha rivelato che le stesse unità avevano per la maggior parte gli stessi indicatori

pop_map = {
    'Maori': 1,
    'Whole pop': 2,
    'Children': 3
}
df_completo['Population'] = df_completo['Population'].map(pop_map)

cause_map = {
    'All': 1,
    'Assault': 2,
    'Drowing': 3,
    'Falls': 4,
    'Intentional self-harm': 5,
    'Motor vehicle traffic crashes': 6,
    'Work': 7,
    'Car occupant': 8,
    'Intentional': 9,
    'Pedestrian': 10
}
df_completo['Cause'] = df_completo['Cause'].map(cause_map)

age_map = {
    '0-14 years': 1,
    '0-74 years': 2,
    '75+ years': 3,
    'All ages':4
}
df_completo['Age'] = df_completo['Age'].map(age_map)

severity_map = {
    'Fatal': 1,
    'Serious non-fatal': 2,
    'Serious': 3
}
df_completo['Severity'] = df_completo['Severity'].map(severity_map)

print(df_completo)

     Period  Units  Cause  Population  Age  Severity
0      2017      1      4           1    2         3
1      2018      1      4           1    2         3
2      2000      3      4           1    2         3
3      2001      3      4           1    2         3
4      2002      3      4           1    2         3
...     ...    ...    ...         ...  ...       ...
2743   2012      1      4           1    2         3
2744   2013      1      4           1    2         3
2745   2014      1      4           1    2         3
2746   2015      1      4           1    2         3
2747   2016      1      4           1    2         3

[2298 rows x 6 columns]


<h2 style="color: Red;"> Machine Learning</h2>

<p style="color: white;">Dividiamo il file csv nei file di train e di test, il file di test ha il 30% del totale dei dati, mentre il file di train ha il 70% restante</p>

In [154]:
# suddivide il dataframe in 70/30
df_test, df_train = train_test_split(df_completo, test_size=0.3, random_state=42)

# salva i dati in due file CSV distinti
df_train.to_csv('medical-test.csv', index=False)
df_test.to_csv('medical-train.csv', index=False)

<p style = "color: white">Creo le variabili di test / train utili per la predizione dei dati nei vari modelli</p>

In [155]:
Y_test = df_test["Severity"]
Y_train = df_train["Severity"]
X_train = df_train.drop("Severity", axis=1)
X_test  = df_test.drop("Severity", axis=1,errors='ignore').copy()
X_train.shape, Y_train.shape, X_test.shape

((690, 5), (690,), (1608, 5))

<h5 style="color: pink;">Logistic Regression</h5>

In [156]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
print(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
print(acc_log)
acc_log = round(logreg.score(X_test, Y_test) * 100, 2)

acc_log

     Period  Units  Cause  Population  Age
1341   2014      1      6           2    4
352    2015      3      4           2    2
91     2009      3      1           3    1
2317   2001      3      1           1    4
2285   2009      2      7           2    4
...     ...    ...    ...         ...  ...
2088   2000      3      4           2    4
1545   2001      4      6           2    4
1580   2018      5      6           2    4
1744   2020      3      2           2    4
1310   2002      3      3           2    4

[1608 rows x 5 columns]
45.22


47.39

<h5 style="color: pink;">Support Vector Machines</h5>

In [157]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

35.8

<h5 style="color: pink;">KNN</h5>

In [158]:
# KNN

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

62.9

<h5 style="color: pink;">Gaussian Naive Bayes</h5>

In [159]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

46.96

<h5 style="color: pink;">Perceptron</h5>

In [160]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

32.61

<h5 style="color: pink;">Linear SVC</h5>

In [161]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc



31.59

<h5 style="color: pink;">Stochastic Gradient Descent</h5>

In [162]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

35.8

<h5 style="color: pink;">Decision Tree</h5>

In [163]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

79.71

<h5 style="color: pink;">Random Forest</h5>

In [168]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

79.71

<h3 style="color: gold;">Risultati:</h3>

<p style="color: white;">Ora stampo le varie percentuali di accuratezza per vedere quale modello si adatta meglio allo studio che voglio fare, ordinandoli in ordine decresente per percentuale</p>

In [165]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,79.71
8,Decision Tree,79.71
1,KNN,62.9
2,Logistic Regression,47.39
4,Naive Bayes,46.96
0,Support Vector Machines,35.8
6,Stochastic Gradient Decent,35.8
5,Perceptron,32.61
7,Linear SVC,31.59


<p style ="color: white;">Dalla tabella che ne risulta abbiamo che Random Forest e Decion Tree sono i modelli più accurati nel predirre la gravità delle condizioni del paziente.</p>