# Model Development and Evaluation
### Alessandro Rubin

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import mean_squared_error

## Data Acquisition

In [2]:
# Load the UCI Adult dataset; the file uses "|" as the field separator.
df = pd.read_csv("uci_adult.csv", delimiter="|")

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,educ-num,marital-stat,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


<div class="alert alert-block alert-info" style="margin-top: 20px">
<h2> Preprocessing </h2>
</div>

### Null Value Treatment

**TODO:** the approach below is not correct as it stands: the missing values must be replaced with the mean or the mode computed on the *training set* only, not on the whole dataset.

In the previous analysis we replaced the missing values with proper NaN values. At this point we fill them with the mode of the column. Remember that only three columns have missing values: workclass, occupation and native-country.

In [5]:
def null_treatment(df, reference=None):
    """Fill missing values in the categorical columns with the column mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    reference : pandas.DataFrame, optional
        Frame the modes are computed from. Defaults to ``df`` itself. Pass the
        training split here when imputing a held-out split, so that no
        statistic is learned from data outside the training set.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with NaN replaced in ``workclass``,
        ``occupation`` and ``native-country``.
    """
    if reference is None:
        reference = df
    null_columns = ["workclass", "occupation", "native-country"]
    for col in null_columns:
        modes = reference[col].mode()
        if modes.empty:
            # The reference column is entirely NaN: there is no mode to fill with.
            continue
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection, which does not update `df` under Copy-on-Write.
        df[col] = df[col].fillna(modes.iloc[0])
    # Return only once every column has been processed (the return used to sit
    # inside the loop, so only the first column was ever imputed).
    return df

In [6]:
# Column overview: dtype and non-null count of every column.
# NOTE(review): null_treatment() is defined above but is not called in any
# visible cell - confirm whether it should be applied before this check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      32561 non-null  int64 
 1   age             32561 non-null  int64 
 2   workclass       32561 non-null  object
 3   fnlwgt          32561 non-null  int64 
 4   education       32561 non-null  object
 5   educ-num        32561 non-null  int64 
 6   marital-stat    32561 non-null  object
 7   occupation      32561 non-null  object
 8   relationship    32561 non-null  object
 9   race            32561 non-null  object
 10  sex             32561 non-null  object
 11  capital-gain    32561 non-null  int64 
 12  capital-loss    32561 non-null  int64 
 13  hours-per-week  32561 non-null  int64 
 14  native-country  32561 non-null  object
 15  income          32561 non-null  object
dtypes: int64(7), object(9)
memory usage: 4.0+ MB


### Label Encoding

Label Encoding is performed to convert the categorical data into numeric format.

In [7]:
# Names of the columns stored as Python objects (i.e. the string/categorical ones).
cat_columns = list(df.select_dtypes(include=["object"]).columns)
print(cat_columns)

['workclass', 'education', 'marital-stat', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']


In [8]:
# Distinct values found in the "workclass" column.
df.loc[:, "workclass"].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' Self-emp-inc', ' Without-pay', ' Never-worked'],
      dtype=object)

### Scaling the data

### Outlier Treatment

### Trasformazione variabili categoriali


In [None]:
# One-hot encode each multi-class categorical column.

# Dummy variables for the "workclass" column
dv_1 = pd.get_dummies(df["workclass"])

# Dummy variables for the "education" column
dv_2 = pd.get_dummies(df["education"])

# Dummy variables for the "marital-stat" column
dv_3 = pd.get_dummies(df["marital-stat"])

# Dummy variables for the "occupation" column
dv_4 = pd.get_dummies(df["occupation"])

# Dummy variables for the "relationship" column
dv_5 = pd.get_dummies(df["relationship"])

# Dummy variables for the "race" column
dv_6 = pd.get_dummies(df["race"])

# Dummy variables for the "native-country" column (this was computed twice before)
dv_7 = pd.get_dummies(df["native-country"])

# Sex is binary: encode it as 0 for males and 1 for females.
# The result is assigned back to the frame: replace(..., inplace=True) on a
# column selection is a chained operation that does not update `df` under
# pandas Copy-on-Write.
df["sex"] = df["sex"].map({" Male": 0, " Female": 1})

# The target is binary as well: 0 for <=50K and 1 for >50K.
df["income"] = df["income"].map({" <=50K": 0, " >50K": 1})

# Labels outside the mappings above become NaN; fail here rather than later in the model.
assert df[["sex", "income"]].notna().all().all(), "unexpected label in 'sex' or 'income'"

In [None]:
# Append the dummy-variable frames column-wise, then discard the original
# categorical columns they were derived from.
df = pd.concat(
    [df, dv_1, dv_2, dv_3, dv_4, dv_5, dv_6, dv_7],
    axis=1,
).drop(
    columns=[
        "workclass",
        "education",
        "marital-stat",
        "occupation",
        "relationship",
        "race",
        "native-country",
    ]
)

In [None]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

Adesso abbiamo il dataset pronto per l'analisi. 

<div class="alert alert-danger alertdanger" style="margin-top: 10px">
<h2 id="classifier"> 3. Il Classificatore</h2>
</div>

Visto che il dataset è piuttosto corposo, eviterei di usare un algoritmo ad albero. Sono quindi più propenso ad utilizzare un algoritmo efficiente come K-Nearest Neighbors. Iniziamo definendo le variabili indipendenti X e la variabile dipendente y. 

In [None]:
# X vettore variabili indipendenti - colonne considerate: tutte tranne income
features = [x for x in list(df.columns)  if x!="income" ]
X = df[features]
X.head()

In [None]:
# y: dependent variable (the encoded income class) as a NumPy array.
y = df["income"].to_numpy()
y

Per evitare l'overfitting sui dati, spezzo il dataset in 80% di training e 20% di testing. 

In [None]:
# train_test_split is already imported in the first cell of the notebook.
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

Normalizziamo adesso le entrate nella variabile indipendente. 

In [None]:
from sklearn import preprocessing

# Learn the scaling statistics (mean and standard deviation) on the training
# set only, then apply that same transformation to both splits. Fitting a
# second scaler on the test set would standardise it with different statistics,
# so the two splits would no longer be on a common scale.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Importiamo K-Nearest Neighbor e le librerie per la valutazione dell'accuratezza
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [None]:
# Fit K-NN for every k from 1 to Ks-1 (i.e. 1..14) and record, for each k,
# the accuracy on the test set and its standard error.
Ks = 15
mean_acc = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)

for n in range(1, Ks):
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    # Accuracy for this k.
    mean_acc[n - 1] = metrics.accuracy_score(y_test, yhat)
    # Standard error of the per-sample correctness indicator.
    std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])

mean_acc

Determiniamo ora la migliore approssimazione.

In [None]:
# Report the k with the highest accuracy (index 0 corresponds to k = 1).
print("La migliore accuratezza si ottiene per k =", np.argmax(mean_acc) + 1, " e vale", np.max(mean_acc))

Ne possiamo anche fare un plot.

In [None]:
# Accuracy as a function of k, with a shaded band of one standard error.
# (matplotlib.pyplot is already imported as plt in the first cell.)
fig, ax = plt.subplots()
ax.plot(range(1, Ks), mean_acc, 'g')
ax.fill_between(
    range(1, Ks),
    mean_acc - 1 * std_acc,
    mean_acc + 1 * std_acc,
    alpha=0.10,
)
ax.legend(('Accuratezza ', '+/- 1xstd'))
ax.set_ylabel('Accuratezza ')
ax.set_xlabel('Numero di Neighbors (K)')
fig.tight_layout()
plt.show()

Scriviamo quindi esplicitamente il nostro classificatore. 

In [None]:
# Final classifier with k = 11.
model = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)
# Predict with the model fitted on the line above; `neigh` is the last
# classifier left over from the k sweep (k = 14), not this one.
yhat = model.predict(X_test)
yhat[0:10]