In [15]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate, cross_val_score

In [16]:
class Judge: #definisco la classe che mi permette di definire quale algoritmo performi meglio sul dataset
    
    def __init__(self):
        self.params = {}
        return None
    
    def set_algorithms(self, algorithms):
        self.algorithms = algorithms
        return None
    
    def set_data(self, X, y):
        self.X = X
        self.y = y
        return None
    
    def set_metrics(self, metrics):
        self.metrics = metrics
        return None
    
    def set_params(self, params):
        self.params = params
        return None
    
    def get_performance(self, metric, algorithm, grid):
        best_algorithm = algorithm
        if bool(grid):
            best_algorithm = GridSearchCV(estimator = algorithm, param_grid = grid)  # inner loop
        scores = cross_validate(best_algorithm, X = self.X, y = self.y, scoring = metric, cv = 10)  # outer_loop
        score = np.mean(scores['test_score'])
        score = round(score, 2)
        
        return score
        
    def get_table(self):
        metrics_results = {}
        for metric in self.metrics:
            algorithms_results = {}
            for label, algorithm in self.algorithms.items():
                grid = {}
                if label in self.params.keys():
                    grid = self.params[label]
                algorithms_results[label] = self.get_performance(metric, algorithm, grid)
            metrics_results[metric] = algorithms_results
            
        df = pd.DataFrame.from_dict(metrics_results)
                
        return df
    

# One-hot encoding - definisco una funzione che vada a splittare i campi categorici per permettere all'algoritmo di interpretarli come numeri
def ohe(data, column):
    d = pd.get_dummies(data[column], prefix = column)
    data_d = pd.concat([data, d], axis = 1)
    data_d = data_d.drop([column], axis = 1)
    return data_d

#definizione campi nulli
def get_null_count(data):
    return data.isnull().sum()

In [18]:
df = pd.read_csv('train.csv')
df.shape #controllo sulla dimensione del data set

(4250, 20)

In [19]:
get_null_count(df) #richiamo la funzione che mi conteggia la numerosità dei nulli

state                            0
account_length                   0
area_code                        0
international_plan               0
voice_mail_plan                  0
number_vmail_messages            0
total_day_minutes                0
total_day_calls                  0
total_day_charge                 0
total_eve_minutes                0
total_eve_calls                  0
total_eve_charge                 0
total_night_minutes              0
total_night_calls                0
total_night_charge               0
total_intl_minutes               0
total_intl_calls                 0
total_intl_charge                0
number_customer_service_calls    0
churn                            0
dtype: int64

In [20]:
df #guardo come sono i dati

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,MT,83,area_code_415,no,no,0,188.3,70,32.01,243.8,88,20.72,213.7,79,9.62,10.3,6,2.78,0,no
4246,WV,73,area_code_408,no,no,0,177.9,89,30.24,131.2,82,11.15,186.2,89,8.38,11.5,6,3.11,3,no
4247,NC,75,area_code_408,no,no,0,170.7,101,29.02,193.1,126,16.41,129.1,104,5.81,6.9,7,1.86,1,no
4248,HI,50,area_code_408,no,yes,40,235.7,127,40.07,223.0,126,18.96,297.5,116,13.39,9.9,5,2.67,2,no


In [21]:
print(df['churn'].value_counts)

<bound method IndexOpsMixin.value_counts of 0       no
1       no
2       no
3       no
4       no
        ..
4245    no
4246    no
4247    no
4248    no
4249    no
Name: churn, Length: 4250, dtype: object>


In [22]:
#per poter interpretare il campo Churn devo trasformarlo in un booleano,essendo un campo binario, questo è gestibile molto bene.

df['churn'] = df['churn'].replace('no', 0)
df['churn'] = df['churn'].replace('yes', 1)

#Faccio lo stesso tipo di trasformazione booleana per i campi international_plan,voice_mail_plan
df['international_plan'] = df['international_plan'].replace('no', 0)
df['international_plan'] = df['international_plan'].replace('yes', 1)

df['voice_mail_plan'] = df['voice_mail_plan'].replace('no', 0)
df['voice_mail_plan'] = df['voice_mail_plan'].replace('yes', 1)

In [24]:
#check
df

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,0
1,NJ,137,area_code_415,0,0,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,0
2,OH,84,area_code_408,1,0,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
3,OK,75,area_code_415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0
4,MA,121,area_code_510,0,1,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,MT,83,area_code_415,0,0,0,188.3,70,32.01,243.8,88,20.72,213.7,79,9.62,10.3,6,2.78,0,0
4246,WV,73,area_code_408,0,0,0,177.9,89,30.24,131.2,82,11.15,186.2,89,8.38,11.5,6,3.11,3,0
4247,NC,75,area_code_408,0,0,0,170.7,101,29.02,193.1,126,16.41,129.1,104,5.81,6.9,7,1.86,1,0
4248,HI,50,area_code_408,0,1,40,235.7,127,40.07,223.0,126,18.96,297.5,116,13.39,9.9,5,2.67,2,0


In [29]:
#trasformo campo area_code in un numero rimuovendo la prima parte della stringa che è ininfluente essendo ripetuta su tutta la colonna
df['area_code'] = df['area_code'].str.replace('area_code_', '')
df

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,0
1,NJ,137,415,0,0,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,0
2,OH,84,408,1,0,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
3,OK,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0
4,MA,121,510,0,1,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,MT,83,415,0,0,0,188.3,70,32.01,243.8,88,20.72,213.7,79,9.62,10.3,6,2.78,0,0
4246,WV,73,408,0,0,0,177.9,89,30.24,131.2,82,11.15,186.2,89,8.38,11.5,6,3.11,3,0
4247,NC,75,408,0,0,0,170.7,101,29.02,193.1,126,16.41,129.1,104,5.81,6.9,7,1.86,1,0
4248,HI,50,408,0,1,40,235.7,127,40.07,223.0,126,18.96,297.5,116,13.39,9.9,5,2.67,2,0


In [37]:
#definisco X e Y
X = df.drop (['churn'],axis = 1) #tutto il dataset tranne la colonna churn
y = df[('churn')]
df

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,0
1,NJ,137,415,0,0,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,0
2,OH,84,408,1,0,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
3,OK,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0
4,MA,121,510,0,1,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,MT,83,415,0,0,0,188.3,70,32.01,243.8,88,20.72,213.7,79,9.62,10.3,6,2.78,0,0
4246,WV,73,408,0,0,0,177.9,89,30.24,131.2,82,11.15,186.2,89,8.38,11.5,6,3.11,3,0
4247,NC,75,408,0,0,0,170.7,101,29.02,193.1,126,16.41,129.1,104,5.81,6.9,7,1.86,1,0
4248,HI,50,408,0,1,40,235.7,127,40.07,223.0,126,18.96,297.5,116,13.39,9.9,5,2.67,2,0


In [40]:
#applico ohe al campo state che è categorico e non voglio rimuoverlo dal df
state_first = X['state'].astype(str).str[0]
state_first_df = pd.DataFrame({'state_first': state_first})
state_first_ohe = ohe(state_first_df, 'state_first')

state_first_ohe

Unnamed: 0,state_first_A,state_first_C,state_first_D,state_first_F,state_first_G,state_first_H,state_first_I,state_first_K,state_first_L,state_first_M,state_first_N,state_first_O,state_first_P,state_first_R,state_first_S,state_first_T,state_first_U,state_first_V,state_first_W
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4246,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4247,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4248,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [42]:
#Sistemo il mio dataframe droppando la colonna state e concatenando l'ohe appena computato
X = pd.concat([X, state_first_ohe], axis = 1)
X = X.drop(['state'], axis = 1)
X

Unnamed: 0,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,...,state_first_M,state_first_N,state_first_O,state_first_P,state_first_R,state_first_S,state_first_T,state_first_U,state_first_V,state_first_W
0,107,415,0,1,26,161.6,123,27.47,195.5,103,...,0,0,1,0,0,0,0,0,0,0
1,137,415,0,0,0,243.4,114,41.38,121.2,110,...,0,1,0,0,0,0,0,0,0,0
2,84,408,1,0,0,299.4,71,50.90,61.9,88,...,0,0,1,0,0,0,0,0,0,0
3,75,415,1,0,0,166.7,113,28.34,148.3,122,...,0,0,1,0,0,0,0,0,0,0
4,121,510,0,1,24,218.2,88,37.09,348.5,108,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,83,415,0,0,0,188.3,70,32.01,243.8,88,...,1,0,0,0,0,0,0,0,0,0
4246,73,408,0,0,0,177.9,89,30.24,131.2,82,...,0,0,0,0,0,0,0,0,0,1
4247,75,408,0,0,0,170.7,101,29.02,193.1,126,...,0,1,0,0,0,0,0,0,0,0
4248,50,408,0,1,40,235.7,127,40.07,223.0,126,...,0,0,0,0,0,0,0,0,0,0


In [43]:
#model evaluation and selection (solo AUC senza iperparametri)
algorithms = {
    'knn': KNeighborsClassifier(),
    'lr': LogisticRegression(solver ='liblinear'),
    'dt': DecisionTreeClassifier()
}

metrics = ['roc_auc']

params = {}

In [44]:
j = Judge()
j.set_algorithms(algorithms)
j.set_data(X, y)   
j.set_metrics(metrics)
j.set_params(params)
j.get_table()

Unnamed: 0,roc_auc
dt,0.84
knn,0.69
lr,0.82


In [45]:
algorithm = LogisticRegression(solver = 'liblinear')
clf = GridSearchCV(algorithm, param_grid = {})
clf.fit(X, y)
clf.best_estimator_

LogisticRegression(solver='liblinear')