In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import plot_confusion_matrix

# Load the dataset
dataset_path = './WA_Fn-UseC_-Telco-Customer-Churn.train.csv'
dataset = pd.read_csv(dataset_path)

# Binary target: 'Yes' -> 1, 'No' -> 0
dataset['Churn'] = dataset['Churn'].map({'Yes':1, 'No':0})
# 'No internet service' and 'No phone service' mean that the customer does not
# have the service, so they are equivalent to 'No' (they are not missing data).
dataset['MultipleLines'] = dataset['MultipleLines'].replace({'No phone service' : 'No'})
for service_column in ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                       'TechSupport', 'StreamingTV', 'StreamingMovies']:
    dataset[service_column] = dataset[service_column].replace({'No internet service' : 'No'})

# TotalCharges can be read as text when some rows hold a blank string. Parse it
# as a real number so its magnitude is preserved; label-encoding the strings
# would order the values lexicographically ("100.2" < "20.5").
# Unparseable/blank entries become 0 -- NOTE(review): this assumes blanks belong
# to customers that have not been billed yet; confirm against the data.
# Dropping those rows instead was tried earlier (only ~10 rows) and gave
# slightly worse results.
dataset['TotalCharges'] = pd.to_numeric(dataset['TotalCharges'], errors='coerce').fillna(0)

# customerID is a value unique for each row, so it is not useful as a feature
features = ['gender', 'SeniorCitizen', 'Partner',
       'Dependents', 'tenure', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges']

# Use customerID as the row index instead of keeping it as a column
dataset = dataset.set_index('customerID')

# Encode the string-valued features into integer labels. Columns that are
# already numeric (e.g. tenure, MonthlyCharges, TotalCharges) are left as they
# are: label-encoding them would replace real values with mere ranks.
le = preprocessing.LabelEncoder()
for column in features:
    if not pd.api.types.is_numeric_dtype(dataset[column]):
        dataset[column] = le.fit_transform(dataset[column])

# Separate features from y
y = dataset['Churn']
X = dataset[features]

# Create training set and validation set
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0, test_size=0.33)

# Standardise the features. The scaler is fitted on the training split ONLY and
# then applied unchanged to the validation split; re-fitting it on the
# validation data would scale the two splits differently and leak validation
# statistics into the evaluation.
# Wrapping the arrays in a plain DataFrame yields positional integer column
# labels (0 .. len(features)-1) in the same order as `features`.
sc = StandardScaler()
train_X = pd.DataFrame(sc.fit_transform(train_X))
val_X = pd.DataFrame(sc.transform(val_X))

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline trained on the scaled training split.
model = LogisticRegression(n_jobs=-1).fit(train_X, train_y)

# Evaluate on the validation split; accuracies are reported in percent.
predicted = model.predict(val_X)
mae = mean_absolute_error(val_y, predicted)
train_accuracy = 100 * model.score(train_X, train_y)
test_accuracy = 100 * accuracy_score(val_y, predicted)

# Print each metric as a caption line followed by its value.
for caption, metric in (
    ('Mean absolute error: ', mae),
    ('Training set accuracy: ', train_accuracy),
    ('Test set accuracy: ', test_accuracy),
):
    print(caption)
    print(metric)
# Observed in earlier runs: max ~0.19 error with ~79-~80 accuracy -> best so far

In [None]:
# Confusion matrix of the most recently fitted `model` on the validation split.
# ConfusionMatrixDisplay.from_estimator is the replacement for
# plot_confusion_matrix, which was deprecated in scikit-learn 1.0 and removed
# in 1.2.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(model, val_X, val_y)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (depth 3) with a fixed seed for reproducibility.
model = DecisionTreeClassifier(random_state=0, max_depth=3).fit(train_X, train_y)

# Evaluate on the validation split; accuracies are reported in percent.
predicted = model.predict(val_X)
mae = mean_absolute_error(val_y, predicted)
train_accuracy = 100 * model.score(train_X, train_y)
test_accuracy = 100 * accuracy_score(val_y, predicted)

# Output order: predictions, MAE, training accuracy, validation accuracy.
for metric in (predicted, mae, train_accuracy, test_accuracy):
    print(metric)

In [None]:
# Disabled experiment, kept for reference: sweep max_depth over 1..9 for a
# DecisionTreeClassifier and remember the depth with the highest validation
# accuracy. `best` holds [max_depth, train accuracy, test accuracy, MAE].
#from sklearn.tree import DecisionTreeClassifier
#best=[0,0,0,0]
#for i in range(1,10):
#    model = DecisionTreeClassifier(random_state=0, max_depth=i)
#    model.fit(train_X,train_y)
#    predicted = model.predict(val_X)
#    print(i)
#    mae = mean_absolute_error(val_y,predicted)
#    print(predicted)
#    train_accuracy = model.score(train_X,train_y)*100
#    test_accuracy = model.score(val_X,val_y)*100
#    print(mae)
#    print(train_accuracy)
#    print(test_accuracy)
#    if test_accuracy > best[2]:
#        best[0] = i
#        best[1] = train_accuracy
#        best[2] = test_accuracy
#        best[3] = mae
#print(best)
# Result of the sweep: best at max_depth=3, with MAE ~0.21 and accuracy ~78-~78

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k=54, using all available cores.
model = KNeighborsClassifier(n_neighbors=54, n_jobs=-1).fit(train_X, train_y)

# Evaluate on the validation split; accuracies are reported in percent.
predicted = model.predict(val_X)
mae = mean_absolute_error(val_y, predicted)
train_accuracy = 100 * model.score(train_X, train_y)
test_accuracy = 100 * accuracy_score(val_y, predicted)

# Output order: predictions, MAE, training accuracy, validation accuracy.
for metric in (predicted, mae, train_accuracy, test_accuracy):
    print(metric)

In [None]:
# Disabled experiment, kept for reference: sweep n_neighbors over 1..99 for a
# KNeighborsClassifier, collect the MAE of every run in `results`, and remember
# the k with the highest validation accuracy.
# `best` holds [n_neighbors, train accuracy, test accuracy, MAE].
#from sklearn.neighbors import KNeighborsClassifier
#results=[]
#best=[0,0,0,0]
#for i in range(1,100):
#    model = KNeighborsClassifier(n_neighbors=i,n_jobs=-1)
#    model.fit(train_X, train_y)
#    predicted = model.predict(val_X)
#    print(i)
#    mae = mean_absolute_error(val_y,predicted)
#    print(predicted)
#    train_accuracy = model.score(train_X,train_y)*100
#    test_accuracy = model.score(val_X,val_y)*100
#    print(mae)
#    print(train_accuracy)
#    print(test_accuracy)
#    results.append(mae) 
#    if test_accuracy > best[2]:
#        best[0] = i
#        best[1] = train_accuracy
#        best[2] = test_accuracy
#        best[3] = mae
#from statistics import mean
#mean(results)
#print(best)
# Result of the sweep: best accuracy ~78-~79 (presumably train-test) with MAE ~0.20

In [None]:
# Greedy backward feature elimination with logistic regression: in every round,
# try dropping each remaining column, and permanently drop the one whose removal
# gives the highest validation accuracy -- but only if it beats the best
# accuracy seen so far (starting from the model that uses all features).
#
# Is this leaking? Yes: the features are selected using the validation split,
# so the validation accuracy reported here is optimistically biased. Select
# features with cross-validation on the training split (or use a separate
# hold-out set) for an unbiased estimate.
#
# Earlier runs: best was 78-81 removing StreamingMovies, InternetService,
# gender, PaperlessBilling, SeniorCitizen; after removing null values in
# TotalCharges, 80-80 removing PaperlessBilling. Those names were printed with
# an off-by-one index (features[i-1]), so treat them with caution.
from sklearn.linear_model import LogisticRegression

deleted_features = []
train_X_copy = train_X
val_X_copy = val_X

# Baseline with every feature, so that a column is only dropped when doing so
# actually improves on the full model (starting from 0 would force the first
# round to remove a feature even if that hurts accuracy).
model = LogisticRegression(n_jobs=-1)
model.fit(train_X_copy, train_y)
predicted = model.predict(val_X_copy)
# best = [train accuracy %, validation accuracy %, MAE]
best = [model.score(train_X_copy, train_y)*100,
        model.score(val_X_copy, val_y)*100,
        mean_absolute_error(val_y, predicted)]

changed_best = True
# Stop when no removal helps, or when only one column is left (a model cannot
# be fitted on zero features).
while changed_best and len(train_X_copy.columns) > 1:

    last_removed = -1
    changed_best = False

    for i in train_X_copy.columns:

        tr_X = train_X_copy.drop(columns=i)
        v_X = val_X_copy.drop(columns=i)

        model = LogisticRegression(n_jobs=-1)
        model.fit(tr_X, train_y)

        predicted = model.predict(v_X)
        mae = mean_absolute_error(val_y,predicted)
        train_accuracy = model.score(tr_X,train_y)*100
        test_accuracy = model.score(v_X,val_y)*100

        if test_accuracy > best[1]:
            best[0] = train_accuracy
            best[1] = test_accuracy
            best[2] = mae
            changed_best = True
            last_removed = i

    if changed_best:
        train_X_copy = train_X_copy.drop(columns=last_removed)
        val_X_copy = val_X_copy.drop(columns=last_removed)
        deleted_features.append(last_removed)

    print(best)
    print(deleted_features)
    # Column labels are 0-based positions into `features`, so index directly.
    for i in deleted_features:
        print(features[i])

print('Done')

In [None]:
# Validation snippet (disabled): reports MAE and training/validation accuracy for the current `model`
#predicted = model.predict(val_X)
#mae = mean_absolute_error(val_y,predicted)
#train_accuracy = model.score(train_X,train_y)*100
#test_accuracy = model.score(val_X,val_y)*100
#print('Mean absolute error: ')
#print(mae)
#print('Training set accuracy: ')
#print(train_accuracy)
#print('Test set accuracy: ')
#print(test_accuracy)