In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
# Random seed for reproducibility
seed = 202
np.random.seed(seed)

In [3]:
# Import and plot the dataset
train_data = pd.read_csv("trainset.csv", delimiter=",")
train_data.head()

Unnamed: 0,ID,DeviceFlag4G,DataArpu,DataAllowanceContinuous,DataAllowanceOneShot,DeviceFlagSmartphone,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,CustomerAge,EstimatedDevicePrice,...,Music-Streaming,Network-Operation,P2P-Applications,Security,Streaming-Applications,Terminals,Unclassified,VoIP,Web-Applications,Product
0,10930,1.0,0.156221,0.010514,,1.0,0.018229,0.001623,"(40, 50]",0.445783,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Non-Customer
1,10170,0.0,0.155086,0.054729,,1.0,0.051432,0.001498,"(20, 30]",,...,0.081846,0.000317,0.00706,0.00262,0.071551,0.0,0.001896,5.8e-05,0.77541,Non-Customer
2,1492,1.0,,,,1.0,0.058594,0.001248,"(30, 40]",0.722892,...,0.03386,1e-06,0.0,0.000228,0.099438,0.0,0.012055,0.003579,0.499158,Non-Customer
3,7424,1.0,0.155086,0.004987,,1.0,0.097005,0.000499,"(50, 60]",0.39759,...,0.0,0.0,1e-06,5.2e-05,0.041474,0.0,0.051692,0.013263,0.301528,V-Bag
4,4332,1.0,0.155086,0.038148,,1.0,0.034505,0.000375,"(60, 70]",0.156627,...,0.0,0.000106,1e-06,0.000777,0.005975,0.0,0.000857,0.019149,0.926208,Non-Customer


In [4]:
# Plot the columns name
train_data.columns

Index(['ID', 'DeviceFlag4G', 'DataArpu', 'DataAllowanceContinuous',
       'DataAllowanceOneShot', 'DeviceFlagSmartphone',
       'MonthlyVoiceTrafficCount', 'MonthlySMSTrafficCount', 'CustomerAge',
       'EstimatedDevicePrice', 'MonthlyDataTraffic', 'CustomerGender',
       'CustomerExpatriate', 'Province', 'Region', 'ZipCode', 'ChurnScore',
       'AirportConnectionsDuration', 'AirportConnectionsCount',
       'StationConnectionsDuration', 'StationConnectionsCount',
       'ParkingConnectionsDuration', 'ParkingConnectionsCount',
       'File-Transfer', 'Games', 'Instant-Messaging-Applications', 'Mail',
       'Music-Streaming', 'Network-Operation', 'P2P-Applications', 'Security',
       'Streaming-Applications', 'Terminals', 'Unclassified', 'VoIP',
       'Web-Applications', 'Product'],
      dtype='object')

In [5]:
train_data['Province']

0       BOLZANO-BOZEN
1                 NaN
2                 NaN
3             BRESCIA
4             VENEZIA
5                 NaN
6             CATANIA
7              GENOVA
8              ANCONA
9              MILANO
10               ROMA
11                NaN
12               ROMA
13             MILANO
14             TERAMO
15              UDINE
16            CATANIA
17             SAVONA
18           CAGLIARI
19            TREVISO
20             AREZZO
21            PERUGIA
22             GENOVA
23           CAGLIARI
24          CATANZARO
25            MANTOVA
26             ANCONA
27             VERONA
28            VICENZA
29             NAPOLI
            ...      
9537           TERAMO
9538              NaN
9539           MODENA
9540           LATINA
9541           FOGGIA
9542              NaN
9543           MILANO
9544              NaN
9545          COSENZA
9546            TERNI
9547          TRAPANI
9548           MILANO
9549           PADOVA
9550            NUORO
9551      

In [6]:
train_data['CustomerAge'].unique()

array(['(40, 50]', '(20, 30]', '(30, 40]', '(50, 60]', '(60, 70]',
       '(70, 80]', nan, '(10, 20]', '(80, 90]'], dtype=object)

In [7]:
# Reconstruct the feature CustomerAge
# (40,50] -> 45
# Then transform it into a dataframe column with named CustomerAge
customer_age = []
# Too high to search
#nan = train_data['CustomerAge'][11]
"""for k in range(0, len(train_data)):
    if isinstance(train_data['CustomerAge'][k],float):
        customer_age.append(float('nan'))
    else:
        min_age = int(train_data['CustomerAge'][k][1])*10
        customer_age.append(min_age+5)
customer_age = np.asarray(customer_age)
customer_age = pd.DataFrame({'CustomerAge': customer_age})
customer_age.head()"""

customer_age = []
customer_age = train_data['CustomerAge']
#if isinstance(customer_age,str):
customer_age.replace({ 
    '(10, 20]' : float(15), 
    '(20, 30]' : float(25), 
    '(30, 40]' : float(35),
    '(40, 50]' : float(45),
    '(50, 60]' : float(55),
    '(60, 70]' : float(65),
    '(70, 80]' : float(75),
    '(80, 90]' : float(85)}, 
    inplace=True)
print("Replace Done!")

customer_age = np.asarray(customer_age)
customer_age = pd.DataFrame({'CustomerAge': customer_age})
customer_age.head()

Replace Done!


Unnamed: 0,CustomerAge
0,45.0
1,25.0
2,35.0
3,55.0
4,65.0


In [8]:
# Remove the previous CustomerAge and then insert the new CustomerAge
train_data = train_data.drop('CustomerAge', axis=1)
train_data = train_data.join(customer_age)
train_data

Unnamed: 0,ID,DeviceFlag4G,DataArpu,DataAllowanceContinuous,DataAllowanceOneShot,DeviceFlagSmartphone,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,EstimatedDevicePrice,MonthlyDataTraffic,...,Network-Operation,P2P-Applications,Security,Streaming-Applications,Terminals,Unclassified,VoIP,Web-Applications,Product,CustomerAge
0,10930,1.0,0.156221,0.010514,,1.0,0.018229,0.001623,0.445783,0.011007,...,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Non-Customer,45.0
1,10170,0.0,0.155086,0.054729,,1.0,0.051432,0.001498,,0.039382,...,3.167665e-04,7.060349e-03,0.002620,0.071551,0.000000,0.001896,0.000058,0.775410,Non-Customer,25.0
2,1492,1.0,,,,1.0,0.058594,0.001248,0.722892,0.033031,...,1.424006e-06,0.000000e+00,0.000228,0.099438,0.000000,0.012055,0.003579,0.499158,Non-Customer,35.0
3,7424,1.0,0.155086,0.004987,,1.0,0.097005,0.000499,0.397590,0.002928,...,0.000000e+00,1.113481e-06,0.000052,0.041474,0.000000,0.051692,0.013263,0.301528,V-Bag,55.0
4,4332,1.0,0.155086,0.038148,,1.0,0.034505,0.000375,0.156627,0.001002,...,1.059584e-04,1.226690e-06,0.000777,0.005975,0.000000,0.000857,0.019149,0.926208,Non-Customer,65.0
5,4483,1.0,0.155086,0.054729,,1.0,0.001953,0.000499,,0.036664,...,0.000000e+00,1.011675e-05,0.000569,0.070697,0.000000,0.003048,0.001291,0.896900,Non-Customer,55.0
6,7586,1.0,0.035019,0.010514,,1.0,0.000000,0.000000,0.289157,0.000000,...,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Non-Customer,45.0
7,828,1.0,0.154708,0.010514,,1.0,0.092448,0.000000,0.385542,0.005903,...,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Non-Customer,25.0
8,12381,0.0,0.156087,0.024777,,1.0,0.030599,0.000000,,0.029139,...,1.088769e-03,3.242146e-03,0.009175,0.105768,0.000000,0.001331,0.000036,0.760074,V-Pet,25.0
9,6377,0.0,,0.010514,,1.0,0.047526,0.004494,0.132530,0.007696,...,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Non-Customer,75.0


c'è da convertire tutte le features per renderle leggibili per un modello di machine learning
nella fattispecie, bisogna convertire ogni colonna di stringhe in float/int
decidere cosa fare con i nan

In [9]:
# Find all the unique product in column Product
train_data.Product.unique()

array(['Non-Customer', 'V-Bag', 'V-Pet', 'V-Auto'], dtype=object)

In [10]:
# Replace all products str in integer
product = []
product = train_data.Product
product.replace({ 
    'Non-Customer' : float(0), 
    'V-Bag': float(1), 
    'V-Pet' : float(2), 
    'V-Auto' : float(3)},
    inplace=True)

product = np.asarray(product)
product = pd.DataFrame({'Product': product})
product.head()

Unnamed: 0,Product
0,0.0
1,0.0
2,0.0
3,1.0
4,0.0


In [11]:
# Remove the previous Product column and then add the new Product column
train_data = train_data.drop('Product', axis=1)
train_data = train_data.join(product)
train_data

Unnamed: 0,ID,DeviceFlag4G,DataArpu,DataAllowanceContinuous,DataAllowanceOneShot,DeviceFlagSmartphone,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,EstimatedDevicePrice,MonthlyDataTraffic,...,Network-Operation,P2P-Applications,Security,Streaming-Applications,Terminals,Unclassified,VoIP,Web-Applications,CustomerAge,Product
0,10930,1.0,0.156221,0.010514,,1.0,0.018229,0.001623,0.445783,0.011007,...,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,45.0,0.0
1,10170,0.0,0.155086,0.054729,,1.0,0.051432,0.001498,,0.039382,...,3.167665e-04,7.060349e-03,0.002620,0.071551,0.000000,0.001896,0.000058,0.775410,25.0,0.0
2,1492,1.0,,,,1.0,0.058594,0.001248,0.722892,0.033031,...,1.424006e-06,0.000000e+00,0.000228,0.099438,0.000000,0.012055,0.003579,0.499158,35.0,0.0
3,7424,1.0,0.155086,0.004987,,1.0,0.097005,0.000499,0.397590,0.002928,...,0.000000e+00,1.113481e-06,0.000052,0.041474,0.000000,0.051692,0.013263,0.301528,55.0,1.0
4,4332,1.0,0.155086,0.038148,,1.0,0.034505,0.000375,0.156627,0.001002,...,1.059584e-04,1.226690e-06,0.000777,0.005975,0.000000,0.000857,0.019149,0.926208,65.0,0.0
5,4483,1.0,0.155086,0.054729,,1.0,0.001953,0.000499,,0.036664,...,0.000000e+00,1.011675e-05,0.000569,0.070697,0.000000,0.003048,0.001291,0.896900,55.0,0.0
6,7586,1.0,0.035019,0.010514,,1.0,0.000000,0.000000,0.289157,0.000000,...,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,45.0,0.0
7,828,1.0,0.154708,0.010514,,1.0,0.092448,0.000000,0.385542,0.005903,...,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,25.0,0.0
8,12381,0.0,0.156087,0.024777,,1.0,0.030599,0.000000,,0.029139,...,1.088769e-03,3.242146e-03,0.009175,0.105768,0.000000,0.001331,0.000036,0.760074,25.0,2.0
9,6377,0.0,,0.010514,,1.0,0.047526,0.004494,0.132530,0.007696,...,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,75.0,0.0


In [12]:
# ARPU stand for "Average Revenue Per Unit"
train_data.DataArpu.head()*100

0    15.622083
1    15.508605
2          NaN
3    15.508605
4    15.508605
Name: DataArpu, dtype: float64

In [13]:
games = []
games = train_data.Games
max(games*100)
#train_data.describe()

39.663454894390085

In [14]:
#The Churn Score predicted for each customer, 
#which is the predicted chance of the customer leaving the company in the upcoming month.
churn = train_data.ChurnScore
churn.head()*100

0    20.428851
1     5.654998
2     7.513020
3    19.160963
4    11.996382
Name: ChurnScore, dtype: float64

In [15]:
#Take all unique Regions, assigns to each region alphabetically ordered a monotonic growing number
regions_sort = sorted(train_data['Region'].dropna().unique())
#regions_len = len(regions_sort) #20 Regions

mapping = {}
for val, idx in enumerate(regions_sort):
    mapping[idx] = val
    
regions = train_data['Region']
regions.replace(mapping, inplace=True)

#print(mapping, regions)
regions = np.asarray(regions)
regions = pd.DataFrame({'Region': regions})
regions.head()

Unnamed: 0,Region
0,16.0
1,
2,
3,8.0
4,19.0


In [16]:
#Take all unique Provinces, assigns to each province alphabetically ordered a monotonic growing number
provinces_sort = sorted(train_data['Province'].dropna().unique())
provinces_len = len(provinces_sort) #110 Provinces

mapping = {}
for val, idx in enumerate(provinces_sort):
    mapping[idx] = val
    
provinces = train_data['Province']
provinces.replace(mapping, inplace=True)


#print(provinces_sort, provinces_len)
#print(mapping, provinces)
provinces = np.asarray(provinces)
provinces = pd.DataFrame({'Province': provinces})
provinces.head()

Unnamed: 0,Province
0,14.0
1,
2,
3,15.0
4,103.0


In [17]:
# Remove the previous Region and Province columns and then insert the new Region and Province
train_data = train_data.drop('Region', axis=1)
train_data = train_data.drop('Province', axis=1)
train_data = train_data.join(regions)
train_data = train_data.join(provinces)
train_data

Unnamed: 0,ID,DeviceFlag4G,DataArpu,DataAllowanceContinuous,DataAllowanceOneShot,DeviceFlagSmartphone,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,EstimatedDevicePrice,MonthlyDataTraffic,...,Security,Streaming-Applications,Terminals,Unclassified,VoIP,Web-Applications,CustomerAge,Product,Region,Province
0,10930,1.0,0.156221,0.010514,,1.0,0.018229,0.001623,0.445783,0.011007,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,45.0,0.0,16.0,14.0
1,10170,0.0,0.155086,0.054729,,1.0,0.051432,0.001498,,0.039382,...,0.002620,0.071551,0.000000,0.001896,0.000058,0.775410,25.0,0.0,,
2,1492,1.0,,,,1.0,0.058594,0.001248,0.722892,0.033031,...,0.000228,0.099438,0.000000,0.012055,0.003579,0.499158,35.0,0.0,,
3,7424,1.0,0.155086,0.004987,,1.0,0.097005,0.000499,0.397590,0.002928,...,0.000052,0.041474,0.000000,0.051692,0.013263,0.301528,55.0,1.0,8.0,15.0
4,4332,1.0,0.155086,0.038148,,1.0,0.034505,0.000375,0.156627,0.001002,...,0.000777,0.005975,0.000000,0.000857,0.019149,0.926208,65.0,0.0,19.0,103.0
5,4483,1.0,0.155086,0.054729,,1.0,0.001953,0.000499,,0.036664,...,0.000569,0.070697,0.000000,0.003048,0.001291,0.896900,55.0,0.0,,
6,7586,1.0,0.035019,0.010514,,1.0,0.000000,0.000000,0.289157,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,45.0,0.0,14.0,22.0
7,828,1.0,0.154708,0.010514,,1.0,0.092448,0.000000,0.385542,0.005903,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,25.0,0.0,7.0,37.0
8,12381,0.0,0.156087,0.024777,,1.0,0.030599,0.000000,,0.029139,...,0.009175,0.105768,0.000000,0.001331,0.000036,0.760074,25.0,2.0,9.0,2.0
9,6377,0.0,,0.010514,,1.0,0.047526,0.004494,0.132530,0.007696,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,75.0,0.0,8.0,56.0


In [18]:
#train_data.groupby('Province').count()

In [68]:
# Check how many NaNs there are in each column, except for the label
# All the rows with NaN as label will be deleted
"""for j in train_data:
    count = 0
    indexes = []
    for i in range(0,len(train_data)):
        if math.isnan(train_data[j][i]):
            count = count + 1
            indexes.append(i)
    #print(train_data.)
    print("There are %d NaNs in Column:'%s'" % (count, j))"""
    
train_data.isnull().sum()

ID                                   0
DeviceFlag4G                         0
DataArpu                          2225
DataAllowanceContinuous           1180
DataAllowanceOneShot              9126
DeviceFlagSmartphone                 0
MonthlyVoiceTrafficCount           339
MonthlySMSTrafficCount             339
EstimatedDevicePrice              5694
MonthlyDataTraffic                 339
CustomerGender                       0
CustomerExpatriate                   0
ZipCode                           1726
ChurnScore                         869
AirportConnectionsDuration           0
AirportConnectionsCount              0
StationConnectionsDuration           0
StationConnectionsCount              0
ParkingConnectionsDuration           0
ParkingConnectionsCount              0
File-Transfer                        0
Games                                0
Instant-Messaging-Applications       0
Mail                                 0
Music-Streaming                      0
Network-Operation        

In [76]:
X = train_data
X = X.drop('DataAllowanceOneShot', axis=1)
X = X.drop('EstimatedDevicePrice', axis=1)
X = X.dropna()

Questo non lo cagate per adesso

In [23]:
prova = train_data.drop('DataAllowanceOneShot', axis=1)
prova = train_data.drop('EstimatedDevicePrice', axis=1)
prova = prova.dropna()
prova.columns

Index(['ID', 'DeviceFlag4G', 'DataArpu', 'DataAllowanceContinuous',
       'DataAllowanceOneShot', 'DeviceFlagSmartphone',
       'MonthlyVoiceTrafficCount', 'MonthlySMSTrafficCount',
       'MonthlyDataTraffic', 'CustomerGender', 'CustomerExpatriate', 'ZipCode',
       'ChurnScore', 'AirportConnectionsDuration', 'AirportConnectionsCount',
       'StationConnectionsDuration', 'StationConnectionsCount',
       'ParkingConnectionsDuration', 'ParkingConnectionsCount',
       'File-Transfer', 'Games', 'Instant-Messaging-Applications', 'Mail',
       'Music-Streaming', 'Network-Operation', 'P2P-Applications', 'Security',
       'Streaming-Applications', 'Terminals', 'Unclassified', 'VoIP',
       'Web-Applications', 'CustomerAge', 'Product', 'Region', 'Province'],
      dtype='object')

In [24]:
prova.isnull().sum()
prova

Unnamed: 0,ID,DeviceFlag4G,DataArpu,DataAllowanceContinuous,DataAllowanceOneShot,DeviceFlagSmartphone,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,MonthlyDataTraffic,CustomerGender,...,Security,Streaming-Applications,Terminals,Unclassified,VoIP,Web-Applications,CustomerAge,Product,Region,Province
107,11451,1.0,0.278966,0.061860,0.769231,1.0,0.045573,0.008489,0.088529,1.0,...,0.000065,0.767622,0.000000,0.000813,0.001089,0.199406,45.0,2.0,16.0,97.0
165,3603,0.0,0.139577,0.021568,0.153846,1.0,0.057292,0.003371,0.038114,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,35.0,0.0,3.0,6.0
184,9782,0.0,0.014008,0.016041,0.076923,1.0,0.116536,0.007990,0.057907,1.0,...,0.003440,0.263355,0.000000,0.003813,0.026601,0.401753,25.0,2.0,15.0,77.0
190,6442,1.0,0.232629,0.022994,0.615385,1.0,0.083984,0.000000,0.037874,1.0,...,0.028133,0.406036,0.000000,0.000723,0.000028,0.483747,45.0,2.0,2.0,26.0
201,3650,1.0,0.310672,0.054907,0.153846,1.0,0.048177,0.000125,0.103358,1.0,...,0.002373,0.610747,0.000000,0.000203,0.002375,0.337523,25.0,2.0,6.0,84.0
207,603,1.0,0.149583,0.016576,0.615385,1.0,0.052734,0.000250,0.028679,1.0,...,0.000010,0.405815,0.000000,0.005255,0.036616,0.534568,55.0,0.0,6.0,84.0
225,789,1.0,0.449560,0.138167,0.769231,1.0,0.083333,0.052310,0.113581,1.0,...,0.000674,0.200223,0.000000,0.002428,0.001601,0.756822,35.0,2.0,16.0,97.0
264,10289,0.0,0.369016,0.060434,0.307692,1.0,0.030599,0.002622,0.044800,0.0,...,0.007072,0.023494,0.000000,0.001817,0.000418,0.722508,45.0,0.0,8.0,48.0
276,2447,0.0,0.198244,0.017467,0.153846,1.0,0.062500,0.029089,0.039359,0.0,...,0.002286,0.159328,0.000000,0.004411,0.004796,0.747521,25.0,2.0,2.0,26.0
343,4964,1.0,0.303168,0.027808,0.300248,1.0,0.087891,0.000000,0.093560,0.0,...,0.001042,0.870162,0.000000,0.001218,0.000448,0.108667,15.0,1.0,5.0,75.0


In [42]:
array = np.asarray(prova)

X = array[:,1:prova.shape[1]-1]
X = np.asarray(X)
Y = array[:,prova.shape[1]-1]
Y = np.asarray(Y)

In [43]:
# Before doing this, it is mandatory to transform all the string in floats
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
test = SelectKBest(score_func=chi2, k=3)
fit = test.fit(X,Y)

ValueError: Unknown label type: (array([0.        , 0.7754097 , 0.49915782, ..., 0.62097697, 0.85568787,
       0.70734927]),)