In [42]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Import the train set and test set
train_data = pd.read_csv("trainset.csv", delimiter=",")
test_data = pd.read_csv("testset.csv", delimiter=",")
train_data2 = pd.read_csv("trainset.csv", delimiter=",")

In [43]:
def normalize_costumer_age(dataset):
    #Take all unique Cosumer_Age, assigns to each age alphabetically ordered a mean age
    customer_age_sort = sorted(dataset['CustomerAge'].dropna().unique())

    mapping = {}
    for idx, val in enumerate(customer_age_sort):
        mapping[val] = int(15+idx*10)
        
    ages = dataset['CustomerAge']
    ages.replace(mapping, inplace=True)

    #print(mapping, regions)
    ages = np.asarray(ages)
    ages = pd.DataFrame({'CustomerAge': ages})
    
     # Remove the previous CustomerAge and then insert the new CustomerAge
    dataset = dataset.drop('CustomerAge', axis=1)
    dataset = dataset.join(ages)
    return dataset

In [44]:
def normalize_region(dataset):
    #Take all unique Regions, assigns to each region alphabetically ordered a monotonic growing number
    regions_sort = sorted(dataset['Region'].dropna().unique())
    #regions_len = len(regions_sort) #20 Regions

    mapping = {}
    for idx, val in enumerate(regions_sort):
        mapping[val] = int(idx)

    regions = dataset['Region']
    regions.replace(mapping, inplace=True)

    #print(mapping, regions)
    regions = np.asarray(regions)
    regions = pd.DataFrame({'Region': regions})
    
    # Remove the previous Region columns and then insert the new Region
    dataset = dataset.drop('Region', axis=1)
    dataset = dataset.join(regions)
    return dataset

In [45]:
def normalize_province(dataset):
    #Take all unique Provinces, assigns to each province alphabetically ordered a monotonic growing number
    provinces_sort = sorted(dataset['Province'].dropna().unique())
    #provinces_len = len(provinces_sort) #110 Provinces

    mapping = {}
    for idx, val in enumerate(provinces_sort):
        mapping[val] = idx

    provinces = dataset['Province']
    provinces.replace(mapping, inplace=True)


    #print(provinces_sort, provinces_len)
    #print(mapping, provinces)
    provinces = np.asarray(provinces)
    provinces = pd.DataFrame({'Province': provinces})
    
    # Remove the previous Province columns and then insert the new Province
    dataset = dataset.drop('Province', axis=1)
    dataset = dataset.join(provinces)
    return dataset

In [46]:
def normalize_product(dataset):
    if('Product' in dataset.columns):
        #Take all unique Products, assigns to each product alphabetically ordered a monotonic growing number
        products_sort = sorted(dataset['Product'].dropna().unique())

        mapping = {}
        for idx, val in enumerate(products_sort):
            mapping[val] = int(idx)

        products = dataset['Product']
        products.replace(mapping, inplace=True)

        #print(mapping, products)
        products = np.asarray(products)
        products = pd.DataFrame({'Product': products})

        # Remove the previous Product columns and then insert the newProduct
        dataset = dataset.drop('Product', axis=1)
        dataset = dataset.join(products)
    return dataset

In [47]:
def drop_useless_columns(dataset):
    if('DataAllowanceOneShot' in dataset.columns and 'EstimatedDevicePrice' in dataset.columns):
        dataset = dataset.drop('DataAllowanceOneShot', axis=1)
        dataset = dataset.drop('EstimatedDevicePrice', axis=1)
    return dataset

In [48]:
def normalize_data_set(dataset):
    dataset = normalize_costumer_age(dataset)
    dataset = normalize_region(dataset)
    dataset = normalize_province(dataset)
    dataset = normalize_product(dataset)
    dataset = drop_useless_columns(dataset)
    return dataset

In [49]:
# Fill the missing values with the mean for each column
def fillna_mean_value(dataset):
    # fill missing values with mean column values
    dataset.fillna(dataset.mean(), inplace=True)
    return dataset

In [50]:
train_data = normalize_data_set(train_data)
test_data = normalize_data_set(test_data)

In [10]:
#train_data = fillna_mean_value(train_data)
#test_data = fillna_mean_value(test_data)

In [11]:
train_data.to_csv('normalized_train_dataset.csv', index = False, encoding='utf-8')
test_data.to_csv('normalized_test_dataset.csv', index = False, encoding='utf-8')

In [51]:
print(test_data.shape, train_data.shape)

(3190, 34) (9567, 35)


In [52]:
print(test_data.columns, train_data.columns)

Index(['ID', 'DeviceFlag4G', 'DataArpu', 'DataAllowanceContinuous',
       'DeviceFlagSmartphone', 'MonthlyVoiceTrafficCount',
       'MonthlySMSTrafficCount', 'MonthlyDataTraffic', 'CustomerGender',
       'CustomerExpatriate', 'ZipCode', 'ChurnScore',
       'AirportConnectionsDuration', 'AirportConnectionsCount',
       'StationConnectionsDuration', 'StationConnectionsCount',
       'ParkingConnectionsDuration', 'ParkingConnectionsCount',
       'File-Transfer', 'Games', 'Instant-Messaging-Applications', 'Mail',
       'Music-Streaming', 'Network-Operation', 'P2P-Applications', 'Security',
       'Streaming-Applications', 'Terminals', 'Unclassified', 'VoIP',
       'Web-Applications', 'CustomerAge', 'Region', 'Province'],
      dtype='object') Index(['ID', 'DeviceFlag4G', 'DataArpu', 'DataAllowanceContinuous',
       'DeviceFlagSmartphone', 'MonthlyVoiceTrafficCount',
       'MonthlySMSTrafficCount', 'MonthlyDataTraffic', 'CustomerGender',
       'CustomerExpatriate', 'ZipCode', 'Ch

In [12]:
trainset = train_data
trainset.isnull().sum()

ID                                   0
DeviceFlag4G                         0
DataArpu                          2225
DataAllowanceContinuous           1180
DeviceFlagSmartphone                 0
MonthlyVoiceTrafficCount           339
MonthlySMSTrafficCount             339
MonthlyDataTraffic                 339
CustomerGender                       0
CustomerExpatriate                   0
ZipCode                           1726
ChurnScore                         869
AirportConnectionsDuration           0
AirportConnectionsCount              0
StationConnectionsDuration           0
StationConnectionsCount              0
ParkingConnectionsDuration           0
ParkingConnectionsCount              0
File-Transfer                        0
Games                                0
Instant-Messaging-Applications       0
Mail                                 0
Music-Streaming                      0
Network-Operation                    0
P2P-Applications                     0
Security                 

In [13]:
trainset = trainset.drop('DataArpu', axis=1)
trainset = trainset.drop('DataAllowanceContinuous', axis=1)
trainset = trainset.drop('MonthlyVoiceTrafficCount', axis=1)
trainset = trainset.drop('ZipCode', axis=1)
trainset = trainset.drop('ChurnScore', axis=1)
trainset = trainset.drop('CustomerAge', axis=1)
trainset = trainset.drop('MonthlyDataTraffic', axis=1)
trainset = trainset.drop('Province', axis=1)
trainset = trainset.drop('MonthlySMSTrafficCount', axis=1)

In [14]:
trainset.isnull().sum()
train = trainset.dropna()
test = trainset[trainset.columns[trainset.notnull().any(axis = 0)]]

In [53]:
test.shape

(9567, 1)

In [54]:
test = test.drop('Region', axis=1)
train.shape, test.shape

((7568, 26), (9567, 0))

In [55]:
regions_sort = sorted(train_data2['Region'].dropna().unique())
    #regions_len = len(regions_sort) #20 Regions

mapping = {}
for idx, val in enumerate(regions_sort):
    mapping[val] = int(idx)
mapping

{'ABRUZZI': 0,
 'BASILICATA': 1,
 'CALABRIA': 2,
 'CAMPANIA': 3,
 'EMILIA-ROMAGNA': 4,
 'FRIULI-VENEZIA GIULIA': 5,
 'LAZIO': 6,
 'LIGURIA': 7,
 'LOMBARDIA': 8,
 'MARCHE': 9,
 'MOLISE': 10,
 'PIEMONTE': 11,
 'PUGLIA': 12,
 'SARDEGNA': 13,
 'SICILIA': 14,
 'TOSCANA': 15,
 'TRENTINO-ALTO ADIGE': 16,
 'UMBRIA': 17,
 "VALLE D'AOSTA": 18,
 'VENETO': 19}

In [56]:
mapping ={v: k for k, v in mapping.items()}

In [57]:
mapping

{0: 'ABRUZZI',
 1: 'BASILICATA',
 2: 'CALABRIA',
 3: 'CAMPANIA',
 4: 'EMILIA-ROMAGNA',
 5: 'FRIULI-VENEZIA GIULIA',
 6: 'LAZIO',
 7: 'LIGURIA',
 8: 'LOMBARDIA',
 9: 'MARCHE',
 10: 'MOLISE',
 11: 'PIEMONTE',
 12: 'PUGLIA',
 13: 'SARDEGNA',
 14: 'SICILIA',
 15: 'TOSCANA',
 16: 'TRENTINO-ALTO ADIGE',
 17: 'UMBRIA',
 18: "VALLE D'AOSTA",
 19: 'VENETO'}

In [58]:
train_data['Region'].describe()
regions_sort = sorted(train_data['Region'].dropna().unique())
regions_sort
count = train_data.groupby('Region').count()
count

regions = train_data[['ID','Region']]
regions

count = regions.groupby('Region').count()
count = pd.DataFrame({'Count': count['ID']})
#regions = regions.join(count)

count
per = round(count/(len(train_data['Region'].dropna(axis=0)))*100)
per = pd.DataFrame({'Percentage': per['Count']})

count = count.join(per)
count
add = round(count['Percentage']/100*1999)
add = pd.DataFrame({'RegionToAdd': add})

count = count.join(add)
count



Unnamed: 0_level_0,Count,Percentage,RegionToAdd
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,162,2.0,40.0
1.0,111,1.0,20.0
2.0,278,4.0,80.0
3.0,652,9.0,180.0
4.0,418,6.0,120.0
5.0,126,2.0,40.0
6.0,787,10.0,200.0
7.0,309,4.0,80.0
8.0,1449,19.0,380.0
9.0,127,2.0,40.0


In [59]:
count.RegionToAdd.sum()

2000.0

In [60]:
train_data.shape
test = trainset[trainset.columns[trainset.isnull().any(axis = 0)]]
count.index[0]
test

Unnamed: 0,Region
0,16.0
1,
2,
3,8.0
4,19.0
5,
6,14.0
7,7.0
8,9.0
9,8.0


In [61]:
count['RegionToAdd'][0]

40.0

In [62]:
import math
# test = trainset[trainset.columns[trainset.isnull().any(axis = 0)]]
le = len(test)
ct = count['RegionToAdd'][0]
j = 0
for i in range(le):
    if(math.isnan(test['Region'][i])):
        test['Region'][i] = count.index[j]
        #print("ct: " + str(ct) + " region: " + str(count.index[j]) + " J: " + str(j))
        ct -=1
        if(ct <= 0.0):
            j += 1
            ct = count['RegionToAdd'][j]
test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Region
0,16.0
1,0.0
2,0.0
3,8.0
4,19.0
5,0.0
6,14.0
7,7.0
8,9.0
9,8.0


In [25]:
final_train = train_data
test['Region'].head(44)
final_train = final_train.drop('Region', axis=1)
final_train = final_train.join(test['Region'])
final_train.Region.describe()

count    9567.000000
mean        9.137347
std         4.999765
min         0.000000
25%         6.000000
50%         8.000000
75%        13.000000
max        19.000000
Name: Region, dtype: float64

In [26]:
final_train = final_train.sort_values(by='Region')
final_province = final_train['Province'].fillna(method='ffill')
final_province
final_train = final_train.drop('Province', axis=1)
final_train = final_train.join(final_province)
final_train

Unnamed: 0,ID,DeviceFlag4G,DataArpu,DataAllowanceContinuous,DeviceFlagSmartphone,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,MonthlyDataTraffic,CustomerGender,CustomerExpatriate,...,Security,Streaming-Applications,Terminals,Unclassified,VoIP,Web-Applications,CustomerAge,Product,Region,Province
6528,10511,0.0,0.186103,0.010514,1.0,0.028646,0.000250,0.006104,0.0,0.0,...,0.001731,0.378368,0.000000,0.024176,0.006683,0.549398,35.0,0,0.0,93.0
2817,4174,1.0,0.108560,0.038148,1.0,0.133464,0.009238,0.015368,1.0,0.0,...,0.011461,0.102020,0.000000,0.020124,0.011002,0.649518,55.0,2,0.0,71.0
660,4920,1.0,,0.013367,1.0,0.037109,0.002622,0.020042,0.0,1.0,...,0.001541,0.231534,0.000000,0.002384,0.004774,0.719529,45.0,2,0.0,42.0
159,7007,0.0,,,0.0,,,,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,2,0.0,42.0
5469,4129,0.0,0.124069,0.016219,1.0,0.028646,0.001748,0.024390,0.0,0.0,...,0.000013,0.615545,0.000000,0.000415,0.000474,0.365691,65.0,0,0.0,93.0
5489,3885,0.0,0.155086,0.038148,1.0,0.020833,0.000999,0.012410,1.0,0.0,...,0.017807,0.069277,0.000000,0.000931,0.000075,0.838331,45.0,0,0.0,24.0
1470,2269,1.0,0.232629,0.054729,1.0,0.005208,0.000250,0.000478,1.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,65.0,0,0.0,24.0
8519,10174,0.0,0.139577,0.021568,0.0,0.066406,0.000250,0.021907,0.0,0.0,...,0.002330,0.009797,0.000000,0.006117,0.002259,0.862120,25.0,0,0.0,24.0
2718,4810,0.0,0.277465,0.054729,1.0,0.141927,0.000000,0.074345,1.0,0.0,...,0.003874,0.397703,0.000000,0.000279,0.001592,0.534153,25.0,0,0.0,93.0
5563,948,0.0,0.031017,0.111780,1.0,0.091146,0.002622,0.087819,0.0,0.0,...,0.003233,0.174018,0.000000,0.074186,0.010749,0.539805,65.0,1,0.0,71.0


In [29]:
train_data['CustomerAge'].describe()
age_sort = sorted(train_data['CustomerAge'].dropna().unique())
age_sort
count = train_data.groupby('CustomerAge').count()
count

ages = train_data[['ID','CustomerAge']]
ages

count = ages.groupby('CustomerAge').count()
count = pd.DataFrame({'Count': count['ID']})
#regions = regions.join(count)

count
per = round(count/(len(train_data['CustomerAge'].dropna(axis=0)))*100)
per = pd.DataFrame({'Percentage': per['Count']})

count = count.join(per)
count
add = round(count['Percentage']/100*410)
add = pd.DataFrame({'AgesToAdd': add})

count = count.join(add)
count


Unnamed: 0_level_0,Count,Percentage,AgesToAdd
CustomerAge,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15.0,205,2.0,8.0
25.0,1489,16.0,66.0
35.0,1788,20.0,82.0
45.0,2329,25.0,102.0
55.0,1970,22.0,90.0
65.0,963,11.0,45.0
75.0,353,4.0,16.0
85.0,60,1.0,4.0


In [35]:
count['AgesToAdd'].sum()
count['AgesToAdd'][1]

KeyError: 1.0

In [31]:
import math
test = trainset[trainset.columns[trainset.isnull().any(axis = 0)]]
le = len(test)
ct = count['AgesToAdd'][0]
j = 0
for i in range(le):
    if(math.isnan(test['CustomerAge'][i])):
        test['CustomerAge'][i] = count.index[j]
        #print("ct: " + str(ct) + " region: " + str(count.index[j]) + " J: " + str(j))
        ct -=1
        if(ct <= 0.0):
            j += 1
            ct = count['AgesToAdd'][j]
test

KeyError: 0.0

In [27]:
# final_age = final_train.CustomerAge.fillna(final_train.CustomerAge.mean())
# final_train = final_train.drop('CustomerAge', axis=1)
# final_train = final_train.join(final_age)
# final_train

In [29]:
final_train = final_train.fillna(method='ffill')
final_train.isnull().sum()

ID                                0
DeviceFlag4G                      0
DataArpu                          0
DataAllowanceContinuous           0
DeviceFlagSmartphone              0
MonthlyVoiceTrafficCount          0
MonthlySMSTrafficCount            0
MonthlyDataTraffic                0
CustomerGender                    0
CustomerExpatriate                0
ZipCode                           0
ChurnScore                        0
AirportConnectionsDuration        0
AirportConnectionsCount           0
StationConnectionsDuration        0
StationConnectionsCount           0
ParkingConnectionsDuration        0
ParkingConnectionsCount           0
File-Transfer                     0
Games                             0
Instant-Messaging-Applications    0
Mail                              0
Music-Streaming                   0
Network-Operation                 0
P2P-Applications                  0
Security                          0
Streaming-Applications            0
Terminals                   

In [30]:
test_data.isnull().sum()

ID                                  0
DeviceFlag4G                        0
DataArpu                          757
DataAllowanceContinuous           388
DeviceFlagSmartphone                0
MonthlyVoiceTrafficCount          110
MonthlySMSTrafficCount            110
MonthlyDataTraffic                110
CustomerGender                      0
CustomerExpatriate                  0
ZipCode                           575
ChurnScore                        303
AirportConnectionsDuration          0
AirportConnectionsCount             0
StationConnectionsDuration          0
StationConnectionsCount             0
ParkingConnectionsDuration          0
ParkingConnectionsCount             0
File-Transfer                       0
Games                               0
Instant-Messaging-Applications      0
Mail                                0
Music-Streaming                     0
Network-Operation                   0
P2P-Applications                    0
Security                            0
Streaming-Ap

In [31]:
testset = test_data
testset = testset.drop('DataArpu', axis=1)
testset = testset.drop('DataAllowanceContinuous', axis=1)
testset = testset.drop('MonthlyVoiceTrafficCount', axis=1)
testset = testset.drop('ZipCode', axis=1)
testset = testset.drop('ChurnScore', axis=1)
testset = testset.drop('CustomerAge', axis=1)
testset = testset.drop('MonthlyDataTraffic', axis=1)
testset = testset.drop('Province', axis=1)
testset = testset.drop('MonthlySMSTrafficCount', axis=1)

In [32]:
test_data['Region'].describe()
regions_sort = sorted(test_data['Region'].dropna().unique())
regions_sort
count = test_data.groupby('Region').count()
count

regions = test_data[['ID','Region']]
regions

count = regions.groupby('Region').count()
count = pd.DataFrame({'Count': count['ID']})
#regions = regions.join(count)

count
per = round(count/(len(test_data['Region'].dropna(axis=0)))*100)
per = pd.DataFrame({'Percentage': per['Count']})

count = count.join(per)
count
add = round(count['Percentage']/100*666)
add = pd.DataFrame({'RegionToAdd': add})

count = count.join(add)
count

Unnamed: 0_level_0,Count,Percentage,RegionToAdd
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,65,3.0,20.0
1.0,43,2.0,13.0
2.0,97,4.0,27.0
3.0,203,8.0,53.0
4.0,123,5.0,33.0
5.0,37,1.0,7.0
6.0,246,10.0,67.0
7.0,103,4.0,27.0
8.0,486,19.0,127.0
9.0,39,2.0,13.0


In [33]:
test_data.shape
test = test_data[test_data.columns[test_data.isnull().any(axis = 0)]]
count.index[0]
test

Unnamed: 0,DataArpu,DataAllowanceContinuous,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,MonthlyDataTraffic,ZipCode,ChurnScore,CustomerAge,Region,Province
0,0.155086,0.043853,0.268229,0.000375,0.077037,70010.0,0.367813,25.0,12.0,7.0
1,0.108560,0.021746,0.016927,0.000000,0.016237,80021.0,0.084914,25.0,3.0,59.0
2,0.263457,0.054729,0.050130,0.000000,0.029598,55040.0,0.269867,55.0,15.0,49.0
3,,,,,,,,,,
4,0.309983,0.060434,0.079427,0.000999,0.037898,16153.0,0.080718,35.0,7.0,37.0
5,0.400149,0.006413,0.058594,0.015231,0.006337,9010.0,0.176372,45.0,13.0,17.0
6,0.112562,0.021568,0.148438,0.000375,0.008432,30174.0,0.033990,55.0,19.0,103.0
7,0.155086,0.043853,0.052734,0.000375,0.039639,67039.0,0.009130,25.0,0.0,42.0
8,0.154708,0.010514,0.040365,0.000624,0.008304,,0.143126,25.0,,
9,0.108560,0.054729,0.018880,0.001623,0.021754,20871.0,0.080713,25.0,8.0,58.0


In [34]:
test = testset[testset.columns[testset.isnull().any(axis = 0)]]
count.index[0]
test

Unnamed: 0,Region
0,12.0
1,3.0
2,15.0
3,
4,7.0
5,13.0
6,19.0
7,0.0
8,
9,8.0


In [35]:
import math
test = testset[testset.columns[testset.isnull().any(axis = 0)]]
le = len(test)
ct = count['RegionToAdd'][0]
j = 0
for i in range(le):
    if(math.isnan(test['Region'][i])):
        test['Region'][i] = count.index[j]
        #print("ct: " + str(ct) + " region: " + str(count.index[j]) + " J: " + str(j))
        ct -=1
        if(ct <= 0.0):
            j += 1
            ct = count['RegionToAdd'][j]
test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Region
0,12.0
1,3.0
2,15.0
3,0.0
4,7.0
5,13.0
6,19.0
7,0.0
8,0.0
9,8.0


In [36]:
final_test = test_data
final_test = final_test.drop('Region', axis=1)
final_test = final_test.join(test['Region'])
final_test.Region.describe()

count    3190.000000
mean        9.234796
std         5.094994
min         0.000000
25%         6.000000
50%         8.000000
75%        13.000000
max        19.000000
Name: Region, dtype: float64

In [37]:
final_test = final_test.sort_values(by='Region')
final_province = final_test['Province'].fillna(method='ffill')
final_province
final_test = final_test.drop('Province', axis=1)
final_test = final_test.join(final_province)
final_test

Unnamed: 0,ID,DeviceFlag4G,DataArpu,DataAllowanceContinuous,DeviceFlagSmartphone,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,MonthlyDataTraffic,CustomerGender,CustomerExpatriate,...,P2P-Applications,Security,Streaming-Applications,Terminals,Unclassified,VoIP,Web-Applications,CustomerAge,Region,Province
2825,10909,1.0,0.278966,0.088247,1.0,0.087240,0.002372,0.117077,0.0,0.0,...,1.191584e-03,0.000095,0.094630,0.000000e+00,0.000751,0.010485,0.866945,25.0,0.0,71.0
1527,11055,1.0,0.511595,0.054729,1.0,0.143229,0.001873,0.023151,1.0,0.0,...,5.463317e-05,0.007726,0.097810,0.000000e+00,0.002750,0.007511,0.522451,55.0,0.0,42.0
2090,1346,0.0,0.108560,0.043853,1.0,0.035807,0.000125,0.021345,1.0,1.0,...,0.000000e+00,0.001035,0.210621,4.175634e-06,0.005871,0.001355,0.280302,55.0,0.0,42.0
138,11102,0.0,0.232629,0.027273,1.0,0.127604,0.001873,0.036968,0.0,0.0,...,1.019815e-04,0.002331,0.217483,0.000000e+00,0.002814,0.000254,0.687266,35.0,0.0,24.0
366,7405,0.0,0.449560,0.115703,1.0,0.219401,0.002871,0.086984,1.0,0.0,...,9.328607e-07,0.000699,0.420700,0.000000e+00,0.005276,0.000626,0.512064,35.0,0.0,42.0
763,7997,1.0,0.232629,0.054729,1.0,0.117188,0.002747,0.051845,1.0,0.0,...,4.907679e-05,0.032214,0.263554,0.000000e+00,0.001407,0.000310,0.622411,35.0,0.0,42.0
2317,11283,1.0,0.108560,0.016041,1.0,0.017578,0.000749,0.006729,1.0,0.0,...,0.000000e+00,0.014806,0.007189,0.000000e+00,0.010173,0.000397,0.793446,65.0,0.0,24.0
2892,1634,0.0,0.232629,0.054729,1.0,0.027344,0.006866,0.067243,1.0,0.0,...,2.192990e-03,0.000645,0.205426,0.000000e+00,0.003420,0.001067,0.687632,35.0,0.0,42.0
116,3490,0.0,,,1.0,0.053385,0.001124,0.094825,0.0,0.0,...,1.334996e-03,0.010495,0.039624,0.000000e+00,0.000575,0.000090,0.898188,15.0,0.0,42.0
115,2187,0.0,,0.010514,0.0,0.001302,0.000000,0.000000,0.0,0.0,...,0.000000e+00,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,45.0,0.0,42.0


In [38]:
final_age = final_test.CustomerAge.fillna(final_test.CustomerAge.mean())
final_test = final_test.drop('CustomerAge', axis=1)
final_test = final_test.join(final_age)
final_test

Unnamed: 0,ID,DeviceFlag4G,DataArpu,DataAllowanceContinuous,DeviceFlagSmartphone,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,MonthlyDataTraffic,CustomerGender,CustomerExpatriate,...,P2P-Applications,Security,Streaming-Applications,Terminals,Unclassified,VoIP,Web-Applications,Region,Province,CustomerAge
2825,10909,1.0,0.278966,0.088247,1.0,0.087240,0.002372,0.117077,0.0,0.0,...,1.191584e-03,0.000095,0.094630,0.000000e+00,0.000751,0.010485,0.866945,0.0,71.0,25.0
1527,11055,1.0,0.511595,0.054729,1.0,0.143229,0.001873,0.023151,1.0,0.0,...,5.463317e-05,0.007726,0.097810,0.000000e+00,0.002750,0.007511,0.522451,0.0,42.0,55.0
2090,1346,0.0,0.108560,0.043853,1.0,0.035807,0.000125,0.021345,1.0,1.0,...,0.000000e+00,0.001035,0.210621,4.175634e-06,0.005871,0.001355,0.280302,0.0,42.0,55.0
138,11102,0.0,0.232629,0.027273,1.0,0.127604,0.001873,0.036968,0.0,0.0,...,1.019815e-04,0.002331,0.217483,0.000000e+00,0.002814,0.000254,0.687266,0.0,24.0,35.0
366,7405,0.0,0.449560,0.115703,1.0,0.219401,0.002871,0.086984,1.0,0.0,...,9.328607e-07,0.000699,0.420700,0.000000e+00,0.005276,0.000626,0.512064,0.0,42.0,35.0
763,7997,1.0,0.232629,0.054729,1.0,0.117188,0.002747,0.051845,1.0,0.0,...,4.907679e-05,0.032214,0.263554,0.000000e+00,0.001407,0.000310,0.622411,0.0,42.0,35.0
2317,11283,1.0,0.108560,0.016041,1.0,0.017578,0.000749,0.006729,1.0,0.0,...,0.000000e+00,0.014806,0.007189,0.000000e+00,0.010173,0.000397,0.793446,0.0,24.0,65.0
2892,1634,0.0,0.232629,0.054729,1.0,0.027344,0.006866,0.067243,1.0,0.0,...,2.192990e-03,0.000645,0.205426,0.000000e+00,0.003420,0.001067,0.687632,0.0,42.0,35.0
116,3490,0.0,,,1.0,0.053385,0.001124,0.094825,0.0,0.0,...,1.334996e-03,0.010495,0.039624,0.000000e+00,0.000575,0.000090,0.898188,0.0,42.0,15.0
115,2187,0.0,,0.010514,0.0,0.001302,0.000000,0.000000,0.0,0.0,...,0.000000e+00,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.0,42.0,45.0


In [39]:
final_test = final_test.fillna(method='ffill')
final_test.isnull().sum()

ID                                0
DeviceFlag4G                      0
DataArpu                          0
DataAllowanceContinuous           0
DeviceFlagSmartphone              0
MonthlyVoiceTrafficCount          0
MonthlySMSTrafficCount            0
MonthlyDataTraffic                0
CustomerGender                    0
CustomerExpatriate                0
ZipCode                           0
ChurnScore                        0
AirportConnectionsDuration        0
AirportConnectionsCount           0
StationConnectionsDuration        0
StationConnectionsCount           0
ParkingConnectionsDuration        0
ParkingConnectionsCount           0
File-Transfer                     0
Games                             0
Instant-Messaging-Applications    0
Mail                              0
Music-Streaming                   0
Network-Operation                 0
P2P-Applications                  0
Security                          0
Streaming-Applications            0
Terminals                   

In [40]:
pt = final_train.Product
final_train = final_train.drop('Product', axis=1)
final_train = final_train.join(pt)

In [41]:
final_train.shape, final_test.shape

((9567, 35), (3190, 34))

In [42]:
final_train.to_csv('normalized_train_dataset_dis.csv', index = False, encoding='utf-8')
final_test.to_csv('normalized_test_dataset_dis.csv', index = False, encoding='utf-8')