In [62]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Import the train set and test set
train_data = pd.read_csv("trainset.csv", delimiter=",")
test_data = pd.read_csv("testset.csv", delimiter=",")

In [63]:
def normalize_costumer_age(dataset):
    #Take all unique Cosumer_Age, assigns to each age alphabetically ordered a mean age
    customer_age_sort = sorted(dataset['CustomerAge'].dropna().unique())

    mapping = {}
    for idx, val in enumerate(customer_age_sort):
        mapping[val] = int(15+idx*10)
        
    ages = dataset['CustomerAge']
    ages.replace(mapping, inplace=True)

    #print(mapping, regions)
    ages = np.asarray(ages)
    ages = pd.DataFrame({'CustomerAge': ages})
    
     # Remove the previous CustomerAge and then insert the new CustomerAge
    dataset = dataset.drop('CustomerAge', axis=1)
    dataset = dataset.join(ages)
    return dataset

In [64]:
def normalize_region(dataset):
    #Take all unique Regions, assigns to each region alphabetically ordered a monotonic growing number
    regions_sort = sorted(dataset['Region'].dropna().unique())
    #regions_len = len(regions_sort) #20 Regions

    mapping = {}
    for idx, val in enumerate(regions_sort):
        mapping[val] = int(idx)

    regions = dataset['Region']
    regions.replace(mapping, inplace=True)

    #print(mapping, regions)
    regions = np.asarray(regions)
    regions = pd.DataFrame({'Region': regions})
    
    # Remove the previous Region columns and then insert the new Region
    dataset = dataset.drop('Region', axis=1)
    dataset = dataset.join(regions)
    return dataset

In [65]:
def normalize_province(dataset):
    #Take all unique Provinces, assigns to each province alphabetically ordered a monotonic growing number
    provinces_sort = sorted(dataset['Province'].dropna().unique())
    #provinces_len = len(provinces_sort) #110 Provinces

    mapping = {}
    for idx, val in enumerate(provinces_sort):
        mapping[val] = idx

    provinces = dataset['Province']
    provinces.replace(mapping, inplace=True)


    #print(provinces_sort, provinces_len)
    #print(mapping, provinces)
    provinces = np.asarray(provinces)
    provinces = pd.DataFrame({'Province': provinces})
    
    # Remove the previous Province columns and then insert the new Province
    dataset = dataset.drop('Province', axis=1)
    dataset = dataset.join(provinces)
    return dataset

In [66]:
def normalize_product(dataset):
    if('Product' in dataset.columns):
        #Take all unique Products, assigns to each product alphabetically ordered a monotonic growing number
        products_sort = sorted(dataset['Product'].dropna().unique())

        mapping = {}
        for idx, val in enumerate(products_sort):
            mapping[val] = int(idx)

        products = dataset['Product']
        products.replace(mapping, inplace=True)

        #print(mapping, products)
        products = np.asarray(products)
        products = pd.DataFrame({'Product': products})

        # Remove the previous Product columns and then insert the newProduct
        dataset = dataset.drop('Product', axis=1)
        dataset = dataset.join(products)
    return dataset

In [67]:
def drop_useless_columns(dataset):
    if('DataAllowanceOneShot' in dataset.columns and 'EstimatedDevicePrice' in dataset.columns and 'ZipCode' in dataset.columns):
        dataset = dataset.drop('DataAllowanceOneShot', axis=1)
        dataset = dataset.drop('EstimatedDevicePrice', axis=1)
        dataset = dataset.drop('ZipCode', axis=1)
    return dataset

In [68]:
def add_modified_sample(dataset):
    modified = []
    for i in dataset.isnull().any(axis=1):
        if(i):
            modified.append(1)
        else:
            modified.append(0)
    modified = np.asarray(modified)
    modified = pd.DataFrame({'IsModified': modified})
    dataset = dataset.join(modified)
    return dataset

In [69]:
def normalize_data_set(dataset):
    dataset = drop_useless_columns(dataset)
    dataset = add_modified_sample(dataset)
    dataset = normalize_costumer_age(dataset)
    dataset = normalize_region(dataset)
    dataset = normalize_province(dataset)
    dataset = normalize_product(dataset)
    return dataset

In [70]:
train_data = normalize_data_set(train_data)
test_data = normalize_data_set(test_data)
train_data.head()

Unnamed: 0,ID,DeviceFlag4G,DataArpu,DataAllowanceContinuous,DeviceFlagSmartphone,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,MonthlyDataTraffic,CustomerGender,CustomerExpatriate,...,Streaming-Applications,Terminals,Unclassified,VoIP,Web-Applications,IsModified,CustomerAge,Region,Province,Product
0,10930,1.0,0.156221,0.010514,1.0,0.018229,0.001623,0.011007,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,45.0,16.0,14.0,0
1,10170,0.0,0.155086,0.054729,1.0,0.051432,0.001498,0.039382,0.0,0.0,...,0.071551,0.0,0.001896,5.8e-05,0.77541,1,25.0,,,0
2,1492,1.0,,,1.0,0.058594,0.001248,0.033031,1.0,1.0,...,0.099438,0.0,0.012055,0.003579,0.499158,1,35.0,,,0
3,7424,1.0,0.155086,0.004987,1.0,0.097005,0.000499,0.002928,0.0,0.0,...,0.041474,0.0,0.051692,0.013263,0.301528,0,55.0,8.0,15.0,2
4,4332,1.0,0.155086,0.038148,1.0,0.034505,0.000375,0.001002,1.0,0.0,...,0.005975,0.0,0.000857,0.019149,0.926208,0,65.0,19.0,103.0,0


In [71]:
train_data.columns

Index(['ID', 'DeviceFlag4G', 'DataArpu', 'DataAllowanceContinuous',
       'DeviceFlagSmartphone', 'MonthlyVoiceTrafficCount',
       'MonthlySMSTrafficCount', 'MonthlyDataTraffic', 'CustomerGender',
       'CustomerExpatriate', 'ChurnScore', 'AirportConnectionsDuration',
       'AirportConnectionsCount', 'StationConnectionsDuration',
       'StationConnectionsCount', 'ParkingConnectionsDuration',
       'ParkingConnectionsCount', 'File-Transfer', 'Games',
       'Instant-Messaging-Applications', 'Mail', 'Music-Streaming',
       'Network-Operation', 'P2P-Applications', 'Security',
       'Streaming-Applications', 'Terminals', 'Unclassified', 'VoIP',
       'Web-Applications', 'IsModified', 'CustomerAge', 'Region', 'Province',
       'Product'],
      dtype='object')

In [56]:
# binary: DeviceFlag4G, DeviceFlagSmartphone, CustomerGender, CustomerExpatriate

# CHECKPOINT

In [72]:
trainset = train_data
testset = test_data

## Regions

In [86]:
import math

def fill_nan(field, datap):
    data = datap
    nnan = data.isnull()[field].sum()
    field_id = data[['ID',field]]
    field_count = field_id.groupby(field).count()
    field_per = round(field_count/(len(data[field].dropna(axis=0)))*100)
    field_2badd = round(field_per/100*nnan)
    
    print(len(field_2badd))
    print(field_2badd)
    
    le = len(data)
    ct = field_2badd.values[0]
    j = 0
    for i in range(le):
        print(ct)
        if(math.isnan(data[field][i])):
            data[field][i] = field_2badd.index[j]
            ct -=1
            if(ct <= 0.0 and j < len(field_2badd)):
                j += 1
                print(ct)
                ct = field_2badd.values[j]
    return data

In [74]:
trainset = fill_nan('Region', trainset)
testset = fill_nan('Region', testset)
trainset.Region.head(),testset.Region.head()

[40.]
[40.]
[39.]
[38.]
[38.]
[38.]
[37.]
[37.]
[37.]
[37.]
[37.]
[37.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[35.]
[35.]
[35.]
[34.]
[34.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[32.]
[32.]
[32.]
[32.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[30.]
[30.]
[30.]
[30.]
[29.]
[29.]
[29.]
[29.]
[29.]
[29.]
[28.]
[28.]
[28.]
[27.]
[27.]
[27.]
[27.]
[27.]
[26.]
[26.]
[26.]
[26.]
[25.]
[25.]
[25.]
[25.]
[25.]
[25.]
[25.]
[25.]
[24.]
[23.]
[23.]
[23.]
[23.]
[22.]
[22.]
[21.]
[21.]
[20.]
[19.]
[19.]
[19.]
[19.]
[19.]
[19.]
[18.]
[18.]
[17.]
[16.]
[16.]
[15.]
[15.]
[14.]
[13.]
[12.]
[12.]
[12.]
[12.]
[12.]
[12.]
[12.]
[12.]
[12.]
[12.]
[11.]
[11.]
[11.]
[11.]
[11.]
[11.]
[11.]
[11.]
[11.]
[11.]
[11.]
[10.]
[10.]
[10.]
[10.]
[10.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]


[54.]
[53.]
[53.]
[53.]
[53.]
[53.]
[53.]
[53.]
[52.]
[52.]
[52.]
[52.]
[52.]
[52.]
[51.]
[51.]
[51.]
[51.]
[51.]
[50.]
[50.]
[49.]
[48.]
[47.]
[46.]
[46.]
[46.]
[45.]
[45.]
[44.]
[44.]
[44.]
[44.]
[44.]
[44.]
[43.]
[42.]
[42.]
[41.]
[41.]
[40.]
[40.]
[40.]
[40.]
[39.]
[39.]
[39.]
[39.]
[39.]
[39.]
[39.]
[39.]
[39.]
[39.]
[39.]
[38.]
[38.]
[38.]
[37.]
[37.]
[37.]
[37.]
[37.]
[37.]
[37.]
[37.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[35.]
[35.]
[35.]
[34.]
[34.]
[34.]
[34.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[33.]
[32.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[31.]
[30.]
[30.]
[29.]
[29.]
[29.]
[29.]
[29.]
[29.]
[29.]
[28.]
[28.]
[28.]
[27.]
[27.]
[26.]
[26.]
[26.]
[26.]
[26.]
[26.]
[26.]
[25.]
[25.]
[25.]
[25.]
[25.]
[25.]
[25.]
[24.]
[23.]
[23.]
[22.]
[22.]
[21.]
[21.]
[20.]
[20.]
[20.]
[19.]
[19.]
[19.]
[19.]
[19.]
[18.]
[18.]
[17.]
[16.]
[16.]
[16.]
[16.]
[16.]
[16.]
[16.]
[16.]
[16.]
[16.]
[16.]
[16.]
[15.]
[14.]
[14.]
[14.]
[14.]
[14.]
[13.]
[13.]
[13.]
[13.]
[13.

[102.]
[101.]
[101.]
[101.]
[101.]
[100.]
[99.]
[99.]
[99.]
[99.]
[99.]
[99.]
[99.]
[99.]
[98.]
[98.]
[98.]
[98.]
[98.]
[98.]
[98.]
[98.]
[98.]
[97.]
[97.]
[96.]
[96.]
[96.]
[96.]
[96.]
[96.]
[96.]
[95.]
[95.]
[95.]
[95.]
[95.]
[95.]
[95.]
[95.]
[95.]
[95.]
[94.]
[94.]
[94.]
[94.]
[94.]
[94.]
[94.]
[94.]
[94.]
[94.]
[93.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[92.]
[91.]
[91.]
[91.]
[91.]
[90.]
[89.]
[89.]
[89.]
[89.]
[88.]
[88.]
[87.]
[86.]
[86.]
[85.]
[85.]
[85.]
[84.]
[84.]
[84.]
[83.]
[83.]
[83.]
[83.]
[83.]
[82.]
[82.]
[82.]
[82.]
[82.]
[81.]
[80.]
[80.]
[80.]
[80.]
[80.]
[80.]
[80.]
[80.]
[79.]
[78.]
[77.]
[77.]
[77.]
[76.]
[76.]
[76.]
[76.]
[75.]
[75.]
[75.]
[74.]
[74.]
[74.]
[74.]
[74.]
[74.]
[73.]
[73.]
[73.]
[73.]
[72.]
[71.]
[71.]
[71.]
[71.]
[71.]
[71.]
[71.]
[70.]
[69.]
[69.]
[69.]
[69.]
[68.]
[68.]
[68.]
[68.]
[68.]
[68.]
[68.]
[68.]
[68.]
[68.]
[68.]
[67.]
[67.]
[66.]
[66.]
[66.]
[66.]
[66.]
[66.]
[66.

[276.]
[276.]
[276.]
[275.]
[274.]
[274.]
[273.]
[273.]
[273.]
[273.]
[272.]
[272.]
[272.]
[272.]
[272.]
[272.]
[271.]
[270.]
[270.]
[269.]
[269.]
[269.]
[269.]
[269.]
[268.]
[268.]
[268.]
[268.]
[267.]
[267.]
[267.]
[266.]
[266.]
[266.]
[266.]
[266.]
[265.]
[264.]
[264.]
[263.]
[263.]
[263.]
[263.]
[263.]
[263.]
[262.]
[262.]
[262.]
[262.]
[262.]
[262.]
[261.]
[261.]
[260.]
[260.]
[260.]
[260.]
[259.]
[258.]
[258.]
[258.]
[258.]
[257.]
[257.]
[257.]
[257.]
[256.]
[256.]
[256.]
[256.]
[256.]
[256.]
[256.]
[255.]
[254.]
[254.]
[254.]
[253.]
[253.]
[252.]
[252.]
[252.]
[252.]
[252.]
[251.]
[251.]
[251.]
[250.]
[250.]
[250.]
[250.]
[250.]
[250.]
[250.]
[250.]
[250.]
[250.]
[250.]
[250.]
[249.]
[248.]
[248.]
[248.]
[247.]
[247.]
[246.]
[246.]
[245.]
[245.]
[245.]
[245.]
[244.]
[244.]
[243.]
[242.]
[242.]
[242.]
[241.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[240.]
[239.]
[239.]
[238.]
[237.]
[237.]
[237.]
[237.]
[237.]

[35.]
[35.]
[34.]
[34.]
[34.]
[33.]
[33.]
[33.]
[33.]
[32.]
[31.]
[31.]
[31.]
[30.]
[30.]
[30.]
[30.]
[29.]
[29.]
[29.]
[29.]
[29.]
[29.]
[29.]
[29.]
[28.]
[28.]
[28.]
[28.]
[28.]
[27.]
[27.]
[27.]
[27.]
[26.]
[26.]
[26.]
[26.]
[26.]
[25.]
[25.]
[25.]
[25.]
[24.]
[24.]
[24.]
[24.]
[24.]
[24.]
[24.]
[24.]
[24.]
[23.]
[23.]
[23.]
[23.]
[23.]
[23.]
[23.]
[23.]
[23.]
[22.]
[22.]
[21.]
[21.]
[21.]
[21.]
[21.]
[21.]
[21.]
[20.]
[20.]
[20.]
[19.]
[18.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[16.]
[15.]
[14.]
[14.]
[14.]
[14.]
[14.]
[14.]
[14.]
[14.]
[14.]
[14.]
[13.]
[13.]
[12.]
[11.]
[11.]
[10.]
[10.]
[9.]
[9.]
[9.]
[9.]
[8.]
[8.]
[7.]
[7.]
[6.]
[6.]
[6.]
[6.]
[5.]
[5.]
[4.]
[3.]
[3.]
[3.]
[3.]
[3.]
[3.]
[2.]
[2.]
[2.]
[2.]
[2.]
[1.]
[0.]
[20.]
[20.]
[20.]
[20.]
[20.]
[20.]
[20.]
[20.]
[20.]
[20.]
[20.]
[20.]
[19.]
[19.]
[18.]
[18.]
[18.]
[18.]
[18.]
[18.]
[18.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[17.]
[16.]
[16.]
[15.]
[15.]
[15.]
[1

[70.]
[70.]
[70.]
[70.]
[70.]
[70.]
[70.]
[70.]
[70.]
[70.]
[70.]
[69.]
[69.]
[68.]
[68.]
[68.]
[67.]
[67.]
[67.]
[67.]
[66.]
[66.]
[65.]
[65.]
[65.]
[65.]
[65.]
[65.]
[64.]
[63.]
[63.]
[63.]
[63.]
[62.]
[62.]
[62.]
[62.]
[61.]
[60.]
[59.]
[59.]
[58.]
[58.]
[58.]
[58.]
[57.]
[57.]
[57.]
[56.]
[56.]
[56.]
[56.]
[56.]
[56.]
[56.]
[56.]
[56.]
[55.]
[55.]
[55.]
[55.]
[55.]
[55.]
[55.]
[54.]
[54.]
[54.]
[54.]
[54.]
[54.]
[54.]
[54.]
[54.]
[53.]
[53.]
[53.]
[53.]
[53.]
[52.]
[52.]
[52.]
[52.]
[52.]
[52.]
[52.]
[52.]
[51.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[49.]
[49.]
[48.]
[48.]
[48.]
[48.]
[48.]
[48.]
[47.]
[47.]
[47.]
[47.]
[47.]
[47.]
[46.]
[46.]
[45.]
[44.]
[44.]
[43.]
[43.]
[43.]
[42.]
[42.]
[42.]
[42.]
[42.]
[42.]
[42.]
[41.]
[41.]
[41.]
[41.]
[40.]
[40.]
[40.]
[40.]
[39.]
[39.]
[39.]
[39.]
[39.]
[39.]
[38.]
[37.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[35.]
[34.]
[34.]
[33.]
[33.]
[32.]
[32.]
[32.]
[32.]
[32.]
[32.]
[32.]
[32.]
[32.]
[32.]
[32.

[68.]
[68.]
[67.]
[67.]
[66.]
[66.]
[66.]
[65.]
[64.]
[63.]
[63.]
[63.]
[63.]
[63.]
[63.]
[63.]
[63.]
[62.]
[62.]
[61.]
[61.]
[61.]
[61.]
[61.]
[61.]
[61.]
[61.]
[61.]
[61.]
[60.]
[60.]
[60.]
[60.]
[59.]
[59.]
[59.]
[59.]
[59.]
[59.]
[58.]
[57.]
[57.]
[57.]
[57.]
[57.]
[56.]
[56.]
[55.]
[54.]
[54.]
[54.]
[54.]
[54.]
[53.]
[52.]
[52.]
[52.]
[51.]
[51.]
[51.]
[51.]
[51.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[49.]
[48.]
[48.]
[48.]
[48.]
[48.]
[47.]
[47.]
[46.]
[45.]
[45.]
[45.]
[45.]
[45.]
[44.]
[43.]
[43.]
[43.]
[43.]
[43.]
[43.]
[43.]
[42.]
[41.]
[41.]
[41.]
[40.]
[40.]
[39.]
[39.]
[39.]
[38.]
[38.]
[37.]
[37.]
[37.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[36.]
[35.]
[35.]
[35.]
[35.]
[35.]
[35.]
[34.]
[34.]
[34.]
[34.]
[33.]
[33.]
[32.]
[32.]
[32.]
[32.]
[32.]
[32.]
[31.]
[31.]
[31.]
[30.]
[30.]
[30.]
[30.]
[29.]
[29.]
[29.]
[29.]
[29.]
[29.]
[29.]
[29.]
[29.]
[28.]
[28.]
[28.]
[28.]
[28.]
[27.]
[27.]
[27.]
[27.]
[27.]
[26.]
[26.

[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[4.]
[4.]
[4.]
[3.]
[3.]
[3.]
[3.]
[2.]
[2.]
[2.]
[2.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.]
[27.]
[27.]
[26.]
[26.]
[26.]
[26.]
[26.]
[26.]
[26.]
[26.]
[26.]
[25.]
[25.]
[25.]
[25.]
[25.]
[25.]
[24.]
[23.]
[23.]
[23.]
[22.]
[22.]
[22.]
[21.]
[21.]
[20.]
[20.]
[20.]
[20.]
[19.]
[19.]
[19.]
[19.]
[18.]
[18.]
[18.]
[18.]
[18.]
[18.]
[17.]
[17.]
[17.]
[17.]
[17.]
[16.]
[16.]
[15.]
[15.]
[14.]
[13.]
[13.]
[12.]
[11.]
[11.]
[11.]
[11.]
[10.]
[10.]
[10.]
[10.]
[9.]
[9.]
[8.]
[8.]
[8.]
[8.]
[8.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[6.]
[6.]
[5.]
[5.]
[5.]
[5.]
[5.]
[4.]
[4.]
[4.]
[4.]
[4.]
[3.]
[2.]
[1.]
[0.]
[53.]
[53.]
[53.]
[53.]
[52.]
[51.]
[50.]
[50.]
[50.]
[50.]
[50.]
[50.]
[49.]
[49.]
[49.]
[48.]
[48.]
[48.]
[48.]
[47.]
[47.]
[46.]
[46.]
[46.]
[46.]
[46.]
[45.]
[45.]
[45.]
[45.]
[45.]
[45.]
[45.]
[44.]
[44.]
[44.]
[44.]
[44.]
[44.]
[44.]
[44.]
[43.]
[43.]
[42.]
[42.]
[41.]
[41.]
[41.]
[

[19.]
[19.]
[19.]
[19.]
[19.]
[19.]
[19.]
[18.]
[18.]
[17.]
[17.]
[16.]
[16.]
[16.]
[16.]
[16.]
[16.]
[16.]
[15.]
[14.]
[14.]
[14.]
[14.]
[13.]
[12.]
[12.]
[11.]
[11.]
[11.]
[11.]
[10.]
[10.]
[10.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[8.]
[8.]
[8.]
[8.]
[8.]
[7.]
[7.]
[6.]
[6.]
[6.]
[6.]
[6.]
[6.]
[5.]
[4.]
[3.]
[3.]
[3.]
[3.]
[2.]
[2.]
[2.]
[2.]
[2.]
[2.]
[2.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.]
[13.]
[13.]
[13.]
[13.]
[13.]
[13.]
[12.]
[12.]
[11.]
[11.]
[11.]
[10.]
[10.]
[10.]
[10.]
[9.]
[9.]
[8.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[7.]
[6.]
[5.]
[4.]
[4.]
[4.]
[4.]
[4.]
[4.]
[4.]
[3.]
[2.]
[2.]
[2.]
[2.]
[2.]
[2.]
[2.]
[2.]
[1.]
[1.]
[1.]
[1.]
[1.]
[0.]
[0.]
[0.]
[-1.]
[60.]
[60.]
[60.]
[59.]
[59.]
[59.]
[59.]
[59.]
[59.]
[59.]
[58.]
[58.]
[58.]
[57.]
[57.]
[56.]


[13.]
[13.]
[13.]
[13.]
[13.]
[12.]
[12.]
[11.]
[11.]
[10.]
[10.]
[10.]
[10.]
[10.]
[10.]
[10.]
[10.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[9.]
[8.]
[8.]
[8.]
[8.]
[8.]
[8.]
[7.]
[6.]
[6.]
[6.]
[6.]
[6.]
[6.]
[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[5.]
[4.]
[4.]


(0    16.0
 1     0.0
 2     0.0
 3     8.0
 4    19.0
 Name: Region, dtype: float64, 0    12.0
 1     3.0
 2    15.0
 3     0.0
 4     7.0
 Name: Region, dtype: float64)

In [76]:
print(trainset.shape)
print(trainset['Region'].describe())

(9567, 35)
count    9567.000000
mean        9.137347
std         4.999765
min         0.000000
25%         6.000000
50%         8.000000
75%        13.000000
max        19.000000
Name: Region, dtype: float64


In [77]:
print(testset.shape)
print(testset['Region'].describe())

(3190, 34)
count    3190.000000
mean        9.234796
std         5.094994
min         0.000000
25%         6.000000
50%         8.000000
75%        13.000000
max        19.000000
Name: Region, dtype: float64


In [78]:
def ohe_region(datap):
    data = datap
    region_ohe = data['Region']

    nord_regions = [4,5,7,8,11,17,18,19]
    nord = []
    centro_regions = [0,6,9,10,15,16]
    centro = []
    sud_regions = [1,2,3,12,13,14]
    sud = []

    for i in region_ohe.values:
        if i in nord_regions:
            nord.append(1)
            centro.append(0)
            sud.append(0)
        elif i in centro_regions:
            nord.append(0)
            centro.append(1)
            sud.append(0)    
        elif i in sud_regions:
            nord.append(0)
            centro.append(0)
            sud.append(1)
        else:
            nord.append(np.nan)
            centro.append(np.nan)
            sud.append(np.nan)

    no = pd.DataFrame({'North_Region' : nord})
    ce = pd.DataFrame({'Middle_Region' : centro})
    su = pd.DataFrame({'South_Region' : sud})

    data = data.join(no)
    data = data.join(ce)
    data = data.join(su)
    return data

In [79]:
trainset = ohe_region(trainset)
testset = ohe_region(testset)
trainset.Region.head(),testset.Region.head()

(0    16.0
 1     0.0
 2     0.0
 3     8.0
 4    19.0
 Name: Region, dtype: float64, 0    12.0
 1     3.0
 2    15.0
 3     0.0
 4     7.0
 Name: Region, dtype: float64)

In [80]:
print(trainset.shape)
print(trainset['North_Region'].describe())

(9567, 38)
count    9567.000000
mean        0.478102
std         0.499546
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: North_Region, dtype: float64


In [81]:
print(testset.shape)
print(testset['North_Region'].describe())

(3190, 37)
count    3190.000000
mean        0.483699
std         0.499813
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: North_Region, dtype: float64


## Provinces

In [82]:
trainset = trainset.sort_values(by='Region')
trainset['Province'] = trainset['Province'].fillna(method='ffill')
testset = testset.sort_values(by='Region')
testset['Province'] = testset['Province'].fillna(method='ffill')
trainset['Province'].head(5),testset['Province'].head(5)

(6528    93.0
 2817    71.0
 660     42.0
 159     42.0
 5469    93.0
 Name: Province, dtype: float64, 2825    71.0
 1527    42.0
 2090    42.0
 138     24.0
 366     42.0
 Name: Province, dtype: float64)

In [83]:
print(trainset.shape)
print(trainset['Province'].describe())

(9567, 38)
count    9567.000000
mean       56.406606
std        31.561575
min         0.000000
25%        26.000000
50%        58.000000
75%        84.000000
max       109.000000
Name: Province, dtype: float64


In [84]:
print(testset.shape)
print(testset['Province'].describe())

(3190, 37)
count    3190.000000
mean       56.869906
std        31.473508
min         0.000000
25%        27.000000
50%        58.000000
75%        84.000000
max       109.000000
Name: Province, dtype: float64


## CustomerAge

In [87]:
#trainset = fill_nan('CustomerAge', trainset)
testset = test_data
testset = fill_nan('CustomerAge', testset)
trainset.CustomerAge.head(), testset.CustomerAge.shape

8
              ID
CustomerAge     
15.0         0.0
25.0         0.0
35.0         0.0
45.0         0.0
55.0         0.0
65.0         0.0
75.0         0.0
85.0         0.0
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.

[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]
[0.]


(6528    35.0
 2817    55.0
 660     45.0
 159      NaN
 5469    65.0
 Name: CustomerAge, dtype: float64, (3190,))

In [88]:
test_data.CustomerAge.isnull().sum()

0

In [34]:
print(trainset.shape)
print(trainset['CustomerAge'].describe())

(9567, 38)
count    9567.000000
mean       44.799310
std        14.600451
min        15.000000
25%        35.000000
50%        45.000000
75%        55.000000
max        85.000000
Name: CustomerAge, dtype: float64


In [35]:
print(testset.shape)
print(testset['CustomerAge'].describe())

(3190, 37)
count    3190.000000
mean       44.432602
std        14.443508
min        15.000000
25%        35.000000
50%        45.000000
75%        55.000000
max        85.000000
Name: CustomerAge, dtype: float64


In [28]:
def ohe_customer_age(datap):
    data = datap
    age_ohe = data['CustomerAge']

    young_customer = [5,15,25]
    young = []
    adult_customer = [35,45,55]
    adult = []
    old_customer = [65,75,85]
    old = []

    for i in age_ohe.values:
        if i in young_customer:
            young.append(1)
            adult.append(0)
            old.append(0)
        elif i in adult_customer:
            young.append(0)
            adult.append(1)
            old.append(0)    
        elif i in old_customer:
            young.append(0)
            adult.append(0)
            old.append(1)
        else:
            young.append(np.nan)
            adult.append(np.nan)
            old.append(np.nan)

    yo = pd.DataFrame({'Young_Customer' : young})
    ad = pd.DataFrame({'Adult_Customer' : adult})
    ol = pd.DataFrame({'Old_Customer' : old})

    data = data.join(yo)
    data = data.join(ad)
    data = data.join(ol)
    return data

In [29]:
trainset = ohe_customer_age(trainset)
testset = ohe_customer_age(testset)
trainset.Region.head(),testset.Region.head()

(6528    0.0
 2817    0.0
 660     0.0
 159     0.0
 5469    0.0
 Name: Region, dtype: float64, 2825    0.0
 1527    0.0
 2090    0.0
 138     0.0
 366     0.0
 Name: Region, dtype: float64)

In [30]:
print(trainset.shape)
print(trainset['Old_Customer'].describe())

(9567, 41)
count    9567.000000
mean        0.150308
std         0.357392
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Old_Customer, dtype: float64


In [31]:
print(testset.shape)
print(testset['Old_Customer'].describe())

(3190, 40)
count    3190.000000
mean        0.139185
std         0.346194
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Old_Customer, dtype: float64


## MonthlyVoiceTrafficCount - MonthlySMSTrafficCount - MonthlyDataTraffic

In [32]:
trainset = trainset.fillna(trainset.mean())
testset = testset.fillna(testset.mean())

## Export .csv document

In [33]:
trainset.to_csv('train_rodolfo.csv', index = False, encoding='utf-8')
testset.to_csv('test_rodolfo.csv', index = False, encoding='utf-8')

In [34]:
trainset.shape

(9567, 41)

In [35]:
testset.shape

(3190, 40)