In [1]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Import the train set and test set
train_data = pd.read_csv("trainset.csv", delimiter=",")
test_data = pd.read_csv("testset.csv", delimiter=",")

In [2]:
def normalize_costumer_age(dataset):
    #Take all unique Cosumer_Age, assigns to each age alphabetically ordered a mean age
    customer_age_sort = sorted(dataset['CustomerAge'].dropna().unique())

    mapping = {}
    for idx, val in enumerate(customer_age_sort):
        mapping[val] = int(15+idx*10)
        
    ages = dataset['CustomerAge']
    ages.replace(mapping, inplace=True)

    #print(mapping, regions)
    ages = np.asarray(ages)
    ages = pd.DataFrame({'CustomerAge': ages})
    
     # Remove the previous CustomerAge and then insert the new CustomerAge
    dataset = dataset.drop('CustomerAge', axis=1)
    dataset = dataset.join(ages)
    return dataset

In [3]:
def normalize_region(dataset):
    #Take all unique Regions, assigns to each region alphabetically ordered a monotonic growing number
    regions_sort = sorted(dataset['Region'].dropna().unique())
    #regions_len = len(regions_sort) #20 Regions

    mapping = {}
    for idx, val in enumerate(regions_sort):
        mapping[val] = int(idx)

    regions = dataset['Region']
    regions.replace(mapping, inplace=True)

    #print(mapping, regions)
    regions = np.asarray(regions)
    regions = pd.DataFrame({'Region': regions})
    
    # Remove the previous Region columns and then insert the new Region
    dataset = dataset.drop('Region', axis=1)
    dataset = dataset.join(regions)
    return dataset

In [4]:
def normalize_province(dataset):
    #Take all unique Provinces, assigns to each province alphabetically ordered a monotonic growing number
    provinces_sort = sorted(dataset['Province'].dropna().unique())
    #provinces_len = len(provinces_sort) #110 Provinces

    mapping = {}
    for idx, val in enumerate(provinces_sort):
        mapping[val] = idx

    provinces = dataset['Province']
    provinces.replace(mapping, inplace=True)


    #print(provinces_sort, provinces_len)
    #print(mapping, provinces)
    provinces = np.asarray(provinces)
    provinces = pd.DataFrame({'Province': provinces})
    
    # Remove the previous Province columns and then insert the new Province
    dataset = dataset.drop('Province', axis=1)
    dataset = dataset.join(provinces)
    return dataset

In [5]:
def normalize_product(dataset):
    if('Product' in dataset.columns):
        #Take all unique Products, assigns to each product alphabetically ordered a monotonic growing number
        products_sort = sorted(dataset['Product'].dropna().unique())

        mapping = {}
        for idx, val in enumerate(products_sort):
            mapping[val] = int(idx)

        products = dataset['Product']
        products.replace(mapping, inplace=True)

        #print(mapping, products)
        products = np.asarray(products)
        products = pd.DataFrame({'Product': products})

        # Remove the previous Product columns and then insert the newProduct
        dataset = dataset.drop('Product', axis=1)
        dataset = dataset.join(products)
    return dataset

In [6]:
def drop_useless_columns(dataset):
    if('DataAllowanceOneShot' in dataset.columns and 'EstimatedDevicePrice' in dataset.columns and 'ZipCode' in dataset.columns):
        dataset = dataset.drop('DataAllowanceOneShot', axis=1)
        dataset = dataset.drop('EstimatedDevicePrice', axis=1)
        dataset = dataset.drop('ZipCode', axis=1)
    return dataset

In [7]:
def add_modified_sample(dataset):
    modified = []
    for i in dataset.isnull().any(axis=1):
        if(i):
            modified.append(1)
        else:
            modified.append(0)
    modified = np.asarray(modified)
    modified = pd.DataFrame({'IsModified': modified})
    dataset = dataset.join(modified)
    return dataset

In [8]:
def normalize_data_set(dataset):
    dataset = drop_useless_columns(dataset)
    dataset = add_modified_sample(dataset)
    dataset = normalize_costumer_age(dataset)
    dataset = normalize_region(dataset)
    dataset = normalize_province(dataset)
    dataset = normalize_product(dataset)
    return dataset

In [9]:
train_data = normalize_data_set(train_data)
test_data = normalize_data_set(test_data)
train_data.head()

Unnamed: 0,ID,DeviceFlag4G,DataArpu,DataAllowanceContinuous,DeviceFlagSmartphone,MonthlyVoiceTrafficCount,MonthlySMSTrafficCount,MonthlyDataTraffic,CustomerGender,CustomerExpatriate,...,Streaming-Applications,Terminals,Unclassified,VoIP,Web-Applications,IsModified,CustomerAge,Region,Province,Product
0,10930,1.0,0.156221,0.010514,1.0,0.018229,0.001623,0.011007,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,45.0,16.0,14.0,0
1,10170,0.0,0.155086,0.054729,1.0,0.051432,0.001498,0.039382,0.0,0.0,...,0.071551,0.0,0.001896,5.8e-05,0.77541,1,25.0,,,0
2,1492,1.0,,,1.0,0.058594,0.001248,0.033031,1.0,1.0,...,0.099438,0.0,0.012055,0.003579,0.499158,1,35.0,,,0
3,7424,1.0,0.155086,0.004987,1.0,0.097005,0.000499,0.002928,0.0,0.0,...,0.041474,0.0,0.051692,0.013263,0.301528,0,55.0,8.0,15.0,2
4,4332,1.0,0.155086,0.038148,1.0,0.034505,0.000375,0.001002,1.0,0.0,...,0.005975,0.0,0.000857,0.019149,0.926208,0,65.0,19.0,103.0,0


# CHECKPOINT

In [10]:
trainset = train_data
testset = test_data

## Regions

In [11]:
import math

def fill_nan(field, data):
    nnan = data.isnull()[field].sum()
    field_id = data[['ID',field]]
    field_count = field_id.groupby(field).count()
    field_per = round(field_count/(len(data[field].dropna(axis=0)))*100)
    field_2badd = round(field_per/100*nnan)
    
    le = len(data)
    ct = field_2badd.values[0]
    j = 0
    for i in range(le):
        if(math.isnan(data[field][i])):
            data[field][i] = field_2badd.index[j]
            ct -=1
            if(ct <= 0.0 and j <= len(field_2badd)):
                j += 1
                ct = field_2badd.values[j]
    return data

In [12]:
trainset = fill_nan('Region', trainset)
testset = fill_nan('Region', trainset)
trainset.Region.head(),testset.Region.head()

(0    16.0
 1     0.0
 2     0.0
 3     8.0
 4    19.0
 Name: Region, dtype: float64, 0    16.0
 1     0.0
 2     0.0
 3     8.0
 4    19.0
 Name: Region, dtype: float64)

In [13]:
def ohe_region(data):
    train_ohe = data
    region_ohe = train_ohe['Region']

    nord_regions = [4,5,7,8,11,17,18,19]
    nord = []
    centro_regions = [0,6,9,10,15,16]
    centro = []
    sud_regions = [1,2,3,12,13,14]
    sud = []

    for i in region_ohe.values:
        if i in nord_regions:
            nord.append(1)
            centro.append(0)
            sud.append(0)
        elif i in centro_regions:
            nord.append(0)
            centro.append(1)
            sud.append(0)    
        elif i in sud_regions:
            nord.append(0)
            centro.append(0)
            sud.append(1)
        else:
            nord.append(np.nan)
            centro.append(np.nan)
            sud.append(np.nan)

    no = pd.DataFrame({'North_Region' : nord})
    ce = pd.DataFrame({'Middle_Region' : centro})
    su = pd.DataFrame({'South_Region' : sud})

    train_ohe = train_ohe.join(no)
    train_ohe = train_ohe.join(ce)
    train_ohe = train_ohe.join(su)

    data = train_ohe
    return data

In [14]:
trainset = ohe_region(trainset)
testset = ohe_region(testset)
trainset.Region.head(),testset.Region.head()

(0    16.0
 1     0.0
 2     0.0
 3     8.0
 4    19.0
 Name: Region, dtype: float64, 0    16.0
 1     0.0
 2     0.0
 3     8.0
 4    19.0
 Name: Region, dtype: float64)

## Provinces

In [15]:
trainset = trainset.sort_values(by='Region')
trainset['Province'] = trainset['Province'].fillna(method='ffill')
testset = testset.sort_values(by='Region')
testset['Province'] = testset['Province'].fillna(method='ffill')
trainset.head(5),testset.head(5)

(         ID  DeviceFlag4G  DataArpu  DataAllowanceContinuous  \
 6528  10511           0.0  0.186103                 0.010514   
 2817   4174           1.0  0.108560                 0.038148   
 660    4920           1.0       NaN                 0.013367   
 159    7007           0.0       NaN                      NaN   
 5469   4129           0.0  0.124069                 0.016219   
 
       DeviceFlagSmartphone  MonthlyVoiceTrafficCount  MonthlySMSTrafficCount  \
 6528                   1.0                  0.028646                0.000250   
 2817                   1.0                  0.133464                0.009238   
 660                    1.0                  0.037109                0.002622   
 159                    0.0                       NaN                     NaN   
 5469                   1.0                  0.028646                0.001748   
 
       MonthlyDataTraffic  CustomerGender  CustomerExpatriate      ...       \
 6528            0.006104             0.0

## CustomerAge

In [16]:
trainset = fill_nan('CustomerAge', trainset)
testset = fill_nan('CustomerAge', testset)
trainset.CustomerAge.head(), testset.CustomerAge.head()

(6528    35.0
 2817    55.0
 660     45.0
 159     25.0
 5469    65.0
 Name: CustomerAge, dtype: float64, 6528    35.0
 2817    55.0
 660     45.0
 159     25.0
 5469    65.0
 Name: CustomerAge, dtype: float64)

In [17]:
def ohe_customer_age(data):
    train_ohe = data
    age_ohe = trainset['CustomerAge']

    young_customer = [5,15,25]
    young = []
    adult_customer = [35,45,55]
    adult = []
    old_customer = [65,75,85]
    old = []

    for i in age_ohe.values:
        if i in young_customer:
            young.append(1)
            adult.append(0)
            old.append(0)
        elif i in adult_customer:
            young.append(0)
            adult.append(1)
            old.append(0)    
        elif i in old_customer:
            young.append(0)
            adult.append(0)
            old.append(1)
        else:
            young.append(np.nan)
            adult.append(np.nan)
            old.append(np.nan)

    yo = pd.DataFrame({'Young_Customer' : young})
    ad = pd.DataFrame({'Adult_Customer' : adult})
    ol = pd.DataFrame({'Old_Customer' : old})

    train_ohe = train_ohe.join(yo)
    train_ohe = train_ohe.join(ad)
    train_ohe = train_ohe.join(ol)

    data = train_ohe
    return data

In [19]:
trainset = ohe_customer_age(trainset)
testset = ohe_customer_age(testset)
trainset.Region.head(),testset.Region.head()

(6528    0.0
 2817    0.0
 660     0.0
 159     0.0
 5469    0.0
 Name: Region, dtype: float64, 6528    0.0
 2817    0.0
 660     0.0
 159     0.0
 5469    0.0
 Name: Region, dtype: float64)

## MonthlyVoiceTrafficCount - MonthlySMSTrafficCount - MonthlyDataTraffic

In [22]:
trainset = trainset.fillna(trainset.mean())
testset = trainset.fillna(testset.mean())

In [23]:
trainset.isnull().sum(), testset.isnull().sum()

(ID                                0
 DeviceFlag4G                      0
 DataArpu                          0
 DataAllowanceContinuous           0
 DeviceFlagSmartphone              0
 MonthlyVoiceTrafficCount          0
 MonthlySMSTrafficCount            0
 MonthlyDataTraffic                0
 CustomerGender                    0
 CustomerExpatriate                0
 ChurnScore                        0
 AirportConnectionsDuration        0
 AirportConnectionsCount           0
 StationConnectionsDuration        0
 StationConnectionsCount           0
 ParkingConnectionsDuration        0
 ParkingConnectionsCount           0
 File-Transfer                     0
 Games                             0
 Instant-Messaging-Applications    0
 Mail                              0
 Music-Streaming                   0
 Network-Operation                 0
 P2P-Applications                  0
 Security                          0
 Streaming-Applications            0
 Terminals                         0
 

In [24]:
trainset.to_csv('train_rodolfo.csv', index = False, encoding='utf-8')
testset.to_csv('test_rodolfo.csv', index = False, encoding='utf-8')

In [28]:
trainset.CustomerExpatriate

6528    0.0
2817    0.0
660     1.0
159     0.0
5469    0.0
5489    0.0
1470    0.0
8519    0.0
2718    0.0
5563    0.0
170     0.0
171     0.0
5628    0.0
8432    0.0
5636    0.0
5648    0.0
8401    0.0
8387    0.0
180     1.0
5674    1.0
8281    0.0
8265    0.0
188     0.0
2596    0.0
8233    0.0
193     1.0
5456    0.0
8587    0.0
154     0.0
2829    0.0
       ... 
9048    0.0
9049    0.0
9050    1.0
6835    0.0
6777    0.0
5530    0.0
9023    0.0
3250    0.0
5568    1.0
9002    0.0
646     0.0
9004    0.0
6789    0.0
3241    0.0
3680    0.0
8212    0.0
9011    0.0
3950    0.0
9013    1.0
3240    0.0
8207    0.0
3230    0.0
418     0.0
3959    0.0
3962    0.0
1666    0.0
6825    0.0
650     0.0
6207    0.0
869     0.0
Name: CustomerExpatriate, Length: 9567, dtype: float64