In [36]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Import the train set and test set
train_data = pd.read_csv("polimi_dataset_challenge_train_v1.csv", delimiter=",")
test_data = pd.read_csv("polimi_dataset_challenge_test_to_predict_v1.csv", delimiter=",")

In [38]:
def normalize_costumer_age(dataset):
    #Take all unique Cosumer_Age, assigns to each age alphabetically ordered a mean age
    customer_age_sort = sorted(dataset['CustomerAge'].dropna().unique())

    mapping = {}
    for idx, val in enumerate(customer_age_sort):
        mapping[val] = int(15+idx*10)
        
    ages = dataset['CustomerAge']
    ages.replace(mapping, inplace=True)

    #print(mapping, regions)
    ages = np.asarray(ages)
    ages = pd.DataFrame({'CustomerAge': ages})
    
     # Remove the previous CustomerAge and then insert the new CustomerAge
    dataset = dataset.drop('CustomerAge', axis=1)
    dataset = dataset.join(ages)
    return dataset

In [39]:
def normalize_region(dataset):
    #Take all unique Regions, assigns to each region alphabetically ordered a monotonic growing number
    regions_sort = sorted(dataset['Region'].dropna().unique())
    #regions_len = len(regions_sort) #20 Regions

    mapping = {}
    for idx, val in enumerate(regions_sort):
        mapping[val] = int(idx)

    regions = dataset['Region']
    regions.replace(mapping, inplace=True)

    #print(mapping, regions)
    regions = np.asarray(regions)
    regions = pd.DataFrame({'Region': regions})
    
    # Remove the previous Region columns and then insert the new Region
    dataset = dataset.drop('Region', axis=1)
    dataset = dataset.join(regions)
    return dataset

In [40]:
def normalize_province(dataset):
    #Take all unique Provinces, assigns to each province alphabetically ordered a monotonic growing number
    provinces_sort = sorted(dataset['Province'].dropna().unique())
    #provinces_len = len(provinces_sort) #110 Provinces

    mapping = {}
    for idx, val in enumerate(provinces_sort):
        mapping[val] = idx

    provinces = dataset['Province']
    provinces.replace(mapping, inplace=True)


    #print(provinces_sort, provinces_len)
    #print(mapping, provinces)
    provinces = np.asarray(provinces)
    provinces = pd.DataFrame({'Province': provinces})
    
    # Remove the previous Province columns and then insert the new Province
    dataset = dataset.drop('Province', axis=1)
    dataset = dataset.join(provinces)
    return dataset

In [41]:
def normalize_product(dataset):
    if('Product' in dataset.columns):
        #Take all unique Products, assigns to each product alphabetically ordered a monotonic growing number
        products_sort = sorted(dataset['Product'].dropna().unique())

        mapping = {}
        for idx, val in enumerate(products_sort):
            mapping[val] = int(idx)

        products = dataset['Product']
        products.replace(mapping, inplace=True)

        #print(mapping, products)
        products = np.asarray(products)
        products = pd.DataFrame({'Product': products})

        # Remove the previous Product columns and then insert the newProduct
        dataset = dataset.drop('Product', axis=1)
        dataset = dataset.join(products)
        print(mapping)
    return dataset

In [42]:
def drop_useless_columns(dataset):
    if('DataAllowanceOneShot' in dataset.columns and 'EstimatedDevicePrice' in dataset.columns and 'ZipCode' in dataset.columns):
        dataset = dataset.drop('DataAllowanceOneShot', axis=1)
        dataset = dataset.drop('EstimatedDevicePrice', axis=1)
        dataset = dataset.drop('ZipCode', axis=1)
    return dataset

In [43]:
def add_modified_sample(dataset):
    modified = []
    for i in dataset.isnull().any(axis=1):
        if(i):
            modified.append(1)
        else:
            modified.append(0)
    modified = np.asarray(modified)
    modified = pd.DataFrame({'IsModified': modified})
    dataset = dataset.join(modified)
    return dataset

In [44]:
def normalize_data_set(dataset):
    dataset = drop_useless_columns(dataset)
    dataset = merge_duration_count(dataset)
    dataset = add_modified_sample(dataset)
    dataset = normalize_costumer_age(dataset)
    dataset = normalize_region(dataset)
    dataset = normalize_province(dataset)
    dataset = normalize_product(dataset)
    return dataset

In [45]:
train_data = normalize_data_set(train_data)
test_data = normalize_data_set(test_data)
train_data.columns

{'Non-Customer': 0, 'V-Auto': 1, 'V-Bag': 2, 'V-Pet': 3}


Index(['ID', 'DeviceFlag4G', 'DataArpu', 'DataAllowanceContinuous',
       'DeviceFlagSmartphone', 'MonthlyVoiceTrafficCount',
       'MonthlySMSTrafficCount', 'MonthlyDataTraffic', 'CustomerGender',
       'CustomerExpatriate', 'ChurnScore', 'AirportConnectionsDuration',
       'AirportConnectionsCount', 'StationConnectionsDuration',
       'StationConnectionsCount', 'ParkingConnectionsDuration',
       'ParkingConnectionsCount', 'File-Transfer', 'Games',
       'Instant-Messaging-Applications', 'Mail', 'Music-Streaming',
       'Network-Operation', 'P2P-Applications', 'Security',
       'Streaming-Applications', 'Terminals', 'Unclassified', 'VoIP',
       'Web-Applications', 'ConnectionsDuration', 'ConnectionsCount',
       'IsModified', 'CustomerAge', 'Region', 'Province', 'Product'],
      dtype='object')

# CHECKPOINT

In [46]:
trainset = train_data
testset = test_data

## Regions

In [13]:
import math

def fill_nan(field, datap):
    data = datap
    nnan = data.isnull()[field].sum()
    field_id = data[['ID',field]]
    field_count = field_id.groupby(field).count()
    field_per = round(field_count/(len(data[field].dropna(axis=0)))*100,5)
    field_2badd = round(field_per/100*nnan,5)
    
#     print(len(field_2badd))
#     print(field_2badd)
#     print(field_2badd.sum())
    
    le = len(data)
    ct = field_2badd.values[0]
    j = 0
    for i in range(le):
        if(math.isnan(data[field][i])):
            #print("2badd: " + str(field_2badd.index[j]))
            data[field][i] = field_2badd.index[j]
            ct -=1
            if(ct <= 0.0 and j < len(field_2badd)):
                j += 1
                #print("ct: " + str(ct))
                ct = field_2badd.values[j]
    return data

In [14]:
trainset = fill_nan('Region', trainset)
testset = fill_nan('Region', testset)
trainset.Region.head(),testset.Region.head()

(0    16.0
 1     0.0
 2     0.0
 3     8.0
 4    19.0
 Name: Region, dtype: float64, 0    12.0
 1     3.0
 2    15.0
 3     0.0
 4     7.0
 Name: Region, dtype: float64)

In [15]:
print(trainset.shape)
print(trainset['Region'].describe())

(9567, 35)
count    9567.000000
mean        9.138392
std         4.995239
min         0.000000
25%         6.000000
50%         8.000000
75%        13.000000
max        19.000000
Name: Region, dtype: float64


In [16]:
print(testset.shape)
print(testset['Region'].describe())

(3190, 34)
count    3190.000000
mean        9.228527
std         5.073446
min         0.000000
25%         6.000000
50%         8.000000
75%        13.000000
max        19.000000
Name: Region, dtype: float64


In [17]:
def ohe_region(datap):
    data = datap
    region_ohe = data['Region']

    north_regions = [4,5,7,8,11,17,18,19]
    middle_regions = [0,6,9,10,15,16]
    suoth_regions = [1,2,3,12,13,14]
    regions_cluster = []

    for i in region_ohe.values:
        if i in nord_regions:
            regions_cluster.append(0)
        elif i in centro_regions:
            regions_cluster.append(1)   
        elif i in sud_regions:
            regions_cluster.append(2)
        else:
            regions_cluster.append(np.nan)
    
    regions_cluster = np.asarray(regions_cluster)
    regions_cluster = pd.DataFrame({'Regions_Cluster' : regions_cluster})

    data = data.join(regions_cluster)
    return data

In [18]:
trainset = ohe_region(trainset)
testset = ohe_region(testset)
trainset.Region.head(),testset.Region.head()

(0    16.0
 1     0.0
 2     0.0
 3     8.0
 4    19.0
 Name: Region, dtype: float64, 0    12.0
 1     3.0
 2    15.0
 3     0.0
 4     7.0
 Name: Region, dtype: float64)

In [19]:
print(trainset.shape)
print(trainset['North_Region'].describe())

(9567, 38)
count    9567.000000
mean        0.477057
std         0.499499
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: North_Region, dtype: float64


In [20]:
print(testset.shape)
print(testset['North_Region'].describe())

(3190, 37)
count    3190.000000
mean        0.483699
std         0.499813
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: North_Region, dtype: float64


## Provinces

In [21]:
trainset = trainset.sort_values(by='Region')
trainset['Province'] = trainset['Province'].fillna(method='ffill')
testset = testset.sort_values(by='Region')
testset['Province'] = testset['Province'].fillna(method='ffill')
trainset['Province'].head(5),testset['Province'].head(5)

(5628    93.0
 6680    42.0
 3710    42.0
 8720    71.0
 5351    71.0
 Name: Province, dtype: float64, 59       NaN
 189     42.0
 2722    93.0
 3029    71.0
 2721    24.0
 Name: Province, dtype: float64)

In [22]:
print(trainset.shape)
print(trainset['Province'].describe())

(9567, 38)
count    9567.000000
mean       56.903104
std        31.357100
min         0.000000
25%        27.000000
50%        58.000000
75%        84.000000
max       109.000000
Name: Province, dtype: float64


In [23]:
print(testset.shape)
print(testset['Province'].describe())

(3190, 37)
count    3189.000000
mean       56.803073
std        31.752841
min         0.000000
25%        27.000000
50%        58.000000
75%        84.000000
max       109.000000
Name: Province, dtype: float64


## CustomerAge

In [24]:
trainset = fill_nan('CustomerAge', trainset)

# test_data = normalize_data_set(pd.read_csv("testset.csv", delimiter=","))
# testset = test_data
testset = fill_nan('CustomerAge', testset)
trainset.CustomerAge.head(), testset.CustomerAge.shape

(5628    45.0
 6680    45.0
 3710    45.0
 8720    45.0
 5351    25.0
 Name: CustomerAge, dtype: float64, (3190,))

In [25]:
testset.isnull().sum()

ID                                  0
DeviceFlag4G                        0
DataArpu                          757
DataAllowanceContinuous           388
DeviceFlagSmartphone                0
MonthlyVoiceTrafficCount          110
MonthlySMSTrafficCount            110
MonthlyDataTraffic                110
CustomerGender                      0
CustomerExpatriate                  0
ChurnScore                        303
AirportConnectionsDuration          0
AirportConnectionsCount             0
StationConnectionsDuration          0
StationConnectionsCount             0
ParkingConnectionsDuration          0
ParkingConnectionsCount             0
File-Transfer                       0
Games                               0
Instant-Messaging-Applications      0
Mail                                0
Music-Streaming                     0
Network-Operation                   0
P2P-Applications                    0
Security                            0
Streaming-Applications              0
Terminals   

In [26]:
print(trainset.shape)
print(trainset['CustomerAge'].describe())

(9567, 38)
count    9567.000000
mean       44.778405
std        14.593704
min        15.000000
25%        35.000000
50%        45.000000
75%        55.000000
max        85.000000
Name: CustomerAge, dtype: float64


In [27]:
print(testset.shape)
print(testset['CustomerAge'].describe())

(3190, 37)
count    3190.000000
mean       44.398119
std        14.401887
min        15.000000
25%        35.000000
50%        45.000000
75%        55.000000
max        85.000000
Name: CustomerAge, dtype: float64


In [28]:
def ohe_customer_age(datap):
    data = datap
    age_ohe = data['CustomerAge']

    young_customer = [5,15,25]
    young = []
    adult_customer = [35,45,55]
    adult = []
    old_customer = [65,75,85]
    old = []

    for i in age_ohe.values:
        if i in young_customer:
            young.append(1)
            adult.append(0)
            old.append(0)
        elif i in adult_customer:
            young.append(0)
            adult.append(1)
            old.append(0)    
        elif i in old_customer:
            young.append(0)
            adult.append(0)
            old.append(1)
        else:
            young.append(np.nan)
            adult.append(np.nan)
            old.append(np.nan)

    yo = pd.DataFrame({'Young_Customer' : young})
    ad = pd.DataFrame({'Adult_Customer' : adult})
    ol = pd.DataFrame({'Old_Customer' : old})

    data = data.join(yo)
    data = data.join(ad)
    data = data.join(ol)
    return data

In [29]:
trainset = ohe_customer_age(trainset)
testset = ohe_customer_age(testset)
trainset.CustomerAge.head(),testset.CustomerAge.head()

(5628    45.0
 6680    45.0
 3710    45.0
 8720    45.0
 5351    25.0
 Name: CustomerAge, dtype: float64, 59      65.0
 189     55.0
 2722    25.0
 3029    55.0
 2721    25.0
 Name: CustomerAge, dtype: float64)

In [30]:
print(trainset.shape)
print(trainset['Old_Customer'].describe())

(9567, 41)
count    9567.000000
mean        0.149890
std         0.356982
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Old_Customer, dtype: float64


In [31]:
print(testset.shape)
print(testset['Old_Customer'].describe())

(3190, 40)
count    3190.000000
mean        0.141693
std         0.348789
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Old_Customer, dtype: float64


## MonthlyVoiceTrafficCount - MonthlySMSTrafficCount - MonthlyDataTraffic

In [32]:
trainset = trainset.fillna(trainset.mean())
testset = testset.fillna(testset.mean())

## Export .csv document

In [33]:
trainset.to_csv('train_rodolfo.csv', index = False, encoding='utf-8')
testset.to_csv('test_rodolfo.csv', index = False, encoding='utf-8')

In [34]:
trainset.shape

(9567, 41)

In [35]:
testset.shape

(3190, 40)

In [36]:
trainset.North_Region.sum()+trainset.Middle_Region.sum()+trainset.South_Region.sum()

9567