In [1]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Import the train set and test set
train_data = pd.read_csv("trainset.csv", delimiter=",")
test_data = pd.read_csv("testset.csv", delimiter=",")

In [2]:
def normalize_costumer_age(dataset):
    #Take all unique Cosumer_Age, assigns to each age alphabetically ordered a mean age
    customer_age_sort = sorted(dataset['CustomerAge'].dropna().unique())

    mapping = {}
    for idx, val in enumerate(customer_age_sort):
        mapping[val] = int(15+idx*10)
        
    ages = dataset['CustomerAge']
    ages.replace(mapping, inplace=True)

    #print(mapping, regions)
    ages = np.asarray(ages)
    ages = pd.DataFrame({'CustomerAge': ages})
    
     # Remove the previous CustomerAge and then insert the new CustomerAge
    dataset = dataset.drop('CustomerAge', axis=1)
    dataset = dataset.join(ages)
    return dataset

In [3]:
def normalize_region(dataset):
    #Take all unique Regions, assigns to each region alphabetically ordered a monotonic growing number
    regions_sort = sorted(dataset['Region'].dropna().unique())
    #regions_len = len(regions_sort) #20 Regions

    mapping = {}
    for idx, val in enumerate(regions_sort):
        mapping[val] = int(idx)

    regions = dataset['Region']
    regions.replace(mapping, inplace=True)

    #print(mapping, regions)
    regions = np.asarray(regions)
    regions = pd.DataFrame({'Region': regions})
    
    # Remove the previous Region columns and then insert the new Region
    dataset = dataset.drop('Region', axis=1)
    dataset = dataset.join(regions)
    return dataset

In [4]:
def normalize_province(dataset):
    #Take all unique Provinces, assigns to each province alphabetically ordered a monotonic growing number
    provinces_sort = sorted(dataset['Province'].dropna().unique())
    #provinces_len = len(provinces_sort) #110 Provinces

    mapping = {}
    for idx, val in enumerate(provinces_sort):
        mapping[val] = idx

    provinces = dataset['Province']
    provinces.replace(mapping, inplace=True)


    #print(provinces_sort, provinces_len)
    #print(mapping, provinces)
    provinces = np.asarray(provinces)
    provinces = pd.DataFrame({'Province': provinces})
    
    # Remove the previous Province columns and then insert the new Province
    dataset = dataset.drop('Province', axis=1)
    dataset = dataset.join(provinces)
    return dataset

In [5]:
def normalize_product(dataset):
    if('Product' in dataset.columns):
        #Take all unique Products, assigns to each product alphabetically ordered a monotonic growing number
        products_sort = sorted(dataset['Product'].dropna().unique())

        mapping = {}
        for idx, val in enumerate(products_sort):
            mapping[val] = int(idx)

        products = dataset['Product']
        products.replace(mapping, inplace=True)

        #print(mapping, products)
        products = np.asarray(products)
        products = pd.DataFrame({'Product': products})

        # Remove the previous Product columns and then insert the newProduct
        dataset = dataset.drop('Product', axis=1)
        dataset = dataset.join(products)
    return dataset

In [6]:
def drop_useless_columns(dataset):
    if('DataAllowanceOneShot' in dataset.columns and 'EstimatedDevicePrice' in dataset.columns):
        dataset = dataset.drop('DataAllowanceOneShot', axis=1)
        dataset = dataset.drop('EstimatedDevicePrice', axis=1)
    return dataset

In [7]:
def normalize_data_set(dataset):
    dataset = normalize_costumer_age(dataset)
    dataset = normalize_region(dataset)
    dataset = normalize_province(dataset)
    dataset = normalize_product(dataset)
    dataset = drop_useless_columns(dataset)
    return dataset

In [8]:
# Fill the missing values with the mean for each column
def fillna_mean_value(dataset):
    # fill missing values with mean column values
    dataset.fillna(dataset.mean(), inplace=True)
    return dataset

In [9]:
train_data = normalize_data_set(train_data)
test_data = normalize_data_set(test_data)

In [63]:
#train_data = fillna_mean_value(train_data)
#test_data = fillna_mean_value(test_data)

In [10]:
train_data.to_csv('normalized_train_dataset.csv', index = False, encoding='utf-8')
test_data.to_csv('normalized_test_dataset.csv', index = False, encoding='utf-8')

In [11]:
print(test_data.shape, train_data.shape)

(3190, 34) (9567, 35)


In [12]:
print(test_data.columns, train_data.columns)

Index(['ID', 'DeviceFlag4G', 'DataArpu', 'DataAllowanceContinuous',
       'DeviceFlagSmartphone', 'MonthlyVoiceTrafficCount',
       'MonthlySMSTrafficCount', 'MonthlyDataTraffic', 'CustomerGender',
       'CustomerExpatriate', 'ZipCode', 'ChurnScore',
       'AirportConnectionsDuration', 'AirportConnectionsCount',
       'StationConnectionsDuration', 'StationConnectionsCount',
       'ParkingConnectionsDuration', 'ParkingConnectionsCount',
       'File-Transfer', 'Games', 'Instant-Messaging-Applications', 'Mail',
       'Music-Streaming', 'Network-Operation', 'P2P-Applications', 'Security',
       'Streaming-Applications', 'Terminals', 'Unclassified', 'VoIP',
       'Web-Applications', 'CustomerAge', 'Region', 'Province'],
      dtype='object') Index(['ID', 'DeviceFlag4G', 'DataArpu', 'DataAllowanceContinuous',
       'DeviceFlagSmartphone', 'MonthlyVoiceTrafficCount',
       'MonthlySMSTrafficCount', 'MonthlyDataTraffic', 'CustomerGender',
       'CustomerExpatriate', 'ZipCode', 'Ch

In [13]:
train_data.columns

Index(['ID', 'DeviceFlag4G', 'DataArpu', 'DataAllowanceContinuous',
       'DeviceFlagSmartphone', 'MonthlyVoiceTrafficCount',
       'MonthlySMSTrafficCount', 'MonthlyDataTraffic', 'CustomerGender',
       'CustomerExpatriate', 'ZipCode', 'ChurnScore',
       'AirportConnectionsDuration', 'AirportConnectionsCount',
       'StationConnectionsDuration', 'StationConnectionsCount',
       'ParkingConnectionsDuration', 'ParkingConnectionsCount',
       'File-Transfer', 'Games', 'Instant-Messaging-Applications', 'Mail',
       'Music-Streaming', 'Network-Operation', 'P2P-Applications', 'Security',
       'Streaming-Applications', 'Terminals', 'Unclassified', 'VoIP',
       'Web-Applications', 'CustomerAge', 'Region', 'Province', 'Product'],
      dtype='object')

In [14]:
table = train_data[['Province', 'Region']]
table = table.dropna()
table.shape

(7568, 2)

In [15]:
products_sort = sorted(table['Region'].dropna().unique())
products_sort

[0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0]

In [16]:
train = train_data
mean_region = train['Region'].mean()
mean_province = train['Province'].mean()
mean_dataarpu = train['DataArpu'].mean()
mean_zipcode = train['ZipCode'].mean()
mean_churn = train['ChurnScore'].mean()
mean_dataallowance = train['DataAllowanceContinuous'].mean()

train['ChurnScore'] = train['ChurnScore'].fillna(mean_churn)
train['DataArpu'] = train['DataArpu'].fillna(mean_dataarpu)
train['Region'] = train['Region'].fillna(mean_region)
train['Province'] = train['Province'].fillna(mean_province)
train['ZipCode'] = train['ZipCode'].fillna(mean_zipcode)
train['DataAllowanceContinuous'] = train['DataAllowanceContinuous'].fillna(mean_dataallowance)
train.isnull().sum()

ID                                  0
DeviceFlag4G                        0
DataArpu                            0
DataAllowanceContinuous             0
DeviceFlagSmartphone                0
MonthlyVoiceTrafficCount          339
MonthlySMSTrafficCount            339
MonthlyDataTraffic                339
CustomerGender                      0
CustomerExpatriate                  0
ZipCode                             0
ChurnScore                          0
AirportConnectionsDuration          0
AirportConnectionsCount             0
StationConnectionsDuration          0
StationConnectionsCount             0
ParkingConnectionsDuration          0
ParkingConnectionsCount             0
File-Transfer                       0
Games                               0
Instant-Messaging-Applications      0
Mail                                0
Music-Streaming                     0
Network-Operation                   0
P2P-Applications                    0
Security                            0
Streaming-Ap

In [17]:
train = train_data
train = train.fillna(train.mean())
train.isnull().sum()
test = test_data
test = test.fillna(test.mean())

In [18]:
train_data.Province.describe()

count    9567.000000
mean       56.794662
std        27.887057
min         0.000000
25%        37.000000
50%        56.794662
75%        81.000000
max       109.000000
Name: Province, dtype: float64

In [19]:
train.CustomerAge

0       45.000000
1       25.000000
2       35.000000
3       55.000000
4       65.000000
5       55.000000
6       45.000000
7       25.000000
8       25.000000
9       75.000000
10      25.000000
11      44.796877
12      75.000000
13      25.000000
14      55.000000
15      45.000000
16      35.000000
17      55.000000
18      65.000000
19      65.000000
20      25.000000
21      75.000000
22      65.000000
23      25.000000
24      44.796877
25      25.000000
26      35.000000
27      45.000000
28      75.000000
29      15.000000
          ...    
9537    55.000000
9538    55.000000
9539    55.000000
9540    35.000000
9541    55.000000
9542    25.000000
9543    25.000000
9544    45.000000
9545    25.000000
9546    45.000000
9547    25.000000
9548    25.000000
9549    35.000000
9550    25.000000
9551    55.000000
9552    45.000000
9553    65.000000
9554    35.000000
9555    55.000000
9556    25.000000
9557    45.000000
9558    75.000000
9559    45.000000
9560    44.796877
9561    45

In [21]:
train.to_csv('normalized_train_dataset_mean.csv', index = False, encoding='utf-8')
test.to_csv('normalized_test_dataset_mean.csv', index = False, encoding='utf-8')

In [20]:
train.shape,test.shape

((9567, 35), (3190, 34))