In [1]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
seed = 202
np.random.seed(seed)

# Ignore warnings
import warnings
warnings.simplefilter('ignore')

# Import the train set and test set
train_data = pd.read_csv("polimi_dataset_challenge_train_v1.csv", delimiter=",")
test_data = pd.read_csv("polimi_dataset_challenge_test_to_predict_v1.csv", delimiter=",")

In [2]:
def normalize_costumer_age(dataset):
    #Take all unique Cosumer_Age, assigns to each age alphabetically ordered a mean age
    customer_age_sort = sorted(dataset['CustomerAge'].dropna().unique())

    mapping = {}
    for idx, val in enumerate(customer_age_sort):
        mapping[val] = int(15+idx*10)
        
    ages = dataset['CustomerAge']
    ages.replace(mapping, inplace=True)

    #print(mapping, regions)
    ages = np.asarray(ages)
    ages = pd.DataFrame({'CustomerAge': ages})
    
     # Remove the previous CustomerAge and then insert the new CustomerAge
    dataset = dataset.drop('CustomerAge', axis=1)
    dataset = dataset.join(ages)
    return dataset

In [3]:
def normalize_region(dataset):
    #Take all unique Regions, assigns to each region alphabetically ordered a monotonic growing number
    regions_sort = sorted(dataset['Region'].dropna().unique())
    #regions_len = len(regions_sort) #20 Regions

    mapping = {}
    for idx, val in enumerate(regions_sort):
        mapping[val] = int(idx)

    regions = dataset['Region']
    regions.replace(mapping, inplace=True)

    #print(mapping, regions)
    regions = np.asarray(regions)
    regions = pd.DataFrame({'Region': regions})
    
    # Remove the previous Region columns and then insert the new Region
    dataset = dataset.drop('Region', axis=1)
    dataset = dataset.join(regions)
    return dataset

In [4]:
def normalize_province(dataset):
    #Take all unique Provinces, assigns to each province alphabetically ordered a monotonic growing number
    provinces_sort = sorted(dataset['Province'].dropna().unique())
    #provinces_len = len(provinces_sort) #110 Provinces

    mapping = {}
    for idx, val in enumerate(provinces_sort):
        mapping[val] = idx

    provinces = dataset['Province']
    provinces.replace(mapping, inplace=True)


    #print(provinces_sort, provinces_len)
    #print(mapping, provinces)
    provinces = np.asarray(provinces)
    provinces = pd.DataFrame({'Province': provinces})
    
    # Remove the previous Province columns and then insert the new Province
    dataset = dataset.drop('Province', axis=1)
    dataset = dataset.join(provinces)
    return dataset

In [5]:
def normalize_product(dataset):
    if('Product' in dataset.columns):
        #Take all unique Products, assigns to each product alphabetically ordered a monotonic growing number
        products_sort = sorted(dataset['Product'].dropna().unique())

        mapping = {}
        for idx, val in enumerate(products_sort):
            mapping[val] = int(idx)

        products = dataset['Product']
        products.replace(mapping, inplace=True)

        #print(mapping, products)
        products = np.asarray(products)
        products = pd.DataFrame({'Product': products})

        # Remove the previous Product columns and then insert the newProduct
        dataset = dataset.drop('Product', axis=1)
        dataset = dataset.join(products)
    return dataset

In [6]:
def drop_useless_columns(dataset):
    if('DataAllowanceOneShot' in dataset.columns and 'EstimatedDevicePrice' in dataset.columns and 'ZipCode' in dataset.columns):
        dataset = dataset.drop('DataAllowanceOneShot', axis=1)
        dataset = dataset.drop('EstimatedDevicePrice', axis=1)
        #dataset = dataset.drop('ZipCode', axis=1)
    return dataset

In [7]:
def add_modified_sample(dataset):
    modified = []
    for i in dataset.isnull().any(axis=1):
        if(i):
            modified.append(1)
        else:
            modified.append(0)
    modified = np.asarray(modified)
    modified = pd.DataFrame({'IsModified': modified})
    dataset = dataset.join(modified)
    return dataset

In [8]:
def normalize_data_set(dataset):
    dataset = drop_useless_columns(dataset)
    dataset = add_modified_sample(dataset)
    dataset = normalize_costumer_age(dataset)
    dataset = normalize_region(dataset)
    dataset = normalize_province(dataset)
    dataset = normalize_product(dataset)
    return dataset

In [9]:
train_data = normalize_data_set(train_data)
test_data = normalize_data_set(test_data)

# CHECKPOINT

In [10]:
trainset = train_data
testset = test_data

In [72]:
import math

def fill_nan(field, datap):
    data = datap
    nnan = data.isnull()[field].sum()
    field_id = data[['ID',field]]
    field_count = field_id.groupby(field).count()
    field_per = round(field_count/(len(data[field].dropna(axis=0)))*100,5)
    field_2badd = round(field_per/100*nnan,5)
    
#     print(len(field_2badd))
#     print(field_2badd)
#     print(field_2badd.sum())
    
    le = len(data)
    ct = field_2badd.values[0]
    j = 0
    for i in range(le):
        if(math.isnan(data[field][i])):
            #print("2badd: " + str(field_2badd.index[j]))
            data[field][i] = field_2badd.index[j]
            ct -=1
            if(ct <= 0.0 and j <= len(field_2badd)):
                j += 1
                #print("ct: " + str(ct))
                ct = field_2badd.values[j]
    return data

## Regions

In [73]:
trainset = fill_nan('Region', trainset)
testset = fill_nan('Region', testset)
trainset.Region.head(),testset.Region.head()

(0    16.0
 1     0.0
 2     0.0
 3     8.0
 4    19.0
 Name: Region, dtype: float64, 0    12.0
 1     3.0
 2    15.0
 3     0.0
 4     7.0
 Name: Region, dtype: float64)

In [74]:
print(trainset.shape)
print(trainset['Region'].describe())
print(testset.shape)
print(testset['Region'].describe())

(9567, 35)
count    9567.000000
mean        9.138392
std         4.995239
min         0.000000
25%         6.000000
50%         8.000000
75%        13.000000
max        19.000000
Name: Region, dtype: float64
(3190, 34)
count    3190.000000
mean        9.228527
std         5.073446
min         0.000000
25%         6.000000
50%         8.000000
75%        13.000000
max        19.000000
Name: Region, dtype: float64


## Provinces

In [75]:
trainset = trainset.sort_values(by='Region')
trainset['Province'] = trainset['Province'].fillna(method='ffill')
testset = testset.sort_values(by='Region')
testset['Province'] = testset['Province'].fillna(method='ffill')
trainset['Province'].head(5),testset['Province'].head(5)

(5628    93.0
 6680    42.0
 3710    42.0
 8720    71.0
 5351    71.0
 Name: Province, dtype: float64, 59       NaN
 189     42.0
 2722    93.0
 3029    71.0
 2721    24.0
 Name: Province, dtype: float64)

In [76]:
print(trainset.shape)
print(trainset['Province'].describe())
print(testset.shape)
print(testset['Province'].describe())

(9567, 35)
count    9567.000000
mean       56.903104
std        31.357100
min         0.000000
25%        27.000000
50%        58.000000
75%        84.000000
max       109.000000
Name: Province, dtype: float64
(3190, 34)
count    3189.000000
mean       56.803073
std        31.752841
min         0.000000
25%        27.000000
50%        58.000000
75%        84.000000
max       109.000000
Name: Province, dtype: float64


## CustomerAge

In [77]:
trainset = fill_nan('CustomerAge', trainset)
testset = fill_nan('CustomerAge', testset)
trainset.CustomerAge.head(), testset.CustomerAge.shape

(5628    45.0
 6680    45.0
 3710    45.0
 8720    45.0
 5351    25.0
 Name: CustomerAge, dtype: float64, (3190,))

In [78]:
print(trainset.shape)
print(trainset['CustomerAge'].describe())
print(testset.shape)
print(testset['CustomerAge'].describe())

(9567, 35)
count    9567.000000
mean       44.778405
std        14.593704
min        15.000000
25%        35.000000
50%        45.000000
75%        55.000000
max        85.000000
Name: CustomerAge, dtype: float64
(3190, 34)
count    3190.000000
mean       44.398119
std        14.401887
min        15.000000
25%        35.000000
50%        45.000000
75%        55.000000
max        85.000000
Name: CustomerAge, dtype: float64


## Continuous columns

In [79]:
trainset = trainset.fillna(trainset.mean())
testset = testset.fillna(testset.mean())

# Save CSV

In [11]:
trainset.to_csv('cat_train.csv', index = False, encoding='utf-8')
testset.to_csv('cat_test.csv', index = False, encoding='utf-8')