In [35]:
import pandas as pd
import numpy as np

# Random seed for reproducibility
# (seeds NumPy's global random state; other libraries are not seeded here)
seed = 202
np.random.seed(seed)

# Ignore warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation warnings raised by pandas.
import warnings
warnings.simplefilter('ignore')

# Import the train set and test set
# Both paths are relative, so the CSV files must sit in the current working directory.
train_data = pd.read_csv("polimi_dataset_challenge_train_v1.csv", delimiter=",")
test_data = pd.read_csv("polimi_dataset_challenge_test_to_predict_v1.csv", delimiter=",")

In [36]:
def merge_duration_count(dataset):
    """Append the averaged connection duration and connection count columns.

    Two columns are added to a copy of ``dataset``:

    * ``ConnectionsDuration`` - (Airport + Station + Parking durations) / 3
    * ``ConnectionsCount``    - (Airport + Station + Parking counts) / 3

    A missing value in any of the three source columns makes the
    corresponding average missing as well (plain ``+`` propagates NaN).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the six ``*ConnectionsDuration`` / ``*ConnectionsCount``
        source columns.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the two extra columns; ``dataset`` itself is
        left untouched and its index is preserved.

    Raises
    ------
    KeyError
        If one of the source columns is missing.
    """
    mean_duration = (
        dataset['AirportConnectionsDuration']
        + dataset['StationConnectionsDuration']
        + dataset['ParkingConnectionsDuration']
    ) / 3
    mean_count = (
        dataset['AirportConnectionsCount']
        + dataset['StationConnectionsCount']
        + dataset['ParkingConnectionsCount']
    ) / 3

    # Attach the values positionally. Joining a freshly built DataFrame would
    # align its 0..n-1 index with the dataset's index and silently produce
    # NaN whenever the dataset is not indexed 0..n-1.
    return dataset.assign(
        ConnectionsDuration=mean_duration.to_numpy(),
        ConnectionsCount=mean_count.to_numpy(),
    )

In [37]:
def normalize_costumer_age(dataset):
    """Replace the categorical ``CustomerAge`` labels with numeric ages.

    The distinct non-null labels are sorted and the label at rank ``i`` is
    mapped to ``15 + 10 * i`` (15, 25, 35, ...). Missing values stay missing.
    Presumably the labels are decade-wide age bands and the number is meant
    as a representative age for the band - verify against the data.

    The mapping is derived from the labels present in ``dataset``, so two
    DataFrames containing different sets of labels can assign different
    numbers to the same label.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``CustomerAge`` column.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame in which ``CustomerAge`` holds the numeric encoding
        and is moved to the last column position. ``dataset`` is not
        modified and its index is preserved.
    """
    age_labels = sorted(dataset['CustomerAge'].dropna().unique())
    mapping = {label: 15 + rank * 10 for rank, label in enumerate(age_labels)}

    # ``map`` returns a new Series (the caller's column is never mutated) and
    # always yields a numeric dtype: int64 without gaps, float64 with NaN.
    encoded = dataset['CustomerAge'].map(mapping).to_numpy()

    # Drop, then re-add positionally, so the column ends up last and the
    # values cannot be misaligned by a non-default index.
    return dataset.drop('CustomerAge', axis=1).assign(CustomerAge=encoded)

In [38]:
def normalize_region(dataset):
    """Encode the ``Region`` column as its alphabetical rank.

    The distinct non-null region names are sorted and each name is replaced
    by its position in that ordering (0, 1, 2, ...). Missing values stay
    missing.

    The codes are derived from the names present in ``dataset``, so a
    DataFrame that lacks some region gets shifted codes for every name that
    sorts after it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``Region`` column.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame in which ``Region`` holds the numeric codes and is
        moved to the last column position. ``dataset`` is not modified and
        its index is preserved.
    """
    region_names = sorted(dataset['Region'].dropna().unique())
    mapping = {name: rank for rank, name in enumerate(region_names)}

    # ``map`` returns a new Series, so the caller's column is left alone, and
    # produces a numeric dtype (int64, or float64 when NaN is present).
    encoded = dataset['Region'].map(mapping).to_numpy()

    # Positional re-insert: avoids index alignment against a fresh 0..n-1
    # index, which would blank the column for non-default indexes.
    return dataset.drop('Region', axis=1).assign(Region=encoded)

In [39]:
def create_regions_cluster(data):
    """Add a ``Regions_Cluster`` column grouping region codes into three areas.

    ``Region`` is expected to hold numeric codes (as produced by
    ``normalize_region``). Each code is assigned:

    * 0 - north  (codes 4, 5, 7, 8, 11, 17, 18, 19)
    * 1 - middle (codes 0, 6, 9, 10, 15, 16)
    * 2 - south  (codes 1, 2, 3, 12, 13, 14)
    * NaN - any other value, including missing regions

    NOTE(review): the hard-coded lists are only meaningful if the codes come
    from the alphabetical ranking of the full set of 20 region names; if a
    region is absent from the data the codes shift - confirm with the data.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Region`` column.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the extra ``Regions_Cluster`` column; ``data``
        is not modified and its index is preserved.
    """
    members_by_cluster = {
        0: [4, 5, 7, 8, 11, 17, 18, 19],  # north
        1: [0, 6, 9, 10, 15, 16],         # middle
        2: [1, 2, 3, 12, 13, 14],         # south
    }
    cluster_of = {
        code: cluster
        for cluster, codes in members_by_cluster.items()
        for code in codes
    }

    # dict lookup matches float codes too (4.0 hashes and compares equal to 4);
    # NaN and unknown values fall through to NaN.
    clusters = np.asarray(
        [cluster_of.get(code, np.nan) for code in data['Region'].values]
    )

    # Positional assignment keeps rows aligned even when ``data`` does not
    # carry a 0..n-1 index.
    return data.assign(Regions_Cluster=clusters)

In [40]:
def normalize_province(dataset):
    """Encode the ``Province`` column as its alphabetical rank.

    The distinct non-null province names are sorted and each name is replaced
    by its position in that ordering (0, 1, 2, ...). Missing values stay
    missing.

    The codes are derived from the names present in ``dataset``, so two
    DataFrames containing different sets of provinces can assign different
    codes to the same name.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``Province`` column.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame in which ``Province`` holds the numeric codes and is
        moved to the last column position. ``dataset`` is not modified and
        its index is preserved.
    """
    province_names = sorted(dataset['Province'].dropna().unique())
    mapping = {name: rank for rank, name in enumerate(province_names)}

    # ``map`` builds a new Series instead of rewriting the caller's column
    # in place, and yields int64 (or float64 when NaN is present).
    encoded = dataset['Province'].map(mapping).to_numpy()

    # Re-add positionally so a non-default index cannot misalign the values.
    return dataset.drop('Province', axis=1).assign(Province=encoded)

In [41]:
def normalize_product(dataset):
    """Encode the ``Product`` column as its alphabetical rank, if present.

    The distinct non-null product labels are sorted and each label is
    replaced by its position in that ordering (0, 1, 2, ...). Missing values
    stay missing. A DataFrame without a ``Product`` column is returned
    unchanged.

    The codes are derived from the labels present in ``dataset``, so two
    DataFrames containing different sets of products can assign different
    codes to the same label.

    Parameters
    ----------
    dataset : pandas.DataFrame
        May or may not contain a ``Product`` column.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` itself when there is no ``Product`` column; otherwise a
        new DataFrame in which ``Product`` holds the numeric codes and is
        moved to the last column position, with the input left unmodified
        and its index preserved.
    """
    if 'Product' not in dataset.columns:
        return dataset

    product_labels = sorted(dataset['Product'].dropna().unique())
    mapping = {label: rank for rank, label in enumerate(product_labels)}

    # ``map`` returns a new Series (no in-place rewrite of the caller's data)
    # with a numeric dtype: int64, or float64 when NaN is present.
    encoded = dataset['Product'].map(mapping).to_numpy()

    # Re-add positionally so a non-default index cannot misalign the values.
    return dataset.drop('Product', axis=1).assign(Product=encoded)

In [42]:
def drop_useless_columns(dataset):
    """Remove the ``DataAllowanceOneShot`` and ``EstimatedDevicePrice`` columns.

    Each column is dropped independently when present; columns that are
    already absent are ignored. Previously nothing was dropped unless both
    columns *and* ``ZipCode`` were present, so a DataFrame missing any one
    of the three kept the other unwanted columns.

    ``ZipCode`` itself is kept, as before.

    Parameters
    ----------
    dataset : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new DataFrame without the two columns; ``dataset`` is not modified.
    """
    return dataset.drop(
        columns=['DataAllowanceOneShot', 'EstimatedDevicePrice'],
        errors='ignore',
    )

In [43]:
def add_modified_sample(dataset):
    """Add an ``IsModified`` flag marking rows that contain a missing value.

    ``IsModified`` is 1 for every row with at least one null cell in any
    column and 0 otherwise.

    Parameters
    ----------
    dataset : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the extra integer ``IsModified`` column;
        ``dataset`` is not modified and its index is preserved.
    """
    has_missing = dataset.isnull().any(axis=1)

    # Assign positionally: joining a separately built frame would align on a
    # fresh 0..n-1 index and yield NaN flags for any other index.
    return dataset.assign(IsModified=has_missing.astype(int).to_numpy())

In [44]:
def normalize_data_set(dataset):
    """Run every preprocessing step on ``dataset`` and return the result.

    The steps are applied in the order listed below, each one receiving the
    DataFrame returned by the previous one. ``create_regions_cluster`` comes
    after ``normalize_region`` because it reads the numeric region codes.
    """
    # merge_duration_count is deliberately not part of the pipeline
    # (it was disabled in the original step list).
    steps = (
        drop_useless_columns,
        add_modified_sample,
        normalize_costumer_age,
        normalize_region,
        create_regions_cluster,
        normalize_province,
        normalize_product,
    )
    for apply_step in steps:
        dataset = apply_step(dataset)
    return dataset

In [45]:
# Run the preprocessing pipeline on each set separately.
# NOTE(review): both names are rebound to the processed frames, so this cell
# is not idempotent - re-running it feeds already-processed data back in.
train_data = normalize_data_set(train_data)
test_data = normalize_data_set(test_data)

# CHECKPOINT

In [47]:
# Plain aliases: trainset/testset refer to the same DataFrame objects as
# train_data/test_data (no copy is made).
trainset = train_data
testset = test_data

In [48]:
#trainset = trainset.fillna(trainset.mean())
#testset = testset.fillna(testset.mean())

# Save CSV

In [49]:
# Write the processed sets to the working directory as UTF-8 CSV files,
# omitting the DataFrame index column.
trainset.to_csv('cat_train.csv', index = False, encoding='utf-8')
testset.to_csv('cat_test.csv', index = False, encoding='utf-8')