In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import charset_normalizer
import re

import warnings
# Suppress every warning category for the rest of the session (presumably to
# keep cell output clean). NOTE(review): this also hides pandas deprecation and
# SettingWithCopy warnings raised by the cells below.
warnings.filterwarnings("ignore")


# Read File

In [40]:
# Training set: load the four raw tables (the plan-attribute file is ';'-delimited).
user = pd.read_csv('../initial_data/train/user.csv')
context = pd.read_csv('../initial_data/train/context.csv')
mpu = pd.read_csv('../initial_data/train/mobile_plan_user.csv')
mpa = pd.read_csv('../initial_data/train/mobile_plan_attr.csv', sep=';')

# Keep the freshly loaded frames addressable by a short name.
train_set = dict(user=user, context=context, mpa=mpa, mpu=mpu)


In [41]:
# Testing set: the same four tables as the training set, read from the test folder
# (the plan-attribute file is ';'-delimited).
context_test = pd.read_csv('../initial_data/test/context.csv')
mpa_test = pd.read_csv('../initial_data/test/mobile_plan_attr.csv', sep=';')
mpu_test = pd.read_csv('../initial_data/test/mobile_plan_user.csv')
user_test = pd.read_csv('../initial_data/test/user.csv')

# The lookup must reference the *_test frames; pointing it at the training
# frames would make every check driven by test_set inspect training data.
test_set = {
    'user': user_test,
    'context': context_test,
    'mpa': mpa_test,
    'mpu': mpu_test,
}

# Preprocessing

## Missing Value

In [42]:
# Training set: for each table, list the columns that contain missing values
# together with the absolute count and the share of rows affected.
for key, value in train_set.items():
    length = len(value)
    missing_values = value.isna().sum()
    missing_values = missing_values[missing_values > 0]

    # Tables without any gap produce no output at all.
    if missing_values.empty:
        continue

    missing_df = pd.DataFrame(
        {
            'count': missing_values,
            'percentage': missing_values / length*100,
        }
    )
    print(f'------------------ {key} ----------------------')
    print(missing_df)


------------------ user ----------------------
           count  percentage
education   4003   34.592119
------------------ mpu ----------------------
             count  percentage
mobile_plan   2715    5.990600
accept        2721    6.003839


In [43]:
# Training data: keep only rows where both the plan and the accept flag are present.
mpu = mpu.dropna(subset=['mobile_plan', 'accept'], how='any')

# Testing data: rename 'coupon_id' to 'mobile_plan' so the column name matches
# the training table, then apply the same row filter.
mpu_test = (
    mpu_test
    .rename(columns={'coupon_id': 'mobile_plan'})
    .dropna(subset=['mobile_plan', 'accept'], how='any')
)


## User

In [44]:
# Preview the first rows of the training user table as loaded.
user.head()

Unnamed: 0,id,name,gender,age,education,profession,income,living_with,nation,phone,job,fb_freq,yt_freq,insta_freq,use_less_than_2GB,use_2GB_to_4GB
0,11156,Rachel Gibbs,Female,21,,Unemployed,39100$,Unmarrie d_2,AUSTRALIA,(08)-8012-7556,Astronomer,0,0,9,8,3
1,4297,Karen Anderson,Other,22,,Unemployed,41000$,U nmarr ied_2,ENGLAND,943-646-5203,Air cabin crew,0,0,23,7,2
2,13301,김지원,Female,24,,Unemployed,44300$,Unma rried_1,KOREA,010-4500-9888,기계공학 기술자 및 연구원,0,0,23,5,2
3,9920,Elisabeth Wähner,Female,24,,Unemployed,44400$,Un ma rried_1,DENMARK,+49(0) 587406963,Medizininformatiker,0,0,21,7,2
4,8424,Sra. Maria Luiza Nogueira,Female,25,,Unemployed,1100000000 VND,U nmarr ied_1,BRAZIL,(084) 0568 1445,Ufólogo,0,0,22,5,3


### Name

In [45]:
# Remove the 'name' column from both user tables.
user = user.drop(columns=['name'])            # training set
user_test = user_test.drop(columns=['name'])  # testing set

### Gender

In [46]:
# Lower-case the gender labels in the training and the testing user table.
for frame in (user, user_test):
    frame['gender'] = frame['gender'].str.lower()

### Living With

In [47]:
# 'living_with' fuses a status label with a numeric suffix and contains stray
# spaces (e.g. 'Unmarrie d_2'). Split it into the digits ('num_child') and the
# cleaned, lower-cased label. Applied identically to training and testing users.
for frame in (user, user_test):
    # Keep the digits only. Removing just letters and underscores would leave
    # the stray spaces behind (' 2' instead of '2'). The result stays a string.
    frame['num_child'] = frame['living_with'].str.replace(r'\D+', '', regex=True)

    # Remove spaces, underscores and digits, then lower-case what is left.
    # The vectorised .str accessor passes missing values through untouched.
    frame['living_with'] = (
        frame['living_with']
        .str.replace(r'[ _0-9]+', '', regex=True)
        .str.lower()
    )

In [48]:
# Count how often each cleaned living_with label occurs among the training users.
user['living_with'].value_counts()

living_with
married      4524
single       4448
unmarried    2093
divorced      398
widowed       109
Name: count, dtype: int64

In [49]:
# Collapse the status to two classes: 'married' stays, every other label
# (and any missing value) becomes 'single'.
user['living_with'] = user['living_with'].where(user['living_with'] == 'married', 'single')

# The testing labels must be derived from the testing table's own column.
# Assigning a Series computed from the training frame would align training
# values by index onto unrelated test rows.
user_test['living_with'] = user_test['living_with'].where(
    user_test['living_with'] == 'married', 'single'
)

# The column now describes marital status, so name it accordingly.
user = user.rename(columns={'living_with': 'marital_status'})
user_test = user_test.rename(columns={'living_with': 'marital_status'})



### Education

In [50]:
# Turn a missing education into its own 'unknown' category and lower-case every label.
user['education'] = user['education'].fillna('unknown').str.lower()            # training set
user_test['education'] = user_test['education'].fillna('unknown').str.lower()  # testing set



In [51]:
# Count how often each lower-cased education label (including 'unknown') occurs
# among the training users.
user['education'].value_counts()

education
bachelor      4007
unknown       4003
masters       1613
associate     1038
highschool     911
Name: count, dtype: int64

In [52]:
def _education_level(x):
    """Map a cleaned education label to its coarse level.

    'highschool' and 'associate' -> 'undergrad', 'masters' -> 'postgrad',
    'unknown' stays 'unknown', and every other label -> 'grad'.
    """
    if x in ['highschool', 'associate']:
        return 'undergrad'
    if x == 'masters':
        return 'postgrad'
    if x == 'unknown':
        return 'unknown'
    return 'grad'


user['education'] = user['education'].apply(_education_level)
user_test['education'] = user_test['education'].apply(_education_level)


### Profession

In [53]:
# Count how often each raw profession label occurs among the training users.
user['profession'].value_counts()

profession
Unemployed                                   1699
Student                                      1554
Computer & Mathematical                      1303
Sales & Related                              1010
Education&Training&Library                    834
Management                                    797
Office & Administrative Support               573
Arts Design Entertainment Sports & Media      522
Business & Financial                          494
Retired                                       438
Food Preparation & Serving Related            256
Community & Social Services                   220
Healthcare Support                            211
Healthcare Practitioners & Technical          204
Transportation & Material Moving              193
Legal                                         179
Architecture & Engineering                    174
Protective Service                            160
Personal Care & Service                       152
Life Physical Social Science           

In [54]:
# Profession labels that are kept as they are; every other label is pooled
# into the single category 'employed'.
_kept_professions = ['unemployed', 'retired', 'student']

for frame in (user, user_test):
    lowered = frame['profession'].str.lower()
    frame['profession'] = lowered.where(lowered.isin(_kept_professions), 'employed')

### Income

In [55]:
def income_convert(x, vnd_per_usd=25000):
    """Convert a raw income string to a number expressed in dollars.

    Parameters
    ----------
    x : str
        Raw income such as '39100$' or '1100000000 VND'.
    vnd_per_usd : int or float, default 25000
        Rate used to convert a VND amount; the default reproduces the
        previously hard-coded value.

    Returns
    -------
    int, float or str
        An int for '$' amounts, a float for 'VND' amounts (true division),
        and the input unchanged when neither currency marker is present.

    Raises
    ------
    ValueError
        If the text left after removing the marker is not an integer.
    """
    if 'VND' in x:
        # int() tolerates the surrounding whitespace left by '... VND'.
        return int(x.replace('VND', '')) / vnd_per_usd
    if '$' in x:
        return int(x.replace('$', ''))
    return x


In [56]:
def _income_bracket(amount):
    """Label a numeric income with its bracket; every upper bound is inclusive."""
    if amount <= 30000:
        return 'lower'
    if amount <= 58000:
        return 'lower-middle'
    if amount <= 94000:
        return 'middle'
    if amount <= 153000:
        return 'upper-middle'
    if amount <= 200000:
        return 'upper'
    return 'extreme upper'


# Training set: parse the raw income text into a number, then bucket it.
user['income'] = user['income'].apply(income_convert).apply(_income_bracket)

# Testing set: identical treatment.
user_test['income'] = user_test['income'].apply(income_convert).apply(_income_bracket)


### Nation

In [57]:
# Count how often each nation occurs among the training users.
user['nation'].value_counts()

nation
US           2980
ENGLAND      2934
CHINA         955
JAPAN         953
RUSSIA        943
BRAZIL        729
AUSTRALIA     718
KOREA         707
DENMARK       653
Name: count, dtype: int64

In [58]:
# Nations grouped by continent; inverted below into the nation -> continent lookup.
_nations_by_continent = {
    'america': ['US', 'BRAZIL'],
    'europe': ['ENGLAND', 'RUSSIA', 'DENMARK'],
    'asia': ['CHINA', 'JAPAN', 'KOREA'],
    'oceania': ['AUSTRALIA'],
}
continent_map = {
    nation: continent
    for continent, nations in _nations_by_continent.items()
    for nation in nations
}

# Training set: rename the column, then replace each nation by its continent.
# A nation that is absent from the lookup becomes NaN.
user = user.rename(columns={'nation': 'continent'})
user['continent'] = user['continent'].map(continent_map)

# Testing set: identical treatment.
user_test = user_test.rename(columns={'nation': 'continent'})
user_test['continent'] = user_test['continent'].map(continent_map)


### Job, Phone

In [59]:
# Remove the 'job' and 'phone' columns from both user tables.
user = user.drop(columns=['job', 'phone'])
user_test = user_test.drop(columns=['job', 'phone'])

In [60]:
# Preview the training user table after the cleaning steps above.
user.head()

Unnamed: 0,id,gender,age,education,profession,income,marital_status,continent,fb_freq,yt_freq,insta_freq,use_less_than_2GB,use_2GB_to_4GB,num_child
0,11156,female,21,unknown,unemployed,lower-middle,single,oceania,0,0,9,8,3,2
1,4297,other,22,unknown,unemployed,lower-middle,single,europe,0,0,23,7,2,2
2,13301,female,24,unknown,unemployed,lower-middle,single,asia,0,0,23,5,2,1
3,9920,female,24,unknown,unemployed,lower-middle,single,europe,0,0,21,7,2,1
4,8424,female,25,unknown,unemployed,lower-middle,single,america,0,0,22,5,3,1


In [61]:
# Preview the testing user table after the same cleaning steps.
user_test.head()

Unnamed: 0,id,gender,age,education,profession,income,marital_status,continent,fb_freq,yt_freq,insta_freq,use_less_than_2GB,use_2GB_to_4GB,num_child
0,7067,female,33,postgrad,unemployed,middle,single,asia,0,1,3,0,1,1
1,2114,female,34,postgrad,unemployed,middle,single,asia,0,1,1,1,1,1
2,6582,female,34,postgrad,unemployed,middle,single,oceania,0,1,3,1,0,3
3,11379,female,34,postgrad,unemployed,middle,single,america,0,1,3,0,0,4
4,3986,female,34,postgrad,unemployed,middle,single,asia,0,1,1,0,1,1


## Context

In [62]:
# Preview the first rows of the training context table as loaded.
context.head()

Unnamed: 0,id,purpose,go_with,weather,time,viettel_no_0,viettel_no_1,viettel_no_2,to_hanoi,to_other,score,direction
0,11156,Travel,Al?!%one,&&!!Sunny,13:00,1,0,0,0,1,0,0
1,4297,Travel,F!ri?end%(s),Sun!~ny,9 AM,1,1,0,0,1,0,0
2,13301,Travel,Frie~nd(s),!&~S!unny,15:00,1,1,0,0,1,0,0
3,9920,Travel,?Friend(s),Su%n&ny,2 PM,1,1,0,0,1,30,0
4,8424,Travel,?Fa! mily,Sun%ny,10AM,1,1,0,0,1,0,0


### Go_with and weather

In [63]:
def _letters_only(value):
    """Strip noise characters from a label.

    Removes every character that is neither an ASCII letter nor whitespace,
    then removes the spaces and lower-cases the result. The value is passed
    through str() first.
    """
    kept = re.sub(r"[^a-zA-Z\s]", "", str(value))
    return kept.replace(' ', '').lower()


for column in ["go_with", "weather"]:
    context[column] = context[column].apply(_letters_only)
    context_test[column] = context_test[column].apply(_letters_only)


### Time

In [64]:
def convert_time(x):
    """Convert a raw time label to its hour on the 24-hour clock.

    Handles 12-hour labels with an 'AM'/'PM' marker (e.g. '9 AM', '10AM',
    '2 PM') and 24-hour labels ending in ':00' (e.g. '13:00').

    Parameters
    ----------
    x : str
        Raw time label.

    Returns
    -------
    int or str
        The hour (0-23) as an int for the recognised formats; the input
        unchanged when none of the markers is present.

    Raises
    ------
    ValueError
        If the hour part of a recognised label is not an integer.
    """
    if 'AM' in x:
        hour = int(x.replace('AM', '').split(':')[0])
        # 12 AM is midnight, i.e. hour 0; 1-11 AM are unchanged.
        return hour % 12
    if 'PM' in x:
        hour = int(x.replace('PM', '').split(':')[0])
        # 12 PM is noon and must stay 12 rather than becoming 24.
        return hour % 12 + 12
    if ':00' in x:
        return int(x.replace(':00', ''))
    return x

def time_of_day(x):
    """Bucket an hour into a part of the day.

    The argument is converted with int(). Hours 5-11 are 'morning',
    12-16 'afternoon', 17-20 'evening', and everything else 'night'.
    """
    hour = int(x)
    # (first hour, hour after the last one, label) for each daytime bucket.
    buckets = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for start, stop, label in buckets:
        if start <= hour < stop:
            return label
    return 'night'

In [65]:
# Normalise each raw time label with convert_time, then bucket the result
# into a part of the day with time_of_day.
context['time'] = context['time'].apply(convert_time).apply(time_of_day)
context_test['time'] = context_test['time'].apply(convert_time).apply(time_of_day)


### Purpose

In [66]:
def _work_or_travel(purpose):
    """Trim and lower-case the labels, keep 'work' and turn everything else into 'travel'."""
    cleaned = purpose.str.strip().str.lower()
    return cleaned.where(cleaned == 'work', 'travel')


context["purpose"] = _work_or_travel(context["purpose"])
context_test["purpose"] = _work_or_travel(context_test["purpose"])


### to_hanoi, to_other, direction

In [67]:
# Remove the 'to_hanoi' and 'to_other' columns from both context tables.
_destination_columns = ["to_hanoi", "to_other"]
context = context.drop(columns=_destination_columns)
context_test = context_test.drop(columns=_destination_columns)

In [68]:
# Preview the training context table after the cleaning steps above.
context.head()

Unnamed: 0,id,purpose,go_with,weather,time,viettel_no_0,viettel_no_1,viettel_no_2,score,direction
0,11156,travel,alone,sunny,afternoon,1,0,0,0,0
1,4297,travel,friends,sunny,morning,1,1,0,0,0
2,13301,travel,friends,sunny,afternoon,1,1,0,0,0
3,9920,travel,friends,sunny,afternoon,1,1,0,30,0
4,8424,travel,family,sunny,morning,1,1,0,0,0


## Mobile Plan Attributes

In [69]:
# Preview the first rows of the training plan-attribute table as loaded.
mpa.head()

Unnamed: 0,mobile_plan,description,price,duration
0,DATASILVER,"2GB/ day, high speed",100000,5d
1,DATAGOLD,"5GB/ day, high speed",200000,5d
2,SOCIALMEDIA,"1GB/ dayUnlimited for Tik Tok, Facebook, Youtube",150000,3d
3,SOCIALMEDIAGOLD,"3GB/ day, high speedUnlimited for Tik Tok, Fac...",250000,3d
4,DATACALL,"2GB/ day, high speed300 mins call for external...",200000,5d


### Mobile Plan

In [70]:
# Lower-case the plan identifiers in both plan-attribute tables.
for plans in (mpa, mpa_test):
    plans['mobile_plan'] = plans['mobile_plan'].str.lower()

### Duration

In [71]:
# Remove the literal 'd' from the duration (e.g. '5d' -> '5'); the values stay strings.
for plans in (mpa, mpa_test):
    plans['duration'] = plans['duration'].str.replace('d', '', regex=False)

###  Data Capacity

In [72]:
# Extract the digits that directly precede the first 'GB' in the description
# (e.g. '2GB/ day' -> '2'). Must run while 'description' still holds the raw text.
for plans in (mpa, mpa_test):
    plans['capacity'] = plans['description'].str.extract(r'(\d+)GB', expand=False)

### Description

In [73]:
def extract_ad(x):
    """Summarise a plan description as a comma-separated list of feature tags.

    A tag is emitted for each marker substring found in the description
    (case-sensitive), always in this order: 'high speed' -> 'High_Speed',
    'Unlimited' -> 'Unlimited_Social', 'call' -> 'Calls'. Returns an empty
    string when no marker is present.
    """
    markers = (
        ('high speed', 'High_Speed'),
        ('Unlimited', 'Unlimited_Social'),
        ('call', 'Calls'),
    )
    return ', '.join(tag for needle, tag in markers if needle in x)
    

# Replace each raw description by its feature-tag summary.
for plans in (mpa, mpa_test):
    plans['description'] = plans['description'].map(extract_ad)



In [74]:
# Display the training plan-attribute table after the transformations above.
mpa

Unnamed: 0,mobile_plan,description,price,duration,capacity
0,datasilver,High_Speed,100000,5,2
1,datagold,High_Speed,200000,5,5
2,socialmedia,Unlimited_Social,150000,3,1
3,socialmediagold,"High_Speed, Unlimited_Social",250000,3,3
4,datacall,"High_Speed, Unlimited_Social, Calls",200000,5,2


## Mobile Plan User

In [75]:
# Display the training plan/user table (rows lacking a plan or an accept flag
# were dropped in an earlier cell).
mpu

Unnamed: 0,id,mobile_plan,accept
0,11156.0,DATASILVER,1.0
1,4297.0,SOCIALMEDIAGOLD,1.0
2,13301.0,DATASILVER,1.0
3,9920.0,SOCIALMEDIAGOLD,1.0
4,8424.0,DATASILVER,1.0
...,...,...,...
45312,14062.0,DATAGOLD,0.0
45314,13710.0,DATAGOLD,0.0
45315,10956.0,SOCIALMEDIA,0.0
45316,14977.0,DATASILVER,0.0


In [76]:
# Lower-case the plan identifiers in both plan/user tables.
for assignments in (mpu, mpu_test):
    assignments['mobile_plan'] = assignments['mobile_plan'].str.lower()