In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
# featuretools for automated feature engineering
import featuretools as ft
import featuretools.variable_types as vtypes

In [6]:
# The value 365243 is mapped to NaN in every auxiliary table right after loading.
# NOTE(review): presumably a missing-value sentinel — confirm it applies to this dataset.
sentinel_to_nan = {365243: np.nan}

# Main ARPU tables are read as-is (no sentinel replacement).
arpu_train = pd.read_csv('data/arpu_train.csv')
arpu_test = pd.read_csv('data/arpu_test.csv')

# Subscriber info tables.
arpu_train_info = pd.read_csv('data/arpu_info_train.csv')
arpu_train_info = arpu_train_info.replace(sentinel_to_nan)
arpu_test_info = pd.read_csv('data/arpu_info_test.csv')
arpu_test_info = arpu_test_info.replace(sentinel_to_nan)

# "miseri" tables (source of COL_19 ... COL_27d used further down).
arpu_miseri_train = pd.read_csv('data/arpu_miseri_train.csv')
arpu_miseri_train = arpu_miseri_train.replace(sentinel_to_nan)
arpu_miseri_test = pd.read_csv('data/arpu_miseri_test.csv')
arpu_miseri_test = arpu_miseri_test.replace(sentinel_to_nan)

In [7]:
# One single-entity EntitySet per split. make_index=True asks featuretools to
# create the index column named by `index` instead of using an existing column.
es1 = ft.EntitySet(id='es1').entity_from_dataframe(
    entity_id='arpu_miseri_train',
    dataframe=arpu_miseri_train,
    index='miseri_index_train',
    make_index=True,
)
es2 = ft.EntitySet(id='es2').entity_from_dataframe(
    entity_id='arpu_miseri_test',
    dataframe=arpu_miseri_test,
    index='miseri_index_test',
    make_index=True,
)

In [8]:
# Widen text columns so long primitive descriptions are not cut off when displayed
pd.options.display.max_colwidth = 100

# Catalogue of the primitives featuretools offers, returned as a dataframe
primitives = ft.list_primitives()

# Show only the transform primitives (filter on 'aggregation' to see the others)
is_transform = primitives['type'] == 'transform'
primitives[is_transform]


Unnamed: 0,name,type,dask_compatible,koalas_compatible,description
22,and,transform,True,True,Element-wise logical AND of two lists.
23,modulo_numeric_scalar,transform,True,True,Return the modulo of each element in the list by a scalar.
24,longitude,transform,False,False,Returns the second tuple value in a list of LatLong tuples.
25,cum_mean,transform,False,False,Calculates the cumulative mean.
26,multiply_boolean,transform,True,False,Element-wise multiplication of two lists of boolean values.
27,add_numeric_scalar,transform,True,True,Add a scalar to each value in the list.
28,greater_than_equal_to,transform,True,True,Determines if values in one list are greater than or equal to another list.
29,week,transform,True,True,Determines the week of the year from a datetime.
30,hour,transform,True,True,Determines the hour value of a datetime.
31,isin,transform,True,True,Determines whether a value is present in a provided list.


In [9]:
# Explicitly chosen primitives (not the featuretools defaults):
# no aggregations, and two transforms applied to the single miseri entity.
agg_primitives = []
trans_primitives = ['cum_mean', 'is_null']

# Keyword arguments shared by the train and test DFS runs
dfs_settings = dict(
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    n_jobs=8,
    verbose=10,
    features_only=False,
    max_depth=1000,
    chunk_size=1000,
)

# Train split: returns the computed feature matrix and the feature definitions
feature_matrix1, feature_names1 = ft.dfs(
    entityset=es1, target_entity='arpu_miseri_train', **dfs_settings
)

# Test split: same settings, different EntitySet / target entity
feature_matrix2, feature_names2 = ft.dfs(
    entityset=es2, target_entity='arpu_miseri_test', **dfs_settings
)

Built 27 features
EntitySet scattered to 8 workers in 8 seconds                                                                          
Elapsed: 00:02 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████
Built 27 features
EntitySet scattered to 8 workers in 7 seconds                                                                          
Elapsed: 00:03 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [10]:
# Turn the DFS index back into a regular column (in place) and persist each matrix.
# NOTE: reset_index(inplace=True) is not idempotent — re-running this cell without
# re-running DFS resets the index a second time.
for dfs_matrix, csv_path in (
    (feature_matrix1, 'data/arpu_feature_train.csv'),
    (feature_matrix2, 'data/arpu_feature_test.csv'),
):
    dfs_matrix.reset_index(inplace=True)
    dfs_matrix.to_csv(csv_path, index=False)

In [11]:
# Read the persisted DFS feature matrices back from disk (train first, then test)
feature_train, feature_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/arpu_feature_train.csv', 'data/arpu_feature_test.csv')
)

In [12]:
# Generated features to discard, grouped by the primitive that produced them.
# The same lists apply to both splits; only the index-derived IS_NULL column
# differs because the index column is named per split.
cum_mean_drops = [
    'CUM_MEAN(COL_20)', 'CUM_MEAN(COL_21)', 'CUM_MEAN(COL_22)',
    'CUM_MEAN(COL_27a)', 'CUM_MEAN(COL_27b)', 'CUM_MEAN(COL_27c)', 'CUM_MEAN(COL_27d)',
]
is_null_drops = [
    'IS_NULL(msisdn)',
    'IS_NULL(COL_21)', 'IS_NULL(COL_22)',
    'IS_NULL(COL_27a)', 'IS_NULL(COL_27b)', 'IS_NULL(COL_27c)', 'IS_NULL(COL_27d)',
]

feature_train = feature_train.drop(
    columns=['IS_NULL(miseri_index_train)'] + is_null_drops + cum_mean_drops
)
feature_test = feature_test.drop(
    columns=['IS_NULL(miseri_index_test)'] + is_null_drops + cum_mean_drops
)

# Disabled experiments: dropping 'PERCENTILE(COL_21)' and 'CUM_MEAN(COL_19)'
# from both frames was tried and is currently switched off.

In [13]:
# If a frame contains at least one object-dtype column, every NaN in the WHOLE
# frame (all columns, not just the object ones) is replaced with the constant 0.
# NOTE(review): an earlier comment described this as "most frequent value"
# imputation; the code has always filled with 0 — confirm which one is intended.
if any(feature_train[name].dtypes == 'object' for name in feature_train.columns):
    feature_train = feature_train.fillna(0)

if any(feature_test[name].dtypes == 'object' for name in feature_test.columns):
    feature_test = feature_test.fillna(0)

# Checkpoint the cleaned frames
for cleaned_frame, csv_path in (
    (feature_train, 'data/feature_train.csv'),
    (feature_test, 'data/feature_test.csv'),
):
    cleaned_frame.to_csv(csv_path, index=False)

In [14]:
def rank_simple(vector):
    """Return the positions of `vector`'s elements in ascending order of value.

    This is an "argsort": element 0 of the result is the position of the
    smallest value, element 1 the position of the next smallest, and so on.
    Equal values keep their original relative order (the sort is stable).

    Parameters
    ----------
    vector : iterable of mutually comparable values
        The input is materialised into a list first so that lookups are always
        positional. Without this, a pandas Series is indexed by label and
        raises KeyError whenever its index is not 0..n-1.

    Returns
    -------
    list of int
        Positions (0-based) that would sort `vector`.
    """
    values = list(vector)
    return sorted(range(len(values)), key=values.__getitem__)

def rankdata(a):
    """Rank the values of `a` from 1..n, giving tied values their average rank.

    Parameters
    ----------
    a : iterable of mutually comparable values
        Materialised into a list so every lookup is positional; a pandas Series
        with a non-default index therefore works (label-based lookup would
        raise KeyError). Values that do not order consistently (e.g. NaN)
        give unspecified ranks, so impute missing values before calling.

    Returns
    -------
    list of float
        `result[i]` is the 1-based rank of `a[i]`; ties share the mean of the
        ranks they span. An empty input yields an empty list.
    """
    values = list(a)
    n = len(values)
    # Positions of the values in ascending order, and the values in that order
    order = sorted(range(n), key=values.__getitem__)
    ordered = [values[pos] for pos in order]

    ranks = [0] * n
    rank_sum = 0   # sum of 0-based ranks in the current run of equal values
    tie_count = 0  # length of the current run of equal values
    for i in range(n):
        rank_sum += i
        tie_count += 1
        # A run ends at the last element or when the next value differs
        if i == n - 1 or ordered[i] != ordered[i + 1]:
            # +1 converts the averaged 0-based rank to a 1-based rank
            average_rank = rank_sum / float(tie_count) + 1
            for j in range(i - tie_count + 1, i + 1):
                ranks[order[j]] = average_rank
            rank_sum = 0
            tie_count = 0
    return ranks

In [15]:
def get_age_bin(a, min_year=1950, max_year=2000):
    """Flag each value of `a` as inside (1) or outside (0) an inclusive range.

    Parameters
    ----------
    a : iterable of numbers
        Values to test — presumably birth years; confirm against the data.
        Any iterable is accepted (no length is required).
    min_year, max_year : number, optional
        Inclusive bounds of the accepted range. The defaults (1950 and 2000)
        reproduce the previously hard-coded limits.

    Returns
    -------
    list of int
        0 where the value is below `min_year` or above `max_year`, else 1.
        NaN compares False against both bounds, so missing values are flagged 1.
    """
    return [0 if (value < min_year or value > max_year) else 1 for value in a]

In [16]:
# Attach the COL_14 range flag from the subscriber-info table to each feature frame.
# NOTE(review): the flag list is assigned by position, so this assumes the info
# table and the feature frame have the same length and row order — verify.
for target_frame, info_frame in (
    (feature_train, arpu_train_info),
    (feature_test, arpu_test_info),
):
    target_frame['age_bin'] = get_age_bin(info_frame['COL_14'])

In [17]:
# Replace missing COL_14 values with the mean of COL_14 from the same split.
# The filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# operates on an intermediate object and, under pandas Copy-on-Write, leaves the
# frame itself unchanged (pandas 2.x emits a FutureWarning for it).
arpu_train_info['COL_14'] = arpu_train_info['COL_14'].fillna(arpu_train_info['COL_14'].mean())
arpu_test_info['COL_14'] = arpu_test_info['COL_14'].fillna(arpu_test_info['COL_14'].mean())

In [20]:
# Suffixes of the raw COL_* columns that each get a matching rank_col_* feature
rank_suffixes = ['19', '20', '21', '22', '27a', '27b', '27c', '27d']

# Ranks are computed independently within each split (train first, then test)
for ranked_frame in (feature_train, feature_test):
    for suffix in rank_suffixes:
        ranked_frame[f'rank_col_{suffix}'] = rankdata(ranked_frame[f'COL_{suffix}'])

In [21]:
# The raw COL_* columns are removed now that their rank features exist, together
# with the per-split index column created when the EntitySet was built.
raw_value_columns = ['COL_19', 'COL_20', 'COL_21', 'COL_22',
                     'COL_27a', 'COL_27b', 'COL_27c', 'COL_27d']

feature_train = feature_train.drop(columns=raw_value_columns + ['miseri_index_train'])
feature_test = feature_test.drop(columns=raw_value_columns + ['miseri_index_test'])

In [22]:
# Persist the final feature frames (overwrites the earlier checkpoint files)
for final_frame, csv_path in (
    (feature_train, 'data/feature_train.csv'),
    (feature_test, 'data/feature_test.csv'),
):
    final_frame.to_csv(csv_path, index=False)

In [23]:
# Display the final training feature frame (long frames are truncated in the rendered output)
feature_train

Unnamed: 0,msisdn,CUM_MEAN(COL_19),IS_NULL(COL_19),IS_NULL(COL_20),age_bin,rank_col_19,rank_col_20,rank_col_21,rank_col_22,rank_col_27a,rank_col_27b,rank_col_27c,rank_col_27d
0,00010016bb53e8e3c82e,90909.093750,False,True,1,32316.0,7588.5,27418.5,9790.0,2961.5,2961.5,5674.5,3034.0
1,0001007187b4881379a9,59560.096680,False,False,1,16446.0,25726.0,27418.5,33887.0,43355.0,43511.0,33294.0,43998.0
2,000100829f9d67fff3cc,112726.921224,False,False,1,51214.0,46250.0,27418.5,55837.0,46486.0,46665.0,48439.0,47242.0
3,000100a5e09da7ad6649,115903.003418,False,False,1,40729.0,54246.0,27418.5,23040.0,48753.0,48772.0,44247.0,48438.0
4,000100d7f3d9e70f62b0,133105.488672,False,False,1,50012.0,51209.0,27418.5,54171.0,46589.0,46758.0,50858.0,45080.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57716,0001e0e2c7aed96ff2fc,102311.797262,False,False,1,32769.0,51099.0,27418.5,43973.0,27347.0,27293.0,28337.0,31016.0
57717,0001f99f298dd21ce8c5,102310.539594,False,False,1,16880.0,40945.0,27418.5,21953.5,55881.0,55948.0,48627.0,56150.0
57718,0001fa0c3d21c0d758ec,102308.967581,False,False,1,11003.0,19320.5,27418.5,9790.0,31428.0,35769.0,15571.0,38978.0
57719,0001fc8336def40dff4b,102310.721938,False,False,1,50137.0,55911.0,27418.5,48049.0,37835.0,37793.0,38143.0,38927.0


# Relationships: linking subscriber info to call records

In [None]:
# Columns removed from both call tables; each table additionally drops its own
# index column, which is recreated later when the EntitySet is built.
# NOTE(review): drop raises KeyError if a listed column (e.g. 'Unnamed') is
# absent from the CSV — confirm the exact header names.
shared_call_drops = ['call_time', 'Unnamed', 'COL_1', 'PARTNER_MSISDN']

call_train = (
    pd.read_csv('data/call_train.csv')
    .replace({365243: np.nan})
    .drop(columns=shared_call_drops + ['call_train_index'])
)
call_test = (
    pd.read_csv('data/call_test.csv')
    .replace({365243: np.nan})
    .drop(columns=shared_call_drops + ['call_test_index'])
)

In [None]:
# Display the call-records training frame after the column drops
call_train

In [None]:
# Variable types declared for the subscriber-info entity. COL_14 is the only
# ordinal column; the rest are categorical. Train and test use the same mapping
# but keep separate dict objects.
arpu_train_info_types = {
    'COL_13': vtypes.Categorical,
    'COL_14': vtypes.Ordinal,
    'COL_15': vtypes.Categorical,
    'COL_16': vtypes.Categorical,
    'COL_17': vtypes.Categorical,
    'COL_18': vtypes.Categorical,
}
arpu_test_info_types = dict(arpu_train_info_types)

# Train EntitySet: subscriber info (indexed by msisdn) plus call records,
# whose index column is created by featuretools (make_index=True).
es3 = ft.EntitySet(id='es3')
es3 = es3.entity_from_dataframe(
    entity_id='arpu_train_info',
    dataframe=arpu_train_info,
    index='msisdn',
    variable_types=arpu_train_info_types,
)
es3 = es3.entity_from_dataframe(
    entity_id='call_train',
    dataframe=call_train,
    make_index=True,
    index='call_train_index',
)

# Test EntitySet: same layout as the train one
es4 = ft.EntitySet(id='es4')
es4 = es4.entity_from_dataframe(
    entity_id='arpu_test_info',
    dataframe=arpu_test_info,
    index='msisdn',
    variable_types=arpu_test_info_types,
)
es4 = es4.entity_from_dataframe(
    entity_id='call_test',
    dataframe=call_test,
    make_index=True,
    index='call_test_index',
)

# Relate the two entities of each set through their msisdn variables
# (first argument: info entity's msisdn; second: call entity's msisdn).
r_arpu_3 = ft.Relationship(es3['arpu_train_info']['msisdn'], es3['call_train']['msisdn'])
es3 = es3.add_relationships([r_arpu_3])

r_arpu_4 = ft.Relationship(es4['arpu_test_info']['msisdn'], es4['call_test']['msisdn'])
es4 = es4.add_relationships([r_arpu_4])

In [None]:
 # Show only the aggregation primitives from the `primitives` catalogue
 primitives[primitives['type'] == 'aggregation']

In [None]:
# Explicitly chosen primitives (not the featuretools defaults):
# three aggregations over the related call records, and no transforms.
# NOTE(review): 'call_time' was dropped from the call tables when they were
# loaded — check that 'trend' still has a time variable to work with.
agg_primitives = ['mean', 'sum', 'trend']
trans_primitives = []

# Keyword arguments shared by the train and test DFS runs
relational_dfs_settings = dict(
    agg_primitives=agg_primitives,
    trans_primitives=trans_primitives,
    n_jobs=1,
    verbose=10,
    features_only=False,
    max_depth=1000,
    chunk_size=1000,
)

# Train split: one row per entry of the 'arpu_train_info' entity
feature_matrix3, feature_names3 = ft.dfs(
    entityset=es3, target_entity='arpu_train_info', **relational_dfs_settings
)

# Test split: one row per entry of the 'arpu_test_info' entity
feature_matrix4, feature_names4 = ft.dfs(
    entityset=es4, target_entity='arpu_test_info', **relational_dfs_settings
)

In [None]:
# Move the DFS index back into a regular column, modifying each matrix in place
for relational_matrix in (feature_matrix3, feature_matrix4):
    relational_matrix.reset_index(inplace=True)

In [None]:
# Display the DFS feature matrix built from the train EntitySet (es3)
feature_matrix3

In [None]:
# Display the DFS feature matrix built from the test EntitySet (es4)
feature_matrix4