# Dataset cleaning

## Libraries and datasets

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# Read the train and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('Train.csv', 'Test.csv'))

## Feature engineering

Drop MRG because it has the same value in every row.

In [3]:
# Remove the MRG column from both tables.
train = train.drop(columns=['MRG'])
test = test.drop(columns=['MRG'])

Drop user_id from train and test

In [4]:
# Remove the user_id column from both tables.
train = train.drop(columns=['user_id'])
test = test.drop(columns=['user_id'])

Drop TOP_PACK for the first iteration

In [5]:
# Remove the TOP_PACK column from both tables.
train = train.drop(columns=['TOP_PACK'])
test = test.drop(columns=['TOP_PACK'])

Convert tenure into a numeric format (lower bound of each range, in months)

In [6]:
# Map each TENURE bucket label to the lower bound of its range, in months.
# Labels that are not keys of the mapping become NaN.
# 'F 9-12 month' follows the ' month' suffix carried by every other label; the
# short key 'F 9-12' is kept as well so either spelling maps to 9.
# NOTE(review): confirm the exact spelling with train['TENURE'].unique().
train['TENURE'] = train['TENURE'].map({
    'D 3-6 month': 3,
    'E 6-9 month': 6,
    'F 9-12': 9,
    'F 9-12 month': 9,
    'G 12-15 month': 12,
    'H 15-18 month': 15,
    'I 18-21 month': 18,
    'J 21-24 month': 21,
    'K > 24 month': 24,
})

In [7]:
# Map each TENURE bucket label to the lower bound of its range, in months.
# Labels that are not keys of the mapping become NaN.
# 'F 9-12 month' follows the ' month' suffix carried by every other label; the
# short key 'F 9-12' is kept as well so either spelling maps to 9.
# NOTE(review): confirm the exact spelling with test['TENURE'].unique().
test['TENURE'] = test['TENURE'].map({
    'D 3-6 month': 3,
    'E 6-9 month': 6,
    'F 9-12': 9,
    'F 9-12 month': 9,
    'G 12-15 month': 12,
    'H 15-18 month': 15,
    'I 18-21 month': 18,
    'J 21-24 month': 21,
    'K > 24 month': 24,
})

In [8]:
# TENURE values that are still missing are replaced by the sentinel value 1.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that form is chained
# assignment on an intermediate object and does not update the frame when
# pandas Copy-on-Write is enabled.
train['TENURE'] = train['TENURE'].fillna(1)
test['TENURE'] = test['TENURE'].fillna(1)

X / y samples

In [9]:
# Take the CHURN column out of train; it becomes the target y.
y = train.pop('CHURN')

In [10]:
# X is a second name for the same DataFrame object as train (no copy is made).
X = train

Train-val split

In [11]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle.
# X and y are rebound to the training part, so re-running this cell splits again.
X, X_val, y, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

Encoding of categorical features

In [12]:
# Missing REGION values are replaced by the explicit category 'other'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that form is chained
# assignment on an intermediate object and does not update the frame when
# pandas Copy-on-Write is enabled.
X['REGION'] = X['REGION'].fillna('other')
X_val['REGION'] = X_val['REGION'].fillna('other')
test['REGION'] = test['REGION'].fillna('other')

In [13]:
# Learn the REGION label -> integer code mapping from the training split only,
# then apply the same mapping to the training, validation and test frames.
encoder = LabelEncoder().fit(X['REGION'])
X['REGION'] = encoder.transform(X['REGION'])
X_val['REGION'] = encoder.transform(X_val['REGION'])
test['REGION'] = encoder.transform(test['REGION'])

Scaling

In [14]:
# Columns that are passed to the scaler and the imputer below.
num_cols = [
    'MONTANT',
    'FREQUENCE_RECH',
    'REVENUE',
    'ARPU_SEGMENT',
    'FREQUENCE',
    'DATA_VOLUME',
    'ON_NET',
    'ORANGE',
    'TIGO',
    'ZONE1',
    'ZONE2',
    'REGULARITY',
    'FREQ_TOP_PACK',
]

In [15]:
# Unfitted StandardScaler with default settings; it is fitted in the next cell.
scaler = StandardScaler()

In [16]:
# Fit the scaler on the training split only, then apply the same scaling to
# the training, validation and test frames.
# NOTE(review): this runs before imputation, so num_cols may still contain NaN
# here - confirm that the scaler's NaN handling is the intended behaviour.
scaler.fit(X[num_cols])
X[num_cols] = scaler.transform(X[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

Imputing missing values

In [17]:
# Imputer that replaces NaN with the constant 0, fitted on the training split.
imp = SimpleImputer(
    strategy='constant',
    fill_value=0,
    missing_values=np.nan,
).fit(X[num_cols])

In [18]:
# Apply the fitted imputer to the numeric columns of each frame, in the order
# training, validation, test.
for _frame in (X, X_val, test):
    _frame[num_cols] = imp.transform(_frame[num_cols])

## "First generation" dataset production

In [19]:
# Display the processed training features.
X

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,FREQ_TOP_PACK
957056,12,18.0,-0.638518,-0.718161,-0.718841,-0.718706,-0.747321,-0.252563,-0.315014,-0.377476,-0.332739,0.000000,0.000000,0.446632,0.000000
536962,8,24.0,-0.708945,-0.793489,-0.699894,-0.699898,-0.747321,-0.249717,-0.313873,-0.450555,0.000000,0.000000,-0.220128,0.581223,0.000000
1575854,0,24.0,0.002368,1.541680,0.190890,0.190749,1.702509,0.028720,-0.212311,-0.236189,0.454859,0.000000,0.000000,-0.091734,1.363902
1543057,9,24.0,0.699596,0.110447,0.698418,0.698556,0.613696,-0.252563,0.355982,-0.338500,-0.316987,-0.208879,0.000000,1.433635,-0.267198
1520724,14,24.0,-0.596262,-0.416849,-0.586770,-0.586634,-0.407067,-0.186736,-0.273933,-0.401836,-0.316987,0.000000,0.000000,0.715815,-0.593418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73349,14,24.0,0.122094,-0.115537,0.206633,0.206631,0.001238,0.017487,-0.315014,-0.012078,0.785650,0.000000,0.000000,-0.271189,-0.348753
836489,11,24.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.251140,-0.292191,0.000000,0.000000,0.000000,0.000000,-1.123601,0.000000
491263,0,24.0,-0.004674,-0.040209,0.067875,0.067872,-0.134863,0.000000,-0.295615,-0.002334,0.360347,0.000000,0.000000,-0.226325,0.222132
491755,14,1.0,0.000000,0.000000,-0.767462,-0.767605,-0.883422,-0.252563,-0.313873,0.000000,0.000000,0.000000,0.000000,-0.989009,0.000000


In [20]:
# Display the processed validation features.
X_val

Unnamed: 0,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,REGULARITY,FREQ_TOP_PACK
417912,7,24.0,4.432232,0.713072,4.539630,4.539496,0.817848,-0.116789,1.192446,5.205805,2.439605,0.0,0.0,1.523363,1.934787
1380278,12,24.0,-0.286383,-0.341521,-0.280415,-0.280278,-0.475117,0.000000,-0.283062,-0.026694,-0.253979,0.0,0.0,-0.226325,-0.185643
657158,14,24.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,-0.181461,0.000000
228934,14,18.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,-1.213328,0.000000
2020145,2,24.0,1.770088,0.336431,1.753595,1.753456,0.205391,0.000000,0.094659,0.484863,-0.316987,0.0,0.0,1.119589,0.629907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1261773,0,24.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.312732,0.000000,0.000000,0.0,0.0,-0.764690,0.000000
494205,14,24.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,-0.136597,0.000000
1288545,0,24.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.247246,0.000000,0.000000,0.000000,0.0,0.0,-0.674963,0.000000
391519,12,24.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,-0.764690,0.000000


In [21]:
# Display the training target (CHURN).
y

957056     0
536962     0
1575854    0
1543057    0
1520724    0
          ..
73349      1
836489     0
491263     0
491755     1
128037     0
Name: CHURN, Length: 1723238, dtype: int64

In [22]:
# Display the validation target (CHURN).
y_val

417912     0
1380278    0
657158     1
228934     1
2020145    0
          ..
1261773    0
494205     1
1288545    0
391519     0
245290     0
Name: CHURN, Length: 430810, dtype: int64

Index(['REGION', 'TENURE', 'MONTANT', 'FREQUENCE_RECH', 'REVENUE',
       'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO',
       'ZONE1', 'ZONE2', 'REGULARITY', 'FREQ_TOP_PACK'],
      dtype='object')

In [23]:
# Write the training/validation features and targets to CSV files in the
# working directory.
for _path, _data in (
    ('0_X.csv', X),
    ('0_y.csv', y),
    ('0_X_val.csv', X_val),
    ('0_y_val.csv', y_val),
):
    _data.to_csv(_path)