In [1]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import os, shutil
import time
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

import base64
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn import preprocessing


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """Replace column `name` of `df` with one indicator column per distinct value.

    Each new column is called ``"<name>-<value>"``. `df` is modified in place
    (indicator columns appended, original column dropped); nothing is returned.
    """
    indicators = pd.get_dummies(df[name])
    for value, indicator in indicators.items():
        df[f"{name}-{value}"] = indicator
    df.drop(columns=[name], inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    """Add one 0/1 column per target value, keeping the original column.

    For every `tv` in `target_values` a column ``"<name>-<tv>"`` is added to
    `df` (in place) holding 1 where ``str(value) == str(tv)`` and 0 elsewhere.

    Args:
        df: DataFrame to extend.
        name: Column whose values are compared.
        target_values: Iterable of values to create indicator columns for.
    """
    # The string form of the column does not depend on the target value, so it
    # is computed once instead of once per target.
    as_text = [str(value) for value in df[name].astype(str)]
    for tv in target_values:
        wanted = str(tv)
        df[f"{name}-{tv}"] = [1 if value == wanted else 0 for value in as_text]


# Encode text values to integer indexes (i.e. [0],[1],[2] for blue,green,red; classes are numbered in sorted order).
def encode_text_index(df, name):
    """Replace column `name` of `df` with integer labels, in place.

    Returns the fitted encoder's ``classes_`` array, i.e. the original values
    that the integer codes stand for.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardize column `name` of `df` in place as ``(value - mean) / sd``.

    `mean` and `sd` default to the column's own mean and standard deviation
    when they are not supplied.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill the missing entries of column `name` with that column's median, in place."""
    column = df[name]
    df[name] = column.fillna(column.median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill the missing entries of column `name` with `default_value`, in place."""
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split `df` into float32 feature and target arrays.

    Every column other than `target` becomes a feature. When the target dtype
    is int64 or int32 the target is one-hot encoded (classification);
    otherwise it is returned as a single float32 column (regression).

    Returns:
        Tuple ``(x, y)`` of float32 numpy arrays.
    """
    feature_cols = [col for col in df.columns if col != target]

    # `dtypes` is a single dtype for a Series but iterable when `df[target]`
    # selects more than one column; take the first entry in that case.
    target_type = df[target].dtypes
    if hasattr(target_type, '__iter__'):
        target_type = target_type[0]

    if target_type in (np.int64, np.int32):
        # Classification: one column per class.
        one_hot = pd.get_dummies(df[target])
        return df[feature_cols].values.astype(np.float32), one_hot.values.astype(np.float32)

    # Regression: keep the target two-dimensional with a single column.
    return df[feature_cols].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``."""
    per_hour = 60 * 60
    h = int(sec_elapsed / per_hour)
    within_hour = sec_elapsed % per_hour
    m = int(within_hour / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"


# Regression chart.
def chart_regression(pred, y, sort=True):
    """Plot expected values against predictions on one line chart.

    `y` is flattened before plotting. When `sort` is true, both series are
    ordered by the expected value so the expected curve is monotonic.
    """
    frame = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    ordered = frame.sort_values(by=['y']) if sort else frame
    for column, label in (('y', 'expected'), ('pred', 'prediction')):
        plt.plot(ordered[column].tolist(), label=label)
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is at least sd standard deviations away from its mean
def remove_outliers(df, name, sd):
    """Drop, in place, the rows whose `name` value is `sd` or more standard deviations from the column mean."""
    column = df[name]
    distance = np.abs(column - column.mean())
    too_far = distance >= (sd * column.std())
    df.drop(df.index[too_far], axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column `name` of `df` in place.

    Values are mapped so that `data_low` lands on `normalized_low` and
    `data_high` lands on `normalized_high`.

    Args:
        df: DataFrame to modify.
        name: Column to rescale.
        normalized_low: Output value for `data_low`.
        normalized_high: Output value for `data_high`.
        data_low: Lower reference value; defaults to the column minimum.
        data_high: Upper reference value; defaults to the column maximum.
    """
    column = df[name]
    # Each bound is defaulted on its own. Previously `data_high` was only
    # filled in when `data_low` was missing, so passing just `data_low` failed
    # with a TypeError and passing just `data_high` was silently ignored.
    # The pandas reductions also skip NaN, unlike the builtin min()/max().
    if data_low is None:
        data_low = column.min()
    if data_high is None:
        data_high = column.max()

    df[name] = ((column - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
    
# Load the data; the two modelling frames are indexed by user_id straight away.
train = pd.read_csv('Train.csv').set_index('user_id')
test = pd.read_csv('Test.csv').set_index('user_id')
sample_sub = pd.read_csv('sample_submission.csv')
variable_def = pd.read_csv('VariableDefinitions.csv')

# Missing regions are filled with 'DAKAR', then REGION is one-hot encoded.
# NOTE(review): each frame is encoded on its own, so the resulting columns only
# line up if both files contain the same set of regions -- confirm.
for _frame in (train, test):
    _frame['REGION'] = _frame['REGION'].fillna('DAKAR')
    encode_text_dummy(_frame, name='REGION')

def enco(data):
    """Return 1 when `data` equals the tenure label 'K > 24 month', otherwise 0."""
    return 1 if data == 'K > 24 month' else 0
for _frame in (train, test):
    # TENURE becomes a 0/1 flag via enco().
    _frame['TENURE'] = _frame['TENURE'].apply(enco)
    # Missing MONTANT values take the frame's own most frequent value.
    _most_common = _frame['MONTANT'].mode()[0]
    _frame['MONTANT'] = _frame['MONTANT'].fillna(_most_common)

def bin_montant(data, min_val, max_val, range_val):
    """Map every value in `data` to the index of a fixed-width integer bin.

    Bins are ``int((max_val + 1 - min_val) / range_val)`` wide and start at 0
    (not at `min_val`). When the width does not divide the range evenly, the
    last bin is stretched so that `max_val` is included. The bin table is
    printed as a side effect.

    Values are truncated with ``int()`` before lookup. Values that fall below
    the first bin or past the last bin edge are assigned to the first / last
    bin; this used to raise IndexError, e.g. whenever `min_val` > 0 and the
    range divided evenly into bins.

    Args:
        data: Iterable of numeric values.
        min_val: Smallest expected value.
        max_val: Largest expected value.
        range_val: Requested number of bins (one extra bin appears when the
            range does not divide evenly).

    Returns:
        List of int bin indices, one per value in `data`.

    Raises:
        ValueError: If `range_val` is not positive or `max_val` < `min_val`.
    """
    if range_val <= 0:
        raise ValueError("range_val must be a positive number of bins")
    range_of_vals = int(max_val + 1 - min_val)
    if range_of_vals <= 0:
        raise ValueError("max_val must not be smaller than min_val")
    # Asking for more bins than there are integers gave a width of 0, which
    # crashed range() with a zero step; fall back to width 1.
    batch_value = max(1, int(range_of_vals / range_val))

    dict_batch = {}
    for i, start in enumerate(range(0, range_of_vals, batch_value)):
        stop = start + batch_value
        if stop > range_of_vals:
            # Final partial bin: end it just past max_val.
            stop = int(max_val) + 1
        dict_batch[str(i)] = (start, stop)
    print(dict_batch)

    # Bins are contiguous and equally wide, so the index follows from integer
    # division; clamping sends out-of-table values to the nearest end bin.
    last_bin = len(dict_batch) - 1
    return [min(max(int(k) // batch_value, 0), last_bin) for k in data]

# Bin MONTANT into (nominally) 30 bins, then swap the raw column for the bin index.
# NOTE(review): train and test are binned over their own min/max, so the same
# bin index can stand for different amounts in the two frames -- confirm intended.
_train_amount = train['MONTANT']
montant = bin_montant(_train_amount, _train_amount.min(), _train_amount.max(), 30)
_test_amount = test['MONTANT']
montant1 = bin_montant(_test_amount, _test_amount.min(), _test_amount.max(), 30)

train['MONTANT_BIN'] = montant
test['MONTANT_BIN'] = montant1

for _frame in (train, test):
    _frame.drop(columns=['MONTANT'], inplace=True)

def enco(data):
    """Return 1 when `data` equals 0 (the first bin), otherwise 0."""
    return 1 if data == 0 else 0

for _frame in (train, test):
    # Collapse the MONTANT bin index to a "first bin" 0/1 flag.
    _frame['MONTANT_BIN'] = _frame['MONTANT_BIN'].apply(enco)
    # Missing FREQUENCE_RECH values take the frame's own mean, then everything
    # is rounded to whole numbers.
    _recharges = _frame['FREQUENCE_RECH']
    _frame['FREQUENCE_RECH'] = _recharges.fillna(_recharges.mean())
    _frame['FREQUENCE_RECH'] = _frame['FREQUENCE_RECH'].round()

def bin_freq_rech(data, min_val, max_val, range_val):
    """Map every value in `data` to the index of a fixed-width integer bin.

    Bins are ``int((max_val + 1 - min_val) / range_val)`` wide and start at 0
    (not at `min_val`). When the width does not divide the range evenly, the
    last bin is stretched so that `max_val` is included. The bin table is
    printed as a side effect.

    Values are truncated with ``int()`` before lookup. Values that fall below
    the first bin or past the last bin edge are assigned to the first / last
    bin; this used to raise IndexError, e.g. whenever `min_val` > 0 and the
    range divided evenly into bins.

    Args:
        data: Iterable of numeric values.
        min_val: Smallest expected value.
        max_val: Largest expected value.
        range_val: Requested number of bins (one extra bin appears when the
            range does not divide evenly).

    Returns:
        List of int bin indices, one per value in `data`.

    Raises:
        ValueError: If `range_val` is not positive or `max_val` < `min_val`.
    """
    if range_val <= 0:
        raise ValueError("range_val must be a positive number of bins")
    range_of_vals = int(max_val + 1 - min_val)
    if range_of_vals <= 0:
        raise ValueError("max_val must not be smaller than min_val")
    # A width of 0 (more bins requested than integers available) crashed
    # range() with a zero step; fall back to width 1.
    batch_value = max(1, int(range_of_vals / range_val))

    dict_batch = {}
    for i, start in enumerate(range(0, range_of_vals, batch_value)):
        stop = start + batch_value
        if stop > range_of_vals:
            # Final partial bin: end it just past max_val.
            stop = int(max_val) + 1
        dict_batch[str(i)] = (start, stop)
    print(dict_batch)

    # Equal-width contiguous bins: index by integer division, clamped to the
    # first and last bin.
    last_bin = len(dict_batch) - 1
    return [min(max(int(k) // batch_value, 0), last_bin) for k in data]

# Bin FREQUENCE_RECH with hand-picked bounds.
# NOTE(review): the min/max/bin-count arguments are hard-coded and differ
# between train and test -- confirm they still match the data.
freq_out = bin_freq_rech(train['FREQUENCE_RECH'], min_val=1, max_val=133, range_val=10)
freq_out1 = bin_freq_rech(test['FREQUENCE_RECH'], min_val=1, max_val=120, range_val=9)

train['FREQ_BIN'], test['FREQ_BIN'] = freq_out, freq_out1

def enco(data):
    """Return 1 when `data` equals 0 (the first bin), otherwise 0."""
    if data == 0:
        return 1
    return 0

for _frame in (train, test):
    # Flag the first FREQUENCE_RECH bin and discard the raw column.
    _frame['FREQ_BIN'] = _frame['FREQ_BIN'].apply(enco)
    _frame.drop(columns=['FREQUENCE_RECH'], inplace=True)

    # REVENUE and ARPU_SEGMENT: fill gaps with the frame's mean, then log1p.
    for _col in ('REVENUE', 'ARPU_SEGMENT'):
        _filled = _frame[_col].fillna(_frame[_col].mean())
        _frame[_col] = np.log1p(_filled)

    # FREQUENCE: fill gaps with the frame's median.
    _frame['FREQUENCE'] = _frame['FREQUENCE'].fillna(_frame['FREQUENCE'].median())

def bin_freq(data, min_val, max_val, range_val):
    """Map every value in `data` to the index of a fixed-width integer bin.

    Bins are ``int((max_val + 1 - min_val) / range_val)`` wide and start at 0
    (not at `min_val`). When the width does not divide the range evenly, the
    last bin is stretched so that `max_val` is included. The bin table is
    printed as a side effect.

    Values are truncated with ``int()`` before lookup. Values that fall below
    the first bin or past the last bin edge are assigned to the first / last
    bin; this used to raise IndexError, e.g. whenever `min_val` > 0 and the
    range divided evenly into bins.

    Args:
        data: Iterable of numeric values.
        min_val: Smallest expected value.
        max_val: Largest expected value.
        range_val: Requested number of bins (one extra bin appears when the
            range does not divide evenly).

    Returns:
        List of int bin indices, one per value in `data`.

    Raises:
        ValueError: If `range_val` is not positive or `max_val` < `min_val`.
    """
    if range_val <= 0:
        raise ValueError("range_val must be a positive number of bins")
    range_of_vals = int(max_val + 1 - min_val)
    if range_of_vals <= 0:
        raise ValueError("max_val must not be smaller than min_val")
    # A width of 0 (more bins requested than integers available) crashed
    # range() with a zero step; fall back to width 1.
    batch_value = max(1, int(range_of_vals / range_val))

    dict_batch = {}
    for i, start in enumerate(range(0, range_of_vals, batch_value)):
        stop = start + batch_value
        if stop > range_of_vals:
            # Final partial bin: end it just past max_val.
            stop = int(max_val) + 1
        dict_batch[str(i)] = (start, stop)
    print(dict_batch)

    # Equal-width contiguous bins: index by integer division, clamped to the
    # first and last bin.
    last_bin = len(dict_batch) - 1
    return [min(max(int(k) // batch_value, 0), last_bin) for k in data]
# Bin FREQUENCE into (nominally) 9 bins over each frame's own min/max.
_train_freq = train['FREQUENCE']
freq1 = bin_freq(_train_freq, _train_freq.min(), _train_freq.max(), 9)
_test_freq = test['FREQUENCE']
freq2 = bin_freq(_test_freq, _test_freq.min(), _test_freq.max(), 9)

# NOTE(review): FREQ_BIN already holds the FREQUENCE_RECH flag at this point;
# these assignments replace it, so that feature is lost -- confirm intended.
train['FREQ_BIN'], test['FREQ_BIN'] = freq1, freq2

def enco(data):
    """Return 1 when `data` equals 0 or 1 (the first two bins), otherwise 0."""
    return 1 if (data == 0 or data == 1) else 0
for _frame in (train, test):
    # Flag the lowest FREQUENCE bins and discard the raw column.
    _frame['FREQ_BIN'] = _frame['FREQ_BIN'].apply(enco)
    _frame.drop(columns=['FREQUENCE'], inplace=True)

    # DATA_VOLUME, ON_NET and ORANGE: fill gaps with the frame's median, then log1p.
    for _col in ('DATA_VOLUME', 'ON_NET', 'ORANGE'):
        _filled = _frame[_col].fillna(_frame[_col].median())
        _frame[_col] = np.log1p(_filled)

def bin_orange(data, min_val, max_val, range_val):
    """Map every value in `data` to the index of a fixed-width integer bin.

    Bins are ``int((max_val + 1 - min_val) / range_val)`` wide and start at 0
    (not at `min_val`). When the width does not divide the range evenly, the
    last bin is stretched so that `max_val` is included. The bin table is
    printed as a side effect.

    Values are truncated with ``int()`` before lookup. Values that fall below
    the first bin or past the last bin edge are assigned to the first / last
    bin; this used to raise IndexError, e.g. whenever `min_val` > 0 and the
    range divided evenly into bins.

    Args:
        data: Iterable of numeric values.
        min_val: Smallest expected value.
        max_val: Largest expected value.
        range_val: Requested number of bins (one extra bin appears when the
            range does not divide evenly).

    Returns:
        List of int bin indices, one per value in `data`.

    Raises:
        ValueError: If `range_val` is not positive or `max_val` < `min_val`.
    """
    if range_val <= 0:
        raise ValueError("range_val must be a positive number of bins")
    range_of_vals = int(max_val + 1 - min_val)
    if range_of_vals <= 0:
        raise ValueError("max_val must not be smaller than min_val")
    # A width of 0 (more bins requested than integers available) crashed
    # range() with a zero step; fall back to width 1.
    batch_value = max(1, int(range_of_vals / range_val))

    dict_batch = {}
    for i, start in enumerate(range(0, range_of_vals, batch_value)):
        stop = start + batch_value
        if stop > range_of_vals:
            # Final partial bin: end it just past max_val.
            stop = int(max_val) + 1
        dict_batch[str(i)] = (start, stop)
    print(dict_batch)

    # Equal-width contiguous bins: index by integer division, clamped to the
    # first and last bin.
    last_bin = len(dict_batch) - 1
    return [min(max(int(k) // batch_value, 0), last_bin) for k in data]
# Replace the (log-scaled) ORANGE column with its bin index, using each
# frame's own min/max and a nominal 8 bins.
_train_orange = train['ORANGE']
orange = bin_orange(_train_orange, _train_orange.min(), _train_orange.max(), 8)
_test_orange = test['ORANGE']
orange1 = bin_orange(_test_orange, _test_orange.min(), _test_orange.max(), 8)
train['ORANGE'], test['ORANGE'] = orange, orange1

# TIGO: fill gaps with the frame's mean, then round to whole numbers.
for _frame in (train, test):
    _tigo_filled = _frame['TIGO'].fillna(_frame['TIGO'].mean())
    _frame['TIGO'] = np.round(_tigo_filled)

def bin_tigo(data, min_val, max_val, range_val):
    """Map every value in `data` to the index of a fixed-width integer bin.

    Bins are ``int((max_val + 1 - min_val) / range_val)`` wide and start at 0
    (not at `min_val`). When the width does not divide the range evenly, the
    last bin is stretched so that `max_val` is included. The bin table is
    printed as a side effect.

    Values are truncated with ``int()`` before lookup. Values that fall below
    the first bin or past the last bin edge are assigned to the first / last
    bin; this used to raise IndexError, e.g. whenever `min_val` > 0 and the
    range divided evenly into bins.

    Args:
        data: Iterable of numeric values.
        min_val: Smallest expected value.
        max_val: Largest expected value.
        range_val: Requested number of bins (one extra bin appears when the
            range does not divide evenly).

    Returns:
        List of int bin indices, one per value in `data`.

    Raises:
        ValueError: If `range_val` is not positive or `max_val` < `min_val`.
    """
    if range_val <= 0:
        raise ValueError("range_val must be a positive number of bins")
    range_of_vals = int(max_val + 1 - min_val)
    if range_of_vals <= 0:
        raise ValueError("max_val must not be smaller than min_val")
    # A width of 0 (more bins requested than integers available) crashed
    # range() with a zero step; fall back to width 1.
    batch_value = max(1, int(range_of_vals / range_val))

    dict_batch = {}
    for i, start in enumerate(range(0, range_of_vals, batch_value)):
        stop = start + batch_value
        if stop > range_of_vals:
            # Final partial bin: end it just past max_val.
            stop = int(max_val) + 1
        dict_batch[str(i)] = (start, stop)
    print(dict_batch)

    # Equal-width contiguous bins: index by integer division, clamped to the
    # first and last bin.
    last_bin = len(dict_batch) - 1
    return [min(max(int(k) // batch_value, 0), last_bin) for k in data]
# Replace TIGO with its bin index (nominally 10 bins over each frame's own min/max).
_train_tigo = train['TIGO']
tigo = bin_tigo(_train_tigo, _train_tigo.min(), _train_tigo.max(), 10)
_test_tigo = test['TIGO']
tigo1 = bin_tigo(_test_tigo, _test_tigo.min(), _test_tigo.max(), 10)

train['TIGO'], test['TIGO'] = tigo, tigo1

def enco(data):
    """Return 1 when `data` equals 0 (the first bin), otherwise 0."""
    flag = 0
    if data == 0:
        flag = 1
    return flag

# Columns removed from both frames without being used as features.
to_drop1 = ['ZONE1', 'ZONE2', 'MRG']

for _frame in (train, test):
    # Collapse the TIGO bin index to a "first bin" 0/1 flag.
    _frame['TIGO'] = _frame['TIGO'].apply(enco)
    _frame.drop(columns=to_drop1, inplace=True)

def bin_regu(data, min_val, max_val, range_val):
    """Map every value in `data` to the index of a fixed-width integer bin.

    Bins are ``int((max_val + 1 - min_val) / range_val)`` wide and start at 0
    (not at `min_val`). When the width does not divide the range evenly, the
    last bin is stretched so that `max_val` is included. The bin table is
    printed as a side effect.

    Values are truncated with ``int()`` before lookup. Values that fall below
    the first bin or past the last bin edge are assigned to the first / last
    bin; this used to raise IndexError, e.g. whenever `min_val` > 0 and the
    range divided evenly into bins.

    Args:
        data: Iterable of numeric values.
        min_val: Smallest expected value.
        max_val: Largest expected value.
        range_val: Requested number of bins (one extra bin appears when the
            range does not divide evenly).

    Returns:
        List of int bin indices, one per value in `data`.

    Raises:
        ValueError: If `range_val` is not positive or `max_val` < `min_val`.
    """
    if range_val <= 0:
        raise ValueError("range_val must be a positive number of bins")
    range_of_vals = int(max_val + 1 - min_val)
    if range_of_vals <= 0:
        raise ValueError("max_val must not be smaller than min_val")
    # A width of 0 (more bins requested than integers available) crashed
    # range() with a zero step; fall back to width 1.
    batch_value = max(1, int(range_of_vals / range_val))

    dict_batch = {}
    for i, start in enumerate(range(0, range_of_vals, batch_value)):
        stop = start + batch_value
        if stop > range_of_vals:
            # Final partial bin: end it just past max_val.
            stop = int(max_val) + 1
        dict_batch[str(i)] = (start, stop)
    print(dict_batch)

    # Equal-width contiguous bins: index by integer division, clamped to the
    # first and last bin.
    last_bin = len(dict_batch) - 1
    return [min(max(int(k) // batch_value, 0), last_bin) for k in data]
# Replace REGULARITY with its bin index (nominally 6 bins over each frame's own min/max).
regu = bin_regu(train['REGULARITY'], train['REGULARITY'].min(), train['REGULARITY'].max(), 6)
regu1 = bin_regu(test['REGULARITY'], test['REGULARITY'].min(), test['REGULARITY'].max(), 6)

train['REGULARITY'] = regu
test['REGULARITY'] = regu1

# TOP_PACK is not used as a feature.
train.drop('TOP_PACK', axis=1, inplace=True)
test.drop('TOP_PACK', axis=1, inplace=True)

# Fill missing FREQ_TOP_PACK values with the frame's mean. The result is
# assigned back, like every other imputation in this notebook: calling
# fillna(..., inplace=True) on the selected column acts on an intermediate
# object and is not guaranteed to update the frame (it warns in pandas 2.x
# and has no effect under Copy-on-Write).
train['FREQ_TOP_PACK'] = train['FREQ_TOP_PACK'].fillna(train['FREQ_TOP_PACK'].mean())
test['FREQ_TOP_PACK'] = test['FREQ_TOP_PACK'].fillna(test['FREQ_TOP_PACK'].mean())

{'0': (0, 7551), '1': (7551, 15102), '2': (15102, 22653), '3': (22653, 30204), '4': (30204, 37755), '5': (37755, 45306), '6': (45306, 52857), '7': (52857, 60408), '8': (60408, 67959), '9': (67959, 75510), '10': (75510, 83061), '11': (83061, 90612), '12': (90612, 98163), '13': (98163, 105714), '14': (105714, 113265), '15': (113265, 120816), '16': (120816, 128367), '17': (128367, 135918), '18': (135918, 143469), '19': (143469, 151020), '20': (151020, 158571), '21': (158571, 166122), '22': (166122, 173673), '23': (173673, 181224), '24': (181224, 188775), '25': (188775, 196326), '26': (196326, 203877), '27': (203877, 211428), '28': (211428, 218979), '29': (218979, 226530), '30': (226530, 226551)}
{'0': (0, 6715), '1': (6715, 13430), '2': (13430, 20145), '3': (20145, 26860), '4': (26860, 33575), '5': (33575, 40290), '6': (40290, 47005), '7': (47005, 53720), '8': (53720, 60435), '9': (60435, 67150), '10': (67150, 73865), '11': (73865, 80580), '12': (80580, 87295), '13': (87295, 94010), '14':

### Modeling

In [2]:
# Extract the feature matrices and the target.
# pop() hands back the CHURN column and removes it from `train` in one step,
# so this cell cannot be re-run without rebuilding `train` first.
y = train.pop('CHURN').values
X = train.values
test_data = test.values

# One-hot encode the target for the softmax output layer.
from keras.utils import to_categorical
y = to_categorical(y)

In [4]:
# Build model
import keras
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential
from keras.regularizers import l1, l2
from keras.callbacks import TensorBoard, LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Fully connected classifier: widening Dense/ReLU stack with dropout and L2
# regularisation on some layers, ending in a 2-way softmax.
model = Sequential()
# The input width is taken from the feature matrix rather than hard-coded (it
# used to be 25), so a change in preprocessing that adds or removes a column
# no longer breaks fit()/predict() with a shape mismatch.
model.add(Dense(32, input_shape=(X.shape[1],), activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu', kernel_regularizer=l2(1e-3)))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu', kernel_regularizer=l2(1e-3)))
model.add(Dense(256, activation='relu'))
model.add(Dense(512, activation='relu', kernel_regularizer=l2(1e-3)))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(512, activation='relu'))
# Two output units to match the one-hot encoded target.
model.add(Dense(2,activation='softmax'))
model.compile(loss=['categorical_crossentropy'], optimizer='rmsprop', metrics=['accuracy'])

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
import statistics
import time

# 5-fold training loop with a fixed shuffle seed.
# NOTE(review): the same `model` instance is fitted in every fold, so from the
# second fold on the validation rows have already been trained on -- the
# per-fold val_loss is not an independent estimate. Confirm this is intended.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
epochs_needed = []
score = []
fold = 0
# The index arrays get their own names: unpacking into `train, test` replaced
# the `train` / `test` DataFrames with numpy index arrays for every later cell.
for train_idx, test_idx in kfold.split(X, y):
    start_time = time.time()
    fold += 1
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    y_test = y[test_idx]

    # NOTE(review): EarlyStopping and ReduceLROnPlateau share patience=5, so
    # training presumably stops before a reduced learning rate is ever used.
    monitor_1 = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=0, mode='auto', restore_best_weights=True)
    monitor_2 = ModelCheckpoint(monitor='val_loss', filepath='my_model.h5', save_best_only=True)
    monitor_3 = ReduceLROnPlateau(monitor='val_loss', patience=5, factor=0.1)
    monitor_4 = TensorBoard(log_dir = 'my_log_dir', histogram_freq=1)
    print('Training fold #,', fold)

    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=100, callbacks=[monitor_1, monitor_2, monitor_3, monitor_4])
    

Training fold #, 1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training fold #, 2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training fold #, 3
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

In [9]:
# Class probabilities for the submission set, shape (n_rows, 2).
# `Sequential.predict_proba` has been removed from Keras/TensorFlow; for a
# softmax output layer `predict` returns the same probabilities.
ay = model.predict(test_data)

In [11]:
# Submission frame: ids from the sample file, probability of class 1 from the model.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as Test.csv -- confirm.
sub = pd.DataFrame({
    'user_id': sample_sub['user_id'],
    'Churn': ay[:, 1],
})

In [15]:
# Spot-check a slice of the predictions (rows labelled 500 through 600, inclusive).
sub.loc[500:600]

Unnamed: 0,user_id,Churn
500,84220893d61733416d3c98ae789dfc091f2cb173,0.5
501,118f5f3f99a718ea2dd49c3f93050cec0800e6ed,0.5
502,39bc0a79b0ccc1b652372c300344d4defabf6fe6,0.5
503,e08b899018589f8c6870e6508d70bed2b2226067,0.5
504,d7511d775f1251565dac3cd13329aa79125d2bf2,0.5
...,...,...
596,274f9149c6a9a130a7e82a4c81b9534038550da5,0.5
597,3fd0f9ac20f0f1a6eaf04be5439a48164fac35f8,0.5
598,e4f7a5186364ffff7a026e543a0f7ee894680dc3,0.5
599,8e6f43168e133e74e406790943974a4a648e01ba,0.5


In [3]:
import pandas as pd
import numpy as np

# The three prediction files that get blended below.
no_1, no_2, no_3 = (
    pd.read_csv(_path) for _path in ('new.csv', 'blending.csv', 'cat.csv')
)

In [4]:
# Pull the 'Churn' column out of each file as a numpy array.
first_pred, second_pred, third_pred = (
    np.array(_frame['Churn']) for _frame in (no_1, no_2, no_3)
)

In [7]:
# Unweighted average of the three prediction vectors.
# NOTE(review): assumes all three files list users in the same order -- confirm.
_blend_inputs = (first_pred, second_pred, third_pred)
mean_all = (_blend_inputs[0] + _blend_inputs[1] + _blend_inputs[2]) / len(_blend_inputs)

In [9]:
# Write the blended predictions, keyed by the ids of the first file.
sub = pd.DataFrame({
    'user_id': no_1['user_id'],
    'Churn': mean_all,
})
sub.to_csv('mean.csv', index=False)