In [1]:
# Libraries
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
#%matplotlib inline
import random
from pprint import pprint
import time
from joblib import Parallel, delayed, parallel_backend, Memory
import math

In [9]:
# Helper functions
# Splitting the data
def train_test_split(data, test_size):
    """Randomly partition a DataFrame into a training and a testing subset.

    Args:
        data: pandas DataFrame to split; rows are addressed by index label.
        test_size: number of rows to hold out, or a float interpreted as a
            fraction of ``len(data)`` (rounded to the nearest integer).

    Returns:
        Tuple ``(train_data, test_data)``.
    """
    # A float is a proportion of the dataset, anything else is used as-is.
    n_test = round(test_size * len(data)) if isinstance(test_size, float) else test_size

    # Draw the held-out labels without replacement with the ``random`` module,
    # so ``random.seed`` controls which rows are selected.
    sampled_labels = random.sample(population=data.index.tolist(), k=n_test)

    held_out = data.loc[sampled_labels]
    remaining = data.drop(sampled_labels)

    return remaining, held_out

#Check data purity
def check_purity(data):
    """Tell whether every row of ``data`` carries the same class label.

    The label is read from the last column of the 2-D array ``data``.
    Returns ``True`` when exactly one distinct label is present, ``False``
    otherwise.
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1
    
#Normal classification using majority
def classify(data):
    """Return the majority class label of ``data``.

    The label is read from the last column of the 2-D array ``data``. When
    several labels are tied, the first one in ``np.unique`` (sorted) order is
    returned, because ``argmax`` keeps the first maximum.
    """
    labels = data[:, -1]
    classes, occurrences = np.unique(labels, return_counts=True)
    return classes[occurrences.argmax()]

# Modified Classification using proposed approach
def modified_classify(data, index_montant=8, index_taux=10, index_nbe=9):
    """Label a node by comparing two monetary totals instead of class counts.

    Two totals are accumulated over the rows of ``data``:

    * ``P``: sum of the ``index_montant`` column over rows labelled
      ``'IMPAYE'``.
    * ``M``: sum of ``(taux / 100) * montant * nbe / 12`` over all other rows.

    Args:
        data: 2-D array whose last column is the class label.
        index_montant: column holding the amount (default 8).
        index_taux: column holding the rate, expressed in percent (default 10).
        index_nbe: column whose value is divided by 12 in the ``M`` formula
            (default 9) -- presumably a number of monthly instalments;
            TODO confirm against the dataset.

    Returns:
        ``'PAYE'`` when ``M > P``, otherwise ``'IMPAYE'``.
    """
    P = 0
    M = 0
    for row in data:
        if row[-1] == 'IMPAYE':
            P += row[index_montant]
        else:
            M += (row[index_taux] / 100) * row[index_montant] * row[index_nbe] / 12

    # Conditional expression rather than indexing a tuple with a boolean.
    return 'PAYE' if M > P else 'IMPAYE'
        

#Calculate all potential splits
def get_splits(data):
    """Enumerate the candidate split values of every feature column.

    The last column (the label) and column 8 are never used as split
    features. The kind of each column is looked up in the module-level
    ``FEATURE_TYPES`` list.

    Returns:
        Dict mapping column index to its candidates: for a "Continous"
        column, the list of midpoints between consecutive sorted unique
        values; for any other column, the array of unique values itself.
    """
    excluded_col = 8
    n_features = data.shape[1] - 1

    candidates = {}
    for col in range(n_features):
        if col == excluded_col:
            continue

        distinct = np.unique(data[:, col])
        if FEATURE_TYPES[col] == "Continous":
            # Midpoint between each pair of neighbouring unique values.
            candidates[col] = [
                np.mean([distinct[pos], distinct[pos - 1]])
                for pos in range(1, len(distinct))
            ]
        else:
            candidates[col] = distinct

    return candidates

#Spliting data
def split_data(data, feature_col, value):
    """Split the rows of ``data`` in two according to one feature.

    For a "Continous" feature (as recorded in the module-level
    ``FEATURE_TYPES``) the first part holds rows with ``feature <= value`` and
    the second rows with ``feature > value``. For any other feature the first
    part holds rows equal to ``value`` and the second the rows that differ.

    Returns:
        Tuple ``(data_inf, data_sup)`` of row subsets.
    """
    column = data[:, feature_col]

    if FEATURE_TYPES[feature_col] == "Continous":
        first_mask, second_mask = column <= value, column > value
    else:
        first_mask, second_mask = column == value, column != value

    return data[first_mask], data[second_mask]


#Calculate chosen metric
def calculate_metric(data):
    """Compute the impurity of ``data`` under the module-level ``METRIC``.

    The class label is read from the last column of the 2-D array ``data``.

    Returns:
        Shannon entropy (base 2) of the label distribution when ``METRIC`` is
        ``"entropy"``; Gini impurity when ``METRIC`` is ``"gini"``.

    Raises:
        ValueError: if ``METRIC`` is neither ``"entropy"`` nor ``"gini"``.
    """
    label_class = data[:, -1]
    _, counts = np.unique(label_class, return_counts=True)

    # Relative frequency of each class present in the node.
    probabilities = counts / counts.sum()

    if METRIC == "entropy":
        return np.sum(probabilities * -np.log2(probabilities))
    if METRIC == "gini":
        return 1 - np.sum(probabilities ** 2)

    # Fail loudly instead of hitting an unbound local further down.
    raise ValueError("unknown metric {!r}; expected 'entropy' or 'gini'".format(METRIC))

# Overall metric value
def overall_metric(data_inf, data_sup):
    """Return the size-weighted impurity of a two-way split.

    Each part contributes ``calculate_metric(part)`` weighted by its share of
    the total number of rows.
    """
    total_rows = len(data_inf) + len(data_sup)

    weighted_parts = [
        (len(part) / total_rows) * calculate_metric(part)
        for part in (data_inf, data_sup)
    ]

    return weighted_parts[0] + weighted_parts[1]

# Modified metric: the aim is to compute the total loss we would incur
def modified_metric(data):
    """Sum the amount column (index 8) over the rows labelled ``'IMPAYE'``.

    The label is read from the last column of the 2-D array ``data``.
    Returns ``0`` when no row is labelled ``'IMPAYE'``.
    """
    amount_col = 8
    amounts = data[:, amount_col]
    labels = data[:, -1]

    unpaid_total = 0
    # Accumulate row by row, in row order.
    for amount, label in zip(amounts, labels):
        if label == 'IMPAYE':
            unpaid_total += amount

    return unpaid_total

#Compute our overall modified metric
def overall_modified_metric(data_inf, data_sup):
    """Return the size-weighted ``modified_metric`` of a two-way split.

    Each part contributes ``modified_metric(part)`` weighted by its share of
    the total number of rows.
    """
    total_rows = len(data_inf) + len(data_sup)

    weighted_parts = [
        (len(part) / total_rows) * modified_metric(part)
        for part in (data_inf, data_sup)
    ]

    return weighted_parts[0] + weighted_parts[1]

# Worker function for the (currently disabled) parallel split search
def computing_best_column(data, colum_index, value):
    """Score one candidate split and record it in module globals if it wins.

    The split is scored with ``overall_modified_metric``. When the score is
    lower than or equal to the global ``overall_metric_value``, the globals
    ``overall_metric_value``, ``best_split_column`` and ``best_split_value``
    are overwritten with this candidate.

    NOTE(review): ``overall_metric_value`` must already exist at module level
    before the first call; no initialisation is visible in this file --
    confirm before re-enabling the parallel search.
    """
    global best_split_column, best_split_value, overall_metric_value

    below, above = split_data(data, colum_index, value)
    candidate_score = overall_modified_metric(below, above)

    # Nothing to record unless the candidate is at least as good.
    if not candidate_score <= overall_metric_value:
        return

    print("cur,met, val, col, : ({}, {},{},{})".format(candidate_score, overall_metric_value, colum_index, value))
    overall_metric_value = candidate_score
    best_split_column = colum_index
    best_split_value = value

#determine best split attribute and value
def determine_best_split(data, potential_splits):
    """Find the split with the lowest weighted impurity.

    Every ``(column, value)`` pair in ``potential_splits`` is tried with
    ``split_data`` and scored with ``overall_metric``. On ties the candidate
    examined last wins, because the comparison is ``<=``.

    Args:
        data: 2-D array of rows to split.
        potential_splits: dict mapping column index to an iterable of
            candidate values, as produced by ``get_splits``.

    Returns:
        Tuple ``(best_split_column, best_split_value, overall_metric_value)``.

    Raises:
        ValueError: if no candidate could be selected (no candidates at all,
            or no candidate produced a comparable score).
    """
    best_split_column = None
    best_split_value = None
    # Any real score compares lower than infinity.
    overall_metric_value = float("inf")

    for colum_index in potential_splits:
        for value in potential_splits[colum_index]:
            data_inf, data_sup = split_data(data, colum_index, value)
            current_overall = overall_metric(data_inf, data_sup)
            if current_overall <= overall_metric_value:
                print("cur,met, val, col, : ({}, {},{},{})".format(current_overall, overall_metric_value, colum_index, value))
                overall_metric_value = current_overall
                best_split_column = colum_index
                best_split_value = value

    if best_split_column is None:
        # Report the real problem instead of failing on an unbound local.
        raise ValueError("no candidate split available for this data")

    print("Final Done !! bests: ({}, {}, {})".format(best_split_column, best_split_value, overall_metric_value))
    return best_split_column, best_split_value, overall_metric_value

#building decision Tree
def decision_tree(df, counter=0, min_samples=5, max_depth=5, metric="entropy"):
    """Recursively build a binary decision tree.

    On the top-level call (``counter == 0``) ``df`` is a pandas DataFrame
    whose last column is the class label; the module globals
    ``COLUMNS_NAMES``, ``FEATURE_TYPES`` and ``METRIC`` are (re)initialised
    from it. On recursive calls ``df`` is the 2-D array of rows of the
    current node.

    Args:
        df: DataFrame (top level) or 2-D array (recursive calls).
        counter: current depth; callers should leave it at 0.
        min_samples: nodes with fewer rows than this become leaves.
        max_depth: depth at which nodes become leaves.
        metric: ``"entropy"`` or ``"gini"``; stored in ``METRIC``.

    Returns:
        A class label for a leaf, or a dict
        ``{"<feature> <op> <value> <n_rows> <metric>": [yes_subtree, no_subtree]}``
        where ``<op>`` is ``<=`` for continuous features and ``==`` otherwise.
    """
    global COLUMNS_NAMES, FEATURE_TYPES, METRIC

    if counter == 0:
        data = df.values
        COLUMNS_NAMES = df.columns[:-1]
        FEATURE_TYPES = determine_feature_types(df)
        METRIC = metric
    else:
        data = df

    # Base case: pure node, too few rows, or maximum depth reached.
    if check_purity(data) or (len(data) < min_samples) or (counter == max_depth):
        return classify(data)

    counter += 1

    potential_splits = get_splits(data)
    best_split_column, best_split_value, overall_metric_value = determine_best_split(data, potential_splits)
    data_inf, data_sup = split_data(data, best_split_column, best_split_value)

    # A split that leaves one side empty separates nothing, and recursing into
    # the empty side would make classify() take the argmax of an empty array.
    # Turn the node into a leaf instead.
    if len(data_inf) == 0 or len(data_sup) == 0:
        return classify(data)

    # Build the node key: the question followed by row count and metric value.
    feature_type = FEATURE_TYPES[best_split_column]
    if feature_type == "Continous":
        question = "{} <= {}".format(COLUMNS_NAMES[best_split_column], best_split_value)
    else:
        question = "{} == {}".format(COLUMNS_NAMES[best_split_column], best_split_value)
    labels = " " + str(len(data_inf) + len(data_sup)) + " " + str(overall_metric_value)
    question += labels
    sub_tree = {question: []}
    print(question)

    # "Yes" branch first, then "no" branch.
    yes_answer = decision_tree(data_inf, counter, min_samples, max_depth, metric=METRIC)
    no_answer = decision_tree(data_sup, counter, min_samples, max_depth, metric=METRIC)

    if yes_answer == no_answer:
        # Both branches agree, so the question is redundant: collapse the node.
        print(yes_answer)
        sub_tree = yes_answer
    else:
        sub_tree[question].append(yes_answer)
        sub_tree[question].append(no_answer)

    return sub_tree
        
#classify a sample
def _parse_question(key):
    """Split a tree node key into ``(feature, operator, value)``.

    A key looks like ``"<feature> <op> <value> <n_rows> <metric>"``. The two
    trailing annotations are stripped first, then the text is cut at the first
    `` <= `` or `` == `` so that feature names and values containing spaces
    are kept intact.
    """
    condition = key.rsplit(" ", 2)[0]

    hits = []
    for separator in (" <= ", " == "):
        position = condition.find(separator)
        if position != -1:
            hits.append((position, separator))
    if not hits:
        raise ValueError("malformed tree node key: {!r}".format(key))

    position, separator = min(hits)
    feature = condition[:position]
    value = condition[position + len(separator):]
    return feature, separator.strip(), value


def classify_sample(sample, tree):
    """Predict the class of one sample by walking down ``tree``.

    Args:
        sample: mapping-like object indexed by feature name (a dict or a
            DataFrame row).
        tree: a leaf label, or a nested dict as built by ``decision_tree``.

    Returns:
        The label stored in the leaf that is reached.

    Raises:
        ValueError: if a node key contains neither `` <= `` nor `` == ``.
    """
    node = tree
    while isinstance(node, dict):
        key = next(iter(node))
        feature, comp_op, value = _parse_question(key)

        if comp_op == "<=":
            passed = sample[feature] <= float(value)
        else:
            # Categorical values are compared through their string form.
            passed = str(sample[feature]) == value

        node = node[key][0] if passed else node[key][1]

    return node

#compute accuracy
def my_accuracy(df, tree):
    """Classify every row of ``df`` with ``tree`` and measure the accuracy.

    Two columns are added to ``df`` in place: ``classification`` (the
    predicted label) and ``classification_correct`` (whether it matches the
    ``ENIMPAYEOUPAS`` column).

    Returns:
        Tuple ``(accuracy, df)`` where ``accuracy`` is the mean of
        ``classification_correct`` and ``df`` is the same, mutated, frame.
    """
    predicted = df.apply(classify_sample, axis=1, args=(tree,))
    df["classification"] = predicted
    df["classification_correct"] = df["classification"] == df["ENIMPAYEOUPAS"]

    return df["classification_correct"].mean(), df
    

# To handle non-continuous values,
# we must identify the type of every feature in the dataset
def determine_feature_types(data):
    """Classify each column of ``data`` as categorical or continuous.

    A column is "Categorical" when its first unique value is a ``str``;
    otherwise it is "Continous" (spelling kept: other functions compare
    against this exact string).

    Returns:
        List with one type name per column, in column order.
    """
    def kind_of(column_name):
        first_value = data[column_name].unique()[0]
        return "Categorical" if isinstance(first_value, str) else "Continous"

    return [kind_of(name) for name in data.columns]

#Bagging process
def bagging(train, test, metric, n_estimators, sample_size, min_samples=60, max_depth=5):
    """Train a bagged ensemble of decision trees and vote on the test set.

    Each estimator is trained on a bootstrap sample (drawn with replacement)
    of ``round(len(train) * sample_size)`` rows. The final prediction of a
    test row is the most frequent label across estimators.

    Args:
        train: training DataFrame whose last column is the class label.
        test: testing DataFrame; it must contain an ``ENIMPAYEOUPAS`` column.
            It is not modified.
        metric: impurity metric passed to ``decision_tree``.
        n_estimators: number of trees to train.
        sample_size: bootstrap sample size as a fraction of ``len(train)``.
        min_samples: ``min_samples`` forwarded to ``decision_tree``.
        max_depth: ``max_depth`` forwarded to ``decision_tree``.

    Returns:
        Tuple ``(decision_trees, test_final)``. ``decision_trees`` maps the
        estimator index to ``[tree, accuracy, classified_test_copy]``;
        ``test_final`` is a copy of ``test`` with the voted ``classification``
        and ``classification_correct`` columns.
    """
    decision_trees = dict()
    size = round(len(train) * sample_size)
    test_final = test.copy()
    votes = {}

    for i in range(n_estimators):
        # Bootstrap sample for this estimator.
        train_sample = train.sample(size, replace=True)
        my_tree = decision_tree(train_sample, min_samples=min_samples, max_depth=max_depth, metric=metric)

        # my_accuracy() writes its result columns into the frame it receives,
        # so each estimator gets its own copy. Sharing one frame would leave
        # every stored result pointing at the last estimator's predictions.
        acc, df = my_accuracy(test.copy(), my_tree)
        decision_trees[i] = [my_tree, acc, df]
        votes["estimator_class_" + str(i)] = df.classification

    # One column of predictions per estimator, built in a single step.
    df_classes = pd.DataFrame(votes)

    print(test_final.columns)
    # Majority vote; mode() lists tied labels in sorted order and column 0
    # keeps the first of them.
    test_final["classification"] = df_classes.mode(axis=1)[0]
    test_final["classification_correct"] = test_final.classification == test_final.ENIMPAYEOUPAS

    return decision_trees, test_final

def compute_metrics(df):
    """Compute evaluation metrics for a classified test set.

    ``'PAYE'`` is treated as the positive class. ``df`` must contain the
    columns ``ENIMPAYEOUPAS`` (true label), ``classification`` (predicted
    label), ``classification_correct``, ``Montant``, ``Taux`` and
    ``Nbreech.``.

    Returns:
        ``[(accuracy, precision, recall, error), (P, M), (TP, FP, FN, TN)]``
        where

        * ``error`` is ``FP / (TP + FP) + FN / (FN + TN)``;
        * ``P`` is the sum of ``Montant`` over false positives (true
          ``'IMPAYE'`` predicted ``'PAYE'``);
        * ``M`` is the sum of ``Montant * Taux / 100 * Nbreech. / 12`` over
          false negatives (true ``'PAYE'`` predicted ``'IMPAYE'``).

        A ratio whose denominator is zero is reported as ``nan``.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios become nan instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else float("nan")

    actual = df["ENIMPAYEOUPAS"]
    predicted = df["classification"]

    # 1- Accuracy
    accuracy = df["classification_correct"].mean()

    # 2- Confusion matrix, precision and recall
    true_pos = (actual == 'PAYE') & (predicted == 'PAYE')
    false_pos = (actual == 'IMPAYE') & (predicted == 'PAYE')
    false_neg = (actual == 'PAYE') & (predicted == 'IMPAYE')
    true_neg = (actual == 'IMPAYE') & (predicted == 'IMPAYE')

    TP = int(true_pos.sum())
    FP = int(false_pos.sum())
    FN = int(false_neg.sum())
    TN = int(true_neg.sum())

    Precision = _ratio(TP, TP + FP)
    Recall = _ratio(TP, TP + FN)

    # 3- Amounts attached to each kind of error
    P = df.loc[false_pos, "Montant"].sum()

    no_win = df.loc[false_neg, ["Montant", "Taux", "Nbreech."]]
    M = (no_win["Montant"] * (no_win["Taux"] / 100) * no_win["Nbreech."] / 12).sum()

    # 4- Error: the second denominator is (FN + TN), parenthesised explicitly.
    error = _ratio(FP, TP + FP) + _ratio(FN, FN + TN)

    return [(accuracy, Precision, Recall, error), (P, M), (TP, FP, FN, TN)]

In [4]:
# Load the raw dataset. ``index`` is a ``to_excel`` option, not a
# ``read_excel`` one, so it is not passed here.
datas = pd.read_excel('New_datas_first.xlsx')

# Cast the categorical columns (and the label) to str: determine_feature_types
# marks a column as categorical only when its first unique value is a str.
cols = ['Type', 'Fonction', 'Civilité', 'Sit.Matrim', 'Paysresidence', 'Dept',
       'Interd.Chq?', 'CODEAGENCE', 'Motif',
       'ENIMPAYEOUPAS']
for col in cols:
    datas[col] = datas[col].astype(str)

In [None]:
#Execution
# Fix the seed of the ``random`` module, which train_test_split uses to draw
# the test rows.
# NOTE(review): bagging draws its bootstrap samples with DataFrame.sample and
# no random_state, so this seed presumably does not make the trees
# reproducible -- confirm.
random.seed(30)
#datas = pd.read_excel('New_datas_first.xlsx', index=False)
# Hold out 30% of the rows for testing.
train, test = train_test_split(datas, 0.3)
# 100 entropy-based trees, each trained on a bootstrap sample of 80% of train.
dt, df = bagging(train, test,"entropy", 100, 0.8)

# Save the classified test set and compute the evaluation metrics
df.to_excel('classic_entropy.xlsx', index=False)
compute_metrics(df)

In [15]:
#Computing new matrix confusion values
# P: total amount of the loans that were truly 'IMPAYE' but predicted 'PAYE'.
P = df.Montant[(df.ENIMPAYEOUPAS == 'IMPAYE') & (df.classification == 'PAYE')].sum()

# Loans that were truly 'PAYE' but predicted 'IMPAYE'. An explicit copy is
# taken so the in-place edits below act on an independent frame.
df_no_win = df.loc[
    (df.ENIMPAYEOUPAS == 'PAYE') & (df.classification == 'IMPAYE'),
    ["Montant", "Taux", "Nbreech."],
].copy()
df_no_win['Taux'] /= 100
df_no_win["NoWin"] = df_no_win.Montant * df_no_win.Taux * df_no_win['Nbreech.']/12

# M: sum of Montant * Taux/100 * Nbreech./12 over those loans.
M = df_no_win.NoWin.sum()
P, M

(330987679.75, 19376215.741666667)

In [16]:
# Confusion-matrix counts, with 'PAYE' as the positive class.
TP = len(df[(df.ENIMPAYEOUPAS == 'PAYE') & (df.classification == 'PAYE')])
FP = len(df[(df.ENIMPAYEOUPAS == 'IMPAYE') & (df.classification == 'PAYE')])
FN = len(df[(df.ENIMPAYEOUPAS == 'PAYE') & (df.classification == 'IMPAYE')])

# Precision and recall of the 'PAYE' class; the last line displays both.
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
Precision, Recall

(0.9873612823674476, 0.9804101622283441)

In [17]:
# Save the classified test set (without the index column), then display the
# confusion-matrix counts computed earlier.
df.to_excel('classic_entropy.xlsx', index=False)
TP, FP, FN

(6406, 82, 128)

In [37]:
# Save the current results frame under results/ (without the index column).
df.to_excel('results/classic_entropy_veh.xlsx', index=False)

In [48]:
# Reload previously saved results. ``index`` is a ``to_excel`` option, not a
# ``read_excel`` one, so it is not passed here.
df = pd.read_excel('results/dataset_2/DT1- Entropy.xlsx')

In [49]:
#Computing new matrix confusion values
# Confusion-matrix amounts for the second dataset.

# Loans that were truly "IMPAYE" but predicted "PAYE". An explicit copy keeps
# this selection independent from ``df``.
df_lost = df.loc[
    (df.STATUT == "IMPAYE") & (df.classification == "PAYE"),
    ["montant", "durée_en_mois"],
].copy()
# Index-aligned assignment: rows outside the selection receive NaN.
df['no_lost'] = df_lost.montant

# P: total amount of those loans (NaN entries are skipped by sum()).
P = df.no_lost.sum()

# R: total amount of the loans that were truly "PAYE" but predicted "IMPAYE".
R = df.montant[(df.STATUT == "PAYE") & (df.classification == "IMPAYE")].sum()
df_no_win = df.loc[
    (df.STATUT == "PAYE") & (df.classification == "IMPAYE"),
    ["montant", "taux", "durée_en_mois"],
].copy()
df_no_win['taux'] /= 100
df_no_win["NoWin"] = df_no_win.montant * df_no_win.taux

# M: sum of montant * taux/100 over those same loans.
M = df_no_win.NoWin.sum()
P, M, R

(4515654557.0, 23109586.49, 159569562.0)

In [27]:
# Somes others helpers values
def compute_years(value):
    """Convert a duration such as ``"1yrs 11mon"`` into a number of months.

    Each whitespace-separated token must start with an integer. When the
    character following the digits is ``'y'`` the number counts years and is
    multiplied by 12; otherwise it is added as is. Despite the function name,
    the result is expressed in months.

    Args:
        value: duration string made of ``<digits><unit>`` tokens.

    Returns:
        Total duration in months (``0`` for an empty string).

    Raises:
        ValueError: if a token does not start with a digit.
    """
    total_months = 0

    for word in value.split():
        # Read the whole leading run of digits, not just the first character.
        digit_count = 0
        while digit_count < len(word) and word[digit_count].isdigit():
            digit_count += 1
        if digit_count == 0:
            raise ValueError("expected a token starting with digits, got {!r}".format(word))

        number = int(word[:digit_count])

        # The unit starts right after the digits.
        if word[digit_count:digit_count + 1] == 'y':
            number *= 12

        total_months += number

    return total_months

#Removing spaces
def remove_space(words):
    """Return ``words`` with every run of whitespace removed.

    The resulting string is also printed to stdout.
    """
    compact = ''.join(words.split())
    print(compact)
    return compact


#renaming columns
def process_file(data):
    """Strip all whitespace from the values of columns 1 to 20 of ``data``.

    Each selected column is rewritten in place by applying ``remove_space``
    to every value. The same (mutated) frame is returned.
    """
    target_columns = data.columns.tolist()[1:21]

    for name in target_columns:
        data[name] = data[name].apply(remove_space)

    return data

def process_data(train):
    """Cast a fixed list of columns of ``train`` to ``str``, in place.

    Every listed column must be present in ``train``. The same (mutated)
    frame is returned.
    """
    columns_as_text = (
        'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS',
        'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
        'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES',
        'branch_id', 'manufacturer_id', 'Date.of.Birth',
        'Employment.Type', 'State_ID', 'MobileNo_Avl_Flag', 'Aadhar_flag',
        'PERFORM_CNS.SCORE', 'PERFORM_CNS.SCORE.DESCRIPTION', 'PRI.NO.OF.ACCTS',
        'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS',
    )

    for name in columns_as_text:
        train[name] = train[name].astype(str)

    return train

