In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases removed the old names, so use whichever name this installation
# actually provides instead of hard-coding the legacy one.
bright_style = 'seaborn-bright' if 'seaborn-bright' in plt.style.available else 'seaborn-v0_8-bright'
plt.style.use(bright_style)
# print(plt.style.available)

In [None]:
# Load the already pre-processed training set and preview its first rows.
train_csv_path = 'data/train_processed_2.csv'
df = pd.read_csv(train_csv_path)
df.head()

## Split Training Data into Train and CV

In [None]:
# Every column except the last is a feature; the last column is the target,
# cast to int so it can be used as a class label.
x, y = df.iloc[:, :-1], df.iloc[:, -1].astype(int)
for part in (x, y):
    print(part.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows as a cross-validation (CV) set. `stratify=y`
# keeps the class ratio the same in both parts and `random_state` makes the
# split reproducible.
trainx, cvx, trainy, cvy = train_test_split(x, y, test_size=0.1, random_state=42, stratify=y)

# Class proportions (in percent) for the full data and for each part of the split.
# The held-out part is the CV set, so it is labelled as such (not "Test").
print('All:', np.bincount(y) / len(y) * 100.0)
print('Training:', np.bincount(trainy) / len(trainy) * 100.0)
print('CV:', np.bincount(cvy) / len(cvy) * 100.0)

## Modeling: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with scikit-learn's default settings.
clf = LogisticRegression()
clf.fit(trainx, trainy)

# `score` reports mean accuracy; 1.0 would be a perfect score.
print('Classification Score on trainx/y :', clf.score(trainx, trainy))
print('Classification Score on cvx/y :', clf.score(cvx, cvy))

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, auc

# predict_proba returns one column per class, ordered as clf.classes_. With 0/1
# integer labels, column 1 is P(class 1).
# NOTE(review): class 1 is presumably "policy renewed" -- confirm against the data dictionary.
# Column 0 (P(class 0)) is deliberately not used: scoring with it would require
# flipping the curve to get the AUC for class 1.
trainy_prob = clf.predict_proba(trainx)[:, 1]
fpr, tpr, thresholds = roc_curve(trainy, trainy_prob, pos_label=1)
roc_auc = auc(fpr, tpr)

# Same computation on the held-out cross-validation split.
cvy_prob = clf.predict_proba(cvx)[:, 1]
fpr_cv, tpr_cv, threshold_cv = roc_curve(cvy, cvy_prob, pos_label=1)
roc_auc_cv = auc(fpr_cv, tpr_cv)

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(fpr, tpr, color='darkorange', alpha=0.5, lw=2, linestyle='-',
        label='Training ROC Curve (area = {0:.2f})'.format(roc_auc))
# This curve is drawn from the CV split (cvx/cvy), so label it as CV rather than "Testing".
ax.plot(fpr_cv, tpr_cv, color='deeppink', alpha=0.5, lw=2, linestyle='-',
        label='CV ROC Curve (area = {0:.2f})'.format(roc_auc_cv))
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()

In [None]:
# AUC is computed from probabilities (scores), not from hard class predictions.
# This is the training AUC taken directly from the labels and the class-1
# probabilities; it should agree with `roc_auc` from the ROC cell.
roc_auc_score(trainy, trainy_prob)

## Preliminary Pre-processing of TEST dataset


In [None]:
# Load the raw test set and show its (rows, columns) shape.
test_df = pd.read_csv('data/test.csv')
test_df.shape

In [None]:
# Preview the first rows of the raw test set.
test_df.head()

In [None]:
# Lower-case the four capitalised headers so they match the column names used
# in the feature selection further down.
capitalised_headers = (
    'Income',
    'Count_3-6_months_late',
    'Count_6-12_months_late',
    'Count_more_than_12_months_late',
)
test_df = test_df.rename(columns={header: header.lower() for header in capitalised_headers})
test_df.head(5)

In [None]:
# Keep the ids aside: they are not a model feature, but they are required for
# the final submission.
iddf = test_df['id']
print(iddf.shape)
# Drop the id column by name rather than by position, so this cell does not
# depend on 'id' being the first column of the CSV.
test_df = test_df.drop(columns=['id'])
test_df.head()

In [None]:
# Convert age from days to whole years (365 days per year, fraction truncated),
# then discard the original day-based column.
age_in_years = test_df['age_in_days'].div(365).astype(int)
test_df['age_in_yrs'] = age_in_years
test_df.drop(columns=['age_in_days'], inplace=True)
test_df.head()

In [None]:
# Missing values in the three late-payment counters are replaced with 0.
late_count_columns = [
    'count_3-6_months_late',
    'count_6-12_months_late',
    'count_more_than_12_months_late',
]

print(test_df.isnull().sum())  # missing values per column, before filling
late_counts_filled = test_df[late_count_columns].fillna(0)
test_df.update(late_counts_filled)  # write the filled values back in place
print(test_df.isnull().sum())  # ... and after filling

In [None]:
# Replace missing underwriting scores with 99.89, which the original author
# notes is the mode of this column in the test set.
underwriting_score_fill = 99.89
filled_scores = test_df['application_underwriting_score'].fillna(underwriting_score_fill)
test_df.update(filled_scores)
test_df.isnull().sum()

## More Pre-processing of TEST dataset


In [None]:
# Natural-log transform of income, kept to two decimals.
log_income = test_df['income'].apply(np.log)
test_df['income'] = log_income.round(2)

In [None]:
from sklearn import preprocessing
data_scalar = preprocessing.MinMaxScaler()

# Min-max scale each column independently (the scaler is re-fit per column) and
# round to the listed number of decimals.
# NOTE(review): the scaler is fit on the test set itself; the training features were
# presumably scaled with the training min/max in another notebook -- confirm the two agree.
for column, decimals in (
    ('age_in_yrs', 3),
    ('application_underwriting_score', 3),
    ('no_of_premiums_paid', 2),
):
    scaled = data_scalar.fit_transform(test_df[[column]])
    test_df[column] = scaled.flatten().round(decimals)

# Premium gets a natural-log transform rather than min-max scaling.
log_premium = test_df['premium'].apply(np.log)
test_df['premium'] = log_premium.round(4)

In [None]:
# `categorize` is a project helper defined in util.py, in this project directory.
from util import categorize

# Encode each categorical column with the helper and append the resulting
# columns to the frame, then drop the original categorical columns.
categorical_columns = ['sourcing_channel', 'residence_area_type']
for column in categorical_columns:
    tmp_df = categorize(test_df[[column]])
    test_df = test_df.join(tmp_df)

test_df.drop(['sourcing_channel', 'residence_area_type'], axis=1, inplace=True)

In [None]:
# Preview the frame after the categorical columns were encoded.
test_df.head()

In [None]:
# Keep only the model's feature columns, in a fixed order.
# NOTE(review): this order presumably mirrors the training feature columns -- confirm against trainx.columns.
numeric_features = [
    'age_in_yrs',
    'income',
    'application_underwriting_score',
    'premium',
    'perc_premium_paid_by_cash_credit',
    'no_of_premiums_paid',
]
late_count_features = [
    'count_3-6_months_late',
    'count_6-12_months_late',
    'count_more_than_12_months_late',
]
sourcing_channel_features = [
    'sourcing_channel_A',
    'sourcing_channel_B',
    'sourcing_channel_C',
    'sourcing_channel_D',
    'sourcing_channel_E',
]
residence_features = ['residence_area_type_Rural', 'residence_area_type_Urban']

feature_columns = numeric_features + late_count_features + sourcing_channel_features + residence_features
test_df = test_df[feature_columns]

In [None]:
# Preview the final, reordered feature frame.
test_df.head()

In [None]:
# Every column kept in test_df is a feature (no target column was selected),
# so the whole frame is used as the model input.
testx = test_df
print(testx.shape)

## Summary
The following pre-processing activities are done as part of this deliverable/notebook:
1. Preprocess by applying np.log on the following columns:
    1.1 income
    1.2 premium
2. Pre-process with MinMaxScaler() the following columns: 
    2.1 age_in_yrs, 
    2.2 application_underwriting_score
    2.3 no_of_premiums_paid
3. Pre-process Feature Categories with LabelEncoder and OneHotEncoder on the following columns: 
    3.1 sourcing_channel, 
    3.2 residence_area_type

## Part A

The base probability of receiving a premium on a policy without considering any incentive.

The probabilities predicted by the participants would be evaluated using AUC ROC score.

In [None]:
# testy_prob = clf.predict_proba(testx)[:,1] # Get just the probability of renewal (class in 1st-column)
# fpr_test, tpr_test, threshold_test = roc_curve(??, testy_prob, pos_label=1)
# roc_auc_test = auc(fpr_test, tpr_test)

In [None]:
# Hard class predictions for the test set.
testy = clf.predict(testx)

# Part A asks for probabilities. predict_proba returns one column per class,
# ordered as clf.classes_; column 1 is the probability of class 1
# (presumably "renewal" -- confirm against the data dictionary).
# The probabilities are computed here so the cell runs on a fresh kernel; testy_prob
# was previously read before being defined anywhere. Re-using testx's index
# keeps the rows aligned with the ids saved in `iddf`.
testy_prob = pd.DataFrame(clf.predict_proba(testx), index=testx.index)[1].round(2)

In [None]:
# Put the saved ids and the rounded probabilities side by side, one row per policy.
submission_parts = [iddf.to_frame(), testy_prob]
finaldf = pd.concat(submission_parts, axis=1)
finaldf.head()

## Part  B

The monthly incentives you will provide on each policy to maximize the net revenue based on the provided formulae in the problem statement

In [None]:
# ??