## Problem level approach

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score as roc_auc, accuracy_score as acc
from sklearn.ensemble import RandomForestClassifier as RFC

seed = 42

**Standard classification model for the problem**

In [2]:
def rfc(train_X, train_Y, validation_X, validation_Y, seed=42):
    
    rfc = RFC(n_estimators=100, random_state=seed)
    rfc = rfc.fit(train_X, train_Y)

    print("Accuracy Score: {0:.2f}".format(acc(validation_Y, rfc.predict(validation_X))))
    print("ROC-AUC: {0:.2f}".format(roc_auc(validation_Y, rfc.predict(validation_X))))

**Read data**

In [3]:
train = pd.read_csv('./train_ZoGVYWq.csv')

print('Unbalanced ratio: {}:1'.format(int(round(len(train[train.renewal==1])/len(train[train.renewal==0])))))

Unbalanced ratio: 15:1


**Data wrangling**

In [4]:
train.columns = train.columns.str.replace('-','_')
train.fillna(train.mean().astype(int), inplace=True)

train['n_unpaid'] = train.Count_3_6_months_late +train.Count_6_12_months_late + train.Count_more_than_12_months_late

train = pd.merge(
    train[train.columns[~train.columns.isin(train.select_dtypes(object))]], 
    pd.get_dummies(train.select_dtypes(object)), 
    left_index=True, right_index=True
)

**Train and validation split**

In [5]:
train_X, validation_X, train_Y, validation_Y = train_test_split(train.drop(['renewal'], axis=1),
                                               train['renewal'],
                                               train_size = .8,
                                               )



**Scale dataset**

In [6]:
scaler = MinMaxScaler()

# the independent variable doesn't need scaling since is comprised between 0 and 1
train_Y = train_Y.values.reshape((len(train_Y),))
validation_Y = validation_Y.values.reshape((len(validation_Y),))

# scale values to range 0-1
train_X = scaler.fit_transform(train_X)

# don't `fit` the validation set, it must be replicated from the training dataset scaler
validation_X = scaler.transform(validation_X)

  return self.partial_fit(X, y)


**Results**

In [7]:
rfc(train_X, train_Y, validation_X, validation_Y)

Accuracy Score: 0.94
ROC-AUC: 0.55


**Attempt to segment the dataset**

In [10]:
# split the dataset once again
train_X, validation_X, train_Y, validation_Y = train_test_split(train.drop(['renewal'], axis=1),
                                               train['renewal'],
                                               train_size = .8,
                                               )
# determine the feature range for the predictor variables
segment = train_X.loc[list(train_Y[train_Y==0].index),:].describe()

# segment the training set based on the max-min limits
for column in segment.columns:
    train_X = train_X.loc[(train_X[column]<=segment.loc['max', column]) & (train_X[column]>=segment.loc['min', column])]

train_Y = train_Y.loc[train_X.index]

#scale data
scaler = MinMaxScaler()

train_Y = train_Y.values.reshape((len(train_Y),))
validation_Y = validation_Y.values.reshape((len(validation_Y),))

# scale values to range 0-1
train_X = scaler.fit_transform(train_X)
validation_X = scaler.transform(validation_X)

  return self.partial_fit(X, y)


**Results**

In [11]:
rfc(train_X, train_Y, validation_X, validation_Y)

Accuracy Score: 0.94
ROC-AUC: 0.56
