In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb1
from xgboost import XGBClassifier

In [2]:
train_data=pd.read_csv("train_s3TEQDk.csv")
test_data=pd.read_csv("test_mSzZ8RL.csv")

In [3]:
X_train = train_data.drop(['ID','Is_Lead'], axis=1)
Y_train = train_data['Is_Lead'].values
X_test_data=test_data.drop(['ID'],axis=1)

In [4]:
X_train,X_test,y_train,y_test=train_test_split(X_train,Y_train,test_size = 0.2,random_state=42)

In [5]:
X_train['Credit_Product'].fillna('Missing',inplace=True)
X_test['Credit_Product'].fillna('Missing',inplace=True)
X_test_data['Credit_Product'].fillna('Missing',inplace=True)

In [6]:
#Doing Label Encoding to our Categeorical features for train_data
label_encoder=LabelEncoder()
X_train['Gender']=label_encoder.fit_transform(X_train['Gender'])
X_test['Gender']=label_encoder.fit_transform(X_test['Gender'])
X_test_data['Gender']=label_encoder.fit_transform(X_test_data['Gender'])

X_train['Region_Code']=label_encoder.fit_transform(X_train['Region_Code'])
X_test['Region_Code']=label_encoder.fit_transform(X_test['Region_Code'])
X_test_data['Region_Code']=label_encoder.fit_transform(X_test_data['Region_Code'])

X_train['Occupation']=label_encoder.fit_transform(X_train['Occupation'])
X_test['Occupation']=label_encoder.fit_transform(X_test['Occupation'])
X_test_data['Occupation']=label_encoder.fit_transform(X_test_data['Occupation'])

X_train['Channel_Code']=label_encoder.fit_transform(X_train['Channel_Code'])
X_test['Channel_Code']=label_encoder.fit_transform(X_test['Channel_Code'])
X_test_data['Channel_Code']=label_encoder.fit_transform(X_test_data['Channel_Code'])

X_train['Credit_Product']=label_encoder.fit_transform(X_train['Credit_Product'])
X_test['Credit_Product']=label_encoder.fit_transform(X_test['Credit_Product'])
X_test_data['Credit_Product']=label_encoder.fit_transform(X_test_data['Credit_Product'])

X_train['Is_Active']=label_encoder.fit_transform(X_train['Is_Active'])
X_test['Is_Active']=label_encoder.fit_transform(X_test['Is_Active'])
X_test_data['Is_Active']=label_encoder.fit_transform(X_test_data['Is_Active'])

In [7]:
scaling=MinMaxScaler()
X_train[['Age','Vintage','Avg_Account_Balance']]=scaling.fit_transform(X_train[['Age','Vintage','Avg_Account_Balance']])
X_test[['Age','Vintage','Avg_Account_Balance']]=scaling.fit_transform(X_test[['Age','Vintage','Avg_Account_Balance']])
X_test_data[['Age','Vintage','Avg_Account_Balance']]=scaling.fit_transform(X_test_data[['Age','Vintage','Avg_Account_Balance']])

#### a.LogisticRegression

In [None]:
model1 = LogisticRegression()
model1.fit(X_train, y_train)
pred_prob1 = model1.predict_proba(X_test)
auc_score1 = roc_auc_score(y_test, pred_prob1[:,1])
print(auc_score1)

0.7256740410848981


In [None]:
predictions_on_test_data=model1.predict(X_test_data)
submission=pd.read_csv("sample_submission_eyYijxG.csv")
submission['Is_Lead']=predictions_on_test_data
submission.to_csv('submissions_hackathon/LE_mms_logitic_regression_submission.csv',index=False)

#### b.Random Forest Classifier

In [None]:
alpha=[10,50,100,500,1000,2000]
cv_log_error_array=[]
for i in alpha:
    model2=RandomForestClassifier(n_estimators=i,random_state=24,n_jobs=-1)
    model2.fit(X_train,y_train)
    pred_prob2 = model2.predict_proba(X_test)
    auc_score2 = roc_auc_score(y_test, pred_prob2[:,1])
    print(i)
    print("Roc_score=",auc_score2)

10
Roc_score= 0.8335547550267097
50
Roc_score= 0.8495813765275008
100
Roc_score= 0.8522954765343789
500
Roc_score= 0.8543893516289519
1000
Roc_score= 0.854936619895912
2000
Roc_score= 0.8550563587591994


In [None]:
model2 = RandomForestClassifier(n_estimators=2000)
model2.fit(X_train, y_train)
pred_prob2 = model2.predict_proba(X_test)
auc_score2 = roc_auc_score(y_test, pred_prob2[:,1])
print(auc_score2)

0.8547065352958708


In [None]:
predictions_on_test_data2=model2.predict(X_test_data)
submission=pd.read_csv("sample_submission_eyYijxG.csv")
submission['Is_Lead']=predictions_on_test_data2
submission.to_csv('submissions_hackathon/LE_mms_Random_Forest_submission.csv',index=False)

#### XGBoost

In [None]:
model3 =XGBClassifier(subsample=1,n_estimators=1000,min_child_weight=7,max_depth=5,learning_rate=0.01,
                      colsample_bytree=1)
model3.fit(X_train,y_train)
pred_prob3 = model3.predict_proba(X_test)
auc_score3 = roc_auc_score(y_test, pred_prob3[:,1])
print(auc_score3)

0.8714234782309651


In [None]:
predictions_on_test_data2=model3.predict(X_test_data)
submission=pd.read_csv("sample_submission_eyYijxG.csv")
submission['Is_Lead']=predictions_on_test_data2
submission.to_csv('submissions_hackathon/XGB_MMS_LE_submission.csv',index=False)

#### MLP

In [8]:
import tensorflow as tf 
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import PReLU,Dropout
from tensorflow.keras import Sequential
from keras.callbacks import Callback

In [9]:
model = Sequential()
model.add(Dense(400, input_dim = X_train.shape[1], kernel_initializer = 'he_normal'))
model.add(PReLU())
model.add(BatchNormalization())
model.add(Dropout(0.4))
model.add(Dense(200, kernel_initializer='he_normal'))
model.add(PReLU())
model.add(BatchNormalization())    
model.add(Dropout(0.2))
model.add(Dense(50, kernel_initializer = 'he_normal'))
model.add(PReLU())
model.add(BatchNormalization())    
model.add(Dropout(0.2))
model.add(Dense(1, kernel_initializer = 'he_normal'))
model.compile(optimizer='adam',loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),metrics=[tf.keras.metrics.AUC(from_logits=True)])

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 400)               4000      
_________________________________________________________________
p_re_lu (PReLU)              (None, 400)               400       
_________________________________________________________________
batch_normalization (BatchNo (None, 400)               1600      
_________________________________________________________________
dropout (Dropout)            (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               80200     
_________________________________________________________________
p_re_lu_1 (PReLU)            (None, 200)               200       
_________________________________________________________________
batch_normalization_1 (Batch (None, 200)               8

In [11]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f2be0100090>

In [13]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2b90618ed0>

In [14]:
predictions_on_test_data3=model.predict(X_test_data)
submission=pd.read_csv("sample_submission_eyYijxG.csv")
submission['Is_Lead']=predictions_on_test_data3
submission.to_csv('MLP2_MMS_LE_submission.csv',index=False)

In [None]:
def auroc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=[auroc])