# **Library**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model,metrics
from sklearn.model_selection import train_test_split,RandomizedSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# **Explore Data**

In [None]:
# Read the training and test CSVs from the working directory, then print a
# summary (columns, dtypes, non-null counts) of the training frame.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
train_df.info()

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Print the frame's (rows, columns) shape, followed by the number of missing
# values in each column.
missing_per_column = train_df.isnull().sum()
print(train_df.shape)
print(missing_per_column)

In [None]:
# Discard every training row that contains at least one missing value.
train_df = train_df.dropna(axis=0, how='any')

# **Feature Engineering**

In [None]:
# Pairwise Pearson correlation between the columns of the training frame.
corr_matrix = train_df.corr(method='pearson')
corr_matrix

In [None]:
# Map each column's position in the correlation matrix to the absolute value
# of its correlation with the 'claim' column.
claim_matrix = list(corr_matrix['claim'])
claim_dict = {position: abs(value) for position, value in enumerate(claim_matrix)}
claim_dict

In [None]:
# Rank the (position, |correlation|) pairs from strongest to weakest: sort
# ascending on the correlation value, then flip the resulting list.
claim_dict = sorted(claim_dict.items(), key=lambda pair: pair[1])[::-1]
claim_dict

In [None]:
# Feature columns used for modelling, listed by their numeric suffix.
feature_ids = (34, 57, 8, 45, 21, 52, 35, 36, 47, 71, 3, 62, 95,
               32, 48, 79, 50, 31, 73, 2, 102, 23, 30, 24, 106, 46)
features = ['f' + str(num) for num in feature_ids]

# Target and design matrix come from the training frame; the same columns are
# taken from the test frame for the final predictions.
y = train_df['claim']
X = train_df[features]
X_pred = test_df[features]

# **Model**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
def _roc_scores(fitted_clf, X):
  """Return one continuous positive-class score per row of ``X``.

  Uses the second column of ``predict_proba`` when the estimator exposes it;
  estimators without ``predict_proba`` fall back to ``decision_function``.
  """
  if hasattr(fitted_clf, "predict_proba"):
    return fitted_clf.predict_proba(X)[:, 1]
  return fitted_clf.decision_function(X)


def plot_roc(clf, X_train, y_train, X_test, y_test):
  """Fit ``clf`` on the training split and plot its train and test ROC curves.

  Parameters
  ----------
  clf : estimator
      Object providing ``fit`` and either ``predict_proba`` or
      ``decision_function``. ``clf.fit`` is called on the object passed in,
      so the caller's estimator is trained as a side effect.
  X_train, y_train : training features and binary labels.
  X_test, y_test : held-out features and binary labels.

  Returns
  -------
  None
      The figure is shown with ``plt.show()``; the AUC of each split is
      reported in the legend.
  """
  from sklearn.metrics import roc_curve, roc_auc_score
  import matplotlib.pyplot as plt

  clf_model = clf.fit(X_train, y_train)

  # Same scoring and plotting steps for both splits; only data and colour differ.
  splits = (
      ('train', X_train, y_train, 'red'),
      ('test', X_test, y_test, 'Blue'),
  )
  for split_name, X_part, y_part, colour in splits:
    scores = _roc_scores(clf_model, X_part)
    fpr, tpr, _ = roc_curve(y_part, scores)
    auc_value = roc_auc_score(y_part, scores)
    plt.plot(fpr, tpr, color=colour, label=split_name + ' , auc=' + str(auc_value))

  # Diagonal reference line: the ROC curve of a random classifier.
  plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
  plt.xlabel('False positive rate')
  plt.ylabel('True positive rate')
  plt.title('ROC curve - ' + type(clf).__name__)
  plt.legend(loc=4)
  plt.show()

In [None]:
# Class-weighted logistic regression fitted with the liblinear solver.
logreg = LogisticRegression(class_weight='balanced', solver='liblinear')
plot_roc(logreg, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so the fitted model, and therefore the plotted ROC/AUC, is the
# same on every run.
dt_clf = DecisionTreeClassifier(random_state=42)
plot_roc(dt_clf, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.svm import SVC
# plot_roc calls predict_proba, which SVC only provides when probability=True;
# without it the call raises AttributeError. Enabling it makes fitting slower,
# because scikit-learn calibrates the probabilities with an internal
# cross-validation; random_state makes that calibration reproducible.
# gamma is omitted because it has no effect with a linear kernel.
svc_clf = SVC(kernel='linear', probability=True, random_state=42)
plot_roc(svc_clf, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over the 5 closest training rows.
knn_clf = KNeighborsClassifier(n_neighbors=5)
plot_roc(knn_clf, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with tree depth capped at 4 and a fixed seed. plot_roc calls
# fit on the estimator it receives, so rf_clf is trained after this cell.
rf_clf = RandomForestClassifier(random_state=0, max_depth=4)
plot_roc(rf_clf, X_train, y_train, X_test, y_test)

In [None]:
import xgboost as xgb

# Hyper-parameters for the gradient-boosted tree classifier, collected in one
# place and unpacked into the constructor.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=0.01,
    n_estimators=100,
    max_depth=3,
    gamma=0.0,
    colsample_bytree=0.6,
)
xgb_clf = xgb.XGBClassifier(**xgb_params)
plot_roc(xgb_clf, X_train, y_train, X_test, y_test)

# **Submission**

In [None]:
# Impute missing test values with each column's mean. The result is assigned
# rather than applied with inplace=True: X_pred is a column subset of test_df,
# and mutating such a subset in place makes pandas emit SettingWithCopyWarning.
X_pred = X_pred.fillna(X_pred.mean())
y_pred = rf_clf.predict(X_pred)
y_pred

In [None]:
# Pair each test row's id with its predicted label. The test frame is loaded
# as `test_df`; the previous `test_data` name was never defined and raised
# NameError.
submission = pd.DataFrame({
    "id": test_df["id"],
    "claim": y_pred
})
submission

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)