In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Training data

In [2]:
data = pd.read_csv('aps_failure_training_set.csv')

In [3]:
data.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [4]:
data.shape

(60000, 171)

In [5]:
data.dropna(inplace=True)

In [6]:
for i in data.columns[2:]:
    data[i].replace('na',0,inplace=True)

In [7]:
for i in data.columns[1:]:
    data[i]=data[i].astype('float')

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [9]:
X_train = scaler.fit_transform(data.drop('class',axis=1))

In [10]:
data['class'] = data['class'].map({'neg':'0','pos':'1'})
data['class'] = data['class'].dropna()
data['class'] = data['class'].astype('int')

In [11]:
y_train = data['class']

Test data

In [12]:
test_data = pd.read_csv('app_failure_test.csv')

In [13]:
test_data.head(2)

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,60,0,20,12,0,0,0,0,0,...,1098,138,412,654,78,88,0,0,0,0
1,neg,82,0,68,40,0,0,0,0,0,...,1068,276,1620,116,86,462,0,0,0,0


In [14]:
test_data.shape

(16000, 171)

In [15]:
test_data.dropna(inplace=True)

In [16]:
for i in test_data.columns[2:]:
    test_data[i].replace('na',0,inplace=True)

In [17]:
for i in test_data.columns[1:]:
    test_data[i]=test_data[i].astype('float')

In [18]:
X_test = scaler.transform(test_data.drop('class',axis=1))

In [19]:
test_data['class'] = test_data['class'].map({'neg':'0','pos':'1'})
test_data['class'] = test_data['class'].dropna()
test_data['class'] = test_data['class'].astype('int')

In [20]:
y_test = test_data['class']

Shape of our train and test data

In [21]:
X_train.shape

(60000, 170)

In [22]:
y_train.shape

(60000,)

In [23]:
X_test.shape

(16000, 170)

In [24]:
y_test.shape

(16000,)

In [25]:
import seaborn as sns
import matplotlib.pyplot as plt

# Logistic regression 

In [26]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression()

In [27]:
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,roc_curve

In [28]:
logistic_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [29]:
y_predict_log = logistic_model.predict(X_test)

In [30]:
accuracy_score(y_test,y_predict_log)

0.9898125

Hyperparamter tuning

In [35]:
from sklearn.model_selection import RandomizedSearchCV

In [32]:
params = {
    'penalty' :['l1', 'elasticnet'],
    'max_iter': [120,130],
    'solver':['lbfgs', 'liblinear'],
    'verbose':range(3,5)
    
}

In [33]:
log_model_bp = RandomizedSearchCV(logistic_model,params,random_state=7,n_jobs=-1)

In [None]:
log_model_bp.fit(X_train,y_train)

In [None]:
log_model_bp.best_params_

In [None]:
log_model_best = LogisticRegression(verbose=  4, solver = 'liblinear' , penalty = 'l1' , max_iter = 130)

In [None]:
log_model_best.fit(X_train,y_train)

In [None]:
y_predict_best = log_model_best.predict(X_test)

In [None]:
accuracy_score(y_test,y_predict_best)

roc_auc_score

In [None]:
roc_auc_score(y_test,y_predict_best)

In [None]:
pred=[]
for model in [log_model_best]:
    pred.append(pd.Series(model.predict_proba(X_test)[:,1]))
final_prediction=pd.concat(pred,axis=1).mean(axis=1)
print('Ensemble test roc-auc: {}'.format(roc_auc_score(y_test,final_prediction)))

In [None]:
fpr , tpr  , thresholds = roc_curve(y_test,final_prediction)
thresholds

In [None]:
from sklearn.metrics import accuracy_score
accuracy_ls = []
for thres in thresholds:
    y_pred = np.where(final_prediction>thres,1,0)
    accuracy_ls.append(accuracy_score(y_test, y_pred, normalize=True))
    
accuracy_ls = pd.concat([pd.Series(thresholds), pd.Series(accuracy_ls)],
                        axis=1)
accuracy_ls.columns = ['thresholds', 'accuracy']
accuracy_ls.sort_values(by='accuracy', ascending=False, inplace=True)
accuracy_ls.head()

In [None]:
def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

In [None]:
plot_roc_curve(fpr,tpr)

# classification report

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,y_predict_best))

# SVM

In [28]:
from sklearn.svm import SVC

In [29]:
model = SVC()

In [30]:
model.fit(X_train,y_train)

SVC()

In [31]:
y_predict = model.predict(X_test)

In [32]:
accuracy_score(y_test,y_predict)

0.98375

Hyperparameter tuning

In [33]:
param_grid = {'C': [ 1, 10,], 
              'gamma': [ 0.1, 0.01, 0.001],
              'kernel': ['rbf','linear']} 

In [36]:
model_rand_svm = RandomizedSearchCV(model,param_grid,n_jobs=-1)

In [None]:
model_rand_svm.fit(X_train,y_train)

In [None]:
model_rand_svm.best_params_

In [None]:
model_best_svm = SVC(C = ,gamma= ,kernel = )

In [None]:
model_best_svm.fit(X_train,y_train)

In [None]:
y_predict_best_svm = model_best_svm.predict(X_test)

In [None]:
accuracy_score(y_test,y_predict_best_svm)

In [None]:
roc_auc_score

In [None]:
roc_auc_score(y_test,y_predict_best)

In [None]:
pred=[]
for model in [log_model_best]:
    pred.append(pd.Series(model.predict_proba(X_test)[:,1]))
final_prediction=pd.concat(pred,axis=1).mean(axis=1)
print('Ensemble test roc-auc: {}'.format(roc_auc_score(y_test,final_prediction)))

In [None]:
fpr , tpr  , thresholds = roc_curve(y_test,final_prediction)
thresholds

In [None]:
from sklearn.metrics import accuracy_score
accuracy_ls = []
for thres in thresholds:
    y_pred = np.where(final_prediction>thres,1,0)
    accuracy_ls.append(accuracy_score(y_test, y_pred, normalize=True))
    
accuracy_ls = pd.concat([pd.Series(thresholds), pd.Series(accuracy_ls)],
                        axis=1)
accuracy_ls.columns = ['thresholds', 'accuracy']
accuracy_ls.sort_values(by='accuracy', ascending=False, inplace=True)
accuracy_ls.head()

In [None]:
classification report

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test,y_predict_best))