In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PowerTransformer

In [3]:
df_val = pd.read_csv(r'D:\ML\Finding_defect\Files\val.csv')
df_val.describe()

Unnamed: 0.1,Unnamed: 0,date,ns,nd,nf,entrophy,la,ld,lt,fix,ndev,age,nuc,exp,rexp,sexp,bug
count,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0,7621.0
mean,14593.753313,1309232000.0,1.09382,1.415431,2.297205,0.4699,55.855268,97.832568,3173.971264,0.275817,16.301666,41.839818,268.595066,1752.087653,25.456921,104.664349,0.463587
std,12938.301879,215116800.0,0.424313,2.834192,10.25545,3.379968,526.476021,4192.685432,10526.874257,0.446954,17.639223,132.280917,1097.734906,2254.96369,44.709164,147.104422,0.498705
min,1.0,914399600.0,1.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
25%,4099.0,1131658000.0,1.0,1.0,1.0,0.0,2.0,1.0,684.0,0.0,4.0,0.54753,27.0,160.0,5.300796,15.0,0.0
50%,11216.0,1311849000.0,1.0,1.0,1.0,0.0,6.0,3.0,1580.0,0.0,10.0,5.122199,94.0,828.0,13.331529,48.0,0.0
75%,21187.0,1466260000.0,1.0,1.0,2.0,0.430827,26.0,13.0,3401.0,1.0,22.0,30.271829,258.0,2485.0,28.51238,132.0,1.0
max,48726.0,1665283000.0,8.0,108.0,435.0,142.740834,27241.0,363587.0,388666.0,1.0,209.0,3741.089051,52411.0,12172.0,1088.711839,1240.0,1.0


In [4]:
df_val.drop(df_val.columns[0:3], axis =1, inplace= True)


In [5]:
for i in df_val.columns:
    percentile25 = df_val[i].quantile(0.25)
    percentile75 = df_val[i].quantile(0.75)
    iqr = percentile75 - percentile25
    upper_limit = percentile75 + 6 * iqr
    lower_limit = percentile25 - 6 * iqr
    df_val = df_val.drop(df_val.loc[(df_val[i] > upper_limit) | (df_val[i] < lower_limit)].index)

In [6]:
y_val = df_val['bug']
df_val.drop(['bug'], axis=1, inplace= True)
X_val = df_val
X_val.describe()

Unnamed: 0,ns,nd,nf,entrophy,la,ld,lt,fix,ndev,age,nuc,exp,rexp,sexp
count,4635.0,4635.0,4635.0,4635.0,4635.0,4635.0,4635.0,4635.0,4635.0,4635.0,4635.0,4635.0,4635.0,4635.0
mean,1.0,1.0,1.0,0.0,7.7137,3.681769,1743.774973,0.302697,14.597411,19.128624,151.547357,1670.037109,18.700501,85.343905
std,0.0,0.0,0.0,0.0,11.211787,5.257085,1682.946826,0.459475,14.38207,34.247944,204.856578,2226.024574,22.16518,114.889936
min,1.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
25%,1.0,1.0,1.0,0.0,1.0,1.0,557.0,0.0,4.0,0.185509,22.0,144.0,4.319703,12.0
50%,1.0,1.0,1.0,0.0,3.0,2.0,1180.0,0.0,10.0,3.31412,72.0,768.0,11.424821,41.0
75%,1.0,1.0,1.0,0.0,9.0,4.0,2425.0,1.0,21.0,20.454051,194.0,2327.0,22.616153,104.0
max,1.0,1.0,1.0,0.0,78.0,29.0,13351.0,1.0,91.0,185.389363,1253.0,12172.0,137.097634,674.0


In [7]:
pt = PowerTransformer()
pX_val = pt.fit_transform(X_val)

In [8]:
sc = StandardScaler()
zX_val = sc.fit_transform(pX_val)

In [9]:
model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=10000, solver='saga')

In [10]:
param_grid = {
    'C' : [0.01, 0.1, 1],
    'penalty' : ['l1', 'l2','elasticnet'],
    'l1_ratio': [0.2, 0.5, 0.8]
}

In [11]:
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc',
}

In [12]:
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, scoring = scoring, cv = 5, refit = 'roc_auc')
grid_search.fit(zX_val, y_val)



In [13]:
best_params = grid_search.best_params_
best_params

{'C': 0.01, 'l1_ratio': 0.2, 'penalty': 'l1'}

In [14]:
best_score = grid_search.best_score_
best_score

np.float64(0.7030849520342803)