In [33]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")


In [39]:
data_train_black = pd.read_csv("20190814_black.txt")
data_train_white = pd.read_csv("20190814_white.txt")

data = pd.concat([data_train_black,data_train_white])
data = data.drop_duplicates()
data = data.reset_index(drop=True)

banana = data[data['label'] == 1].describe()
banana
# data = data[data['label'] == 1]

Unnamed: 0,all_link_num,all_link_feature,internal_link_num,external_link_num,null_link_num,internal_css_feature,external_css_feature,url_num,url_domain_suffix,title_coding,title_keyword,title_length,icp,iframe,label
count,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0
mean,62.755814,0.0,57.232558,5.523256,27.674419,2.372093,0.0,0.348837,0.011628,0.0,0.77907,0.953488,0.77907,0.534884,1.0
std,7.506759,0.0,9.615765,5.239503,12.204572,1.601812,0.0,0.479398,0.107833,0.0,0.417307,0.211825,0.417307,0.501707,0.0
min,44.0,0.0,12.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,59.0,0.0,54.0,3.0,17.5,2.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
50%,61.0,0.0,55.0,5.0,33.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
75%,65.75,0.0,57.75,7.0,38.5,3.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
max,95.0,0.0,92.0,48.0,41.0,12.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [35]:
X = data.drop('label', axis=1)
y = data['label']

sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]
    
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
rand_log_reg = RandomizedSearchCV(LogisticRegression(), log_reg_params, n_iter=4)

for train, test in sss.split(original_Xtrain, original_ytrain):
    pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'), rand_log_reg) # SMOTE happens during Cross Validation not before..
    model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
    best_est = rand_log_reg.best_estimator_
    prediction = best_est.predict(original_Xtrain[test])
    
    accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
    precision_lst.append(precision_score(original_ytrain[test], prediction))
    recall_lst.append(recall_score(original_ytrain[test], prediction))
    f1_lst.append(f1_score(original_ytrain[test], prediction))
    auc_lst.append(roc_auc_score(original_ytrain[test], prediction))
    
print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
print("f1: {}".format(np.mean(f1_lst)))
print('---' * 45)

Train: [   18    19    20 ... 27823 27824 27825] Test: [   0    1    2 ... 5631 5632 5633]
Train: [    0     1     2 ... 27823 27824 27825] Test: [   18    19    20 ... 11179 11180 11181]
Train: [    0     1     2 ... 27823 27824 27825] Test: [   35    36    37 ... 16727 16728 16729]
Train: [    0     1     2 ... 27823 27824 27825] Test: [   52    53    54 ... 22275 22276 22277]
Train: [    0     1     2 ... 22275 22276 22277] Test: [   69    70    71 ... 27823 27824 27825]
---------------------------------------------------------------------------------------------------------------------------------------

accuracy: 0.9555290459277288
precision: 0.06361624430611773
recall: 0.9395604395604396
f1: 0.11891406261208537
---------------------------------------------------------------------------------------------------------------------------------------


In [36]:
print(best_est)

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [37]:
from sklearn.externals import joblib
joblib.dump(best_est, "20190829.m")

['20190829.m']