# 加载数据集

In [2]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import gc
import warnings
import lightgbm as lgb

from collections import Counter
from sklearn import preprocessing
import matplotlib.pyplot as plt 

from sklearn.feature_selection import SelectKBest,chi2,RFE,SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score,roc_auc_score,accuracy_score,roc_curve, plot_roc_curve
from contextlib import contextmanager

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Default font size for every matplotlib figure drawn after this point.
plt.rc("font", size=14)
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides convergence and deprecation warnings from the
# model fits below - confirm that is intended.
warnings.simplefilter(action='ignore')


# Base data directory.
# NOTE(review): the read_csv cells visible in this notebook spell out
# "C:/data/..." themselves instead of using this variable.
path = 'C:/data/'

def reduce_mem_usage(data):
    """Downcast the numeric columns of ``data`` to the narrowest dtype that fits.

    Signed-integer columns are narrowed within int8/16/32/64, unsigned-integer
    columns within uint8/16/32/64, and float columns within float16/32/64,
    based on each column's observed minimum and maximum. Every other column
    (object, bool, datetime, categorical, pandas extension dtypes, ...) is left
    untouched.

    The frame is modified in place and the same object is returned. Memory
    usage before and after, and the relative saving, are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns should be downcast.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with narrowed column dtypes.
    """
    start_mem = data.memory_usage().sum() / 1024**2

    print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    # float64 is the fallback when neither of these can hold the value range.
    # NOTE(review): float16 keeps only ~3 significant decimal digits, so values
    # are rounded by this cast - confirm that is acceptable for the features.
    float_types = (np.float16, np.float32)

    for col in data.columns:
        col_type = data[col].dtype

        # Only plain NumPy integer/float columns are downcast. Bool, datetime
        # and extension dtypes must not fall through to the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if col_type.kind == 'f':
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                # NaN bounds compare False everywhere and so end up as float64.
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            data[col] = data[col].astype(target)
        else:
            # A zero-row column has NaN min/max; there is nothing to size against.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            lo, hi = int(c_min), int(c_max)
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if limits.min <= lo and hi <= limits.max:
                    data[col] = data[col].astype(candidate)
                    break

    end_mem = data.memory_usage().sum() / 1024**2

    print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not raise.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return data

In [3]:
# Load the data_app table from its CSV export.
data_app = pd.read_csv("C:/data/processing_data/data_app.csv")
# Optional dtype downcast, currently disabled.
# data_app = reduce_mem_usage(data_app)

In [4]:
data_app.shape

(307511, 222)

In [5]:
# Load the all_data table from its CSV export.
all_data = pd.read_csv('C:/data/processing_data/all_data.csv')
# Optional dtype downcast, currently disabled.
# all_data = reduce_mem_usage(all_data)

In [6]:
all_data.shape

(307511, 462)

In [7]:
# Load the feature-selected table from its CSV export.
select_data = pd.read_csv("C:/data/processing_data/feature_select_combination_data.csv")
# Optional dtype downcast, currently disabled.
# select_data = reduce_mem_usage(select_data)

In [8]:
select_data.shape

(307511, 141)

# 模型定义

In [30]:
# Time one logistic-regression fit on data_app. The clock starts before the
# feature/label extraction, so that preparation is included in the figure.
start1 = time.time()

    
# Features: every column except the label (TARGET) and SK_ID_CURR
# (presumably a row identifier - confirm against the dataset).
X_train =  data_app.drop(columns = ['TARGET', 'SK_ID_CURR'])
# Labels as a flat 1-D int32 array.
y_train =  np.array(data_app['TARGET'].astype(np.int32)).reshape((-1, ))
LR = LogisticRegression(random_state=42, solver='sag',n_jobs=-1)


LR = LR.fit(X_train,y_train)
       
end1 = time.time()
# The printed label means "elapsed time"; the value is in seconds.
print("消耗时间：", end1 - start1)

消耗时间： 55.97522974014282


In [20]:
def model_process(data, n_repeats=10):
    """Benchmark the training time of seven classifiers on ``data``.

    The label is the ``TARGET`` column; the features are every other column
    except ``SK_ID_CURR``, which is dropped when present. Each model is fitted
    ``n_repeats`` times on the full data, and the mean wall-clock time per fit
    is printed and collected.

    Parameters
    ----------
    data : pandas.DataFrame
        Training table. Must contain a ``TARGET`` column.
    n_repeats : int, default 10
        Number of fits per model to average over.

    Returns
    -------
    dict
        Maps model name to mean seconds per fit, in execution order.

    Raises
    ------
    KeyError
        If ``data`` has no ``TARGET`` column.
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    y_train = np.array(data['TARGET'].astype(np.int32)).reshape((-1, ))
    # Drop both non-feature columns in a single call. Dropping them one at a
    # time from `data` left SK_ID_CURR in the feature matrix whenever TARGET
    # was also present. errors='ignore' tolerates a missing SK_ID_CURR.
    X_train = data.drop(columns=['SK_ID_CURR', 'TARGET'], errors='ignore').to_numpy()

    # (result key, banner printed before the run, estimator)
    benchmarks = [
        # Logistic regression
        ("LR",
         "=============================逻辑回归================================",
         LogisticRegression(random_state=42, solver='sag',n_jobs=-1)),
        # Gaussian naive Bayes
        ("GaussianNB",
         "=============================GaussianNB================================",
         GaussianNB()),
        # Single-hidden-layer neural network
        ("DNN",
         "=============================DNN================================",
         MLPClassifier(hidden_layer_sizes=(400,),activation='relu',max_iter=300, alpha=0.01,learning_rate_init = 0.01, learning_rate= 'constant')),
        # Decision tree
        ("Decision Tree",
         "=============================Decision Tree================================",
         DecisionTreeClassifier(random_state=0)),
        # Random forest
        ("Random Forest",
         "=============================Random Forest================================",
         RandomForestClassifier(n_estimators=100)),
        # Ensemble learning: AdaBoost
        ("AdaBoost",
         "=============================AdaBoost================================",
         AdaBoostClassifier(n_estimators=100, random_state=0)),
        # LightGBM gradient boosting
        ("GBM",
         "=============================GBM================================",
         lgb.LGBMClassifier(
            boosting_type='gbdt',
            num_leaves=47,
            max_depth=8,
            learning_rate=0.072283,
            n_estimators= 750,
            objective='binary',
            class_weight=None,
            min_split_gain=0.0222415,
            min_child_weight=60,
            min_child_samples=485,
            subsample=0.6195545,
            subsample_freq=1,
            colsample_bytree=0.60017128,
            reg_alpha=0.5969339,
            reg_lambda=0.7364944,
            random_state=42,
            n_jobs=-1
         )),
    ]

    timings = {}
    for name, banner, model in benchmarks:
        print(banner)
        # perf_counter is a monotonic clock intended for measuring intervals.
        started = time.perf_counter()
        for _ in range(n_repeats):
            model.fit(X_train, y_train)
        mean_seconds = (time.perf_counter() - started) / n_repeats
        # Each model reports its own timing (the Random Forest section used to
        # print the logistic-regression figure). The label means "elapsed time".
        print("消耗时间：", mean_seconds)
        print("=================================================================")
        timings[name] = mean_seconds

    return timings

    

# 性能比较

In [25]:
model_process(data_app)

消耗时间： 50.92211031913757
消耗时间： 1.0448870658874512
消耗时间： 419.798814535141
消耗时间： 30.33236789703369
消耗时间： 117.50469017028809
消耗时间： 125.51784896850586
消耗时间： 19.938085556030273


In [26]:
model_process(all_data)

消耗时间： 102.38911294937134
消耗时间： 2.113762378692627
消耗时间： 738.1093595027924
消耗时间： 176.190279006958
消耗时间： 369.92705941200256
消耗时间： 461.2255494594574
消耗时间： 61.41725468635559


In [27]:
model_process(select_data)

消耗时间： 34.4935200214386
消耗时间： 0.5989737510681152
消耗时间： 95.23218250274658
消耗时间： 68.33688998222351
消耗时间： 262.07338094711304
消耗时间： 214.58651304244995
消耗时间： 26.832836627960205


In [28]:
start1 = time.time()



In [29]:
# Stop the clock started in the previous cell.
end1 = time.time()
# NOTE(review): divides by 10 as if ten repetitions had been timed, but no
# repeated work is visible between the start1 and end1 cells - confirm what
# this was meant to measure.
time1 = (end1 - start1)/10

In [30]:
print(time1)

0.8046187877655029


In [62]:
# Empty results table with one column per dataset variant
# (presumably meant to hold the timing figures - confirm).
costtime = pd.DataFrame(columns = ["data_app", "all_data","select_data"])

In [66]:
# Writes the literal 1 into row 0 of the data_app column.
# NOTE(review): looks like a placeholder rather than a measured time - confirm.
costtime.loc[0,"data_app"] = 1

In [67]:
for i in range(10):
    

Unnamed: 0,data_app,all_data,select_data
0,1,,
