## Data introduction and import necessary Libraries

**Pixel Resolution:** 30m * 30m<br>
**Target:** Target data set<br>
**Point of Interest (POI):** POI Kernel Density, 
- POI_Sel, BandWidth range: 250 m - 2500 m, interval = 250 m<br>

**Road Network (RN):** Road Kernel Density
- BandWidth range: 250 m - 2500 m, interval = 250 m<br>

**NTL:** Time,2019/03; NPP-VIIRS, DNB<br>
**XM_Boundary:** mask of all the layers<br>
**Train_Test:** 
- Train, data located in Off-island area of Xiamen, label = 0
- Test,  data located in Island area of Xiamen, label = 1

In [1]:
import math
import json
import copy
import time
import numpy as np
import pandas as pd
from sl_1 import * # my custom module 

In [3]:
# Extract and generate headers
# extrat_begin('Target', 'begin')

## read clean data and split it

### read data

In [4]:
clean_data = pd.read_csv(r'data\ly_df_clean.csv')
clean_data.head()

Unnamed: 0,poi_all_250,poi_all_500,poi_all_750,poi_all_1000,poi_all_1250,poi_all_1500,poi_all_1750,poi_all_2000,poi_all_2250,poi_all_2500,...,RN_2000,RN_2250,RN_2500,NTL,lng,lat,Target,Train_Test,XM_Boundary,ID
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002976,0.020894,...,0.0,0.0,0.0,-9999.0,118.011595,24.907131,0.0,0.0,1.0,430
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001634,0.018002,...,0.0,0.0,0.0,-9999.0,118.011892,24.907129,0.0,0.0,1.0,431
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005609,0.025637,...,0.0,0.0,0.0,-9999.0,118.011296,24.906862,0.0,0.0,1.0,2361
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003711,0.022329,...,0.0,0.0,0.0,-9999.0,118.011593,24.90686,0.0,0.0,1.0,2362
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002184,0.019295,...,0.0,0.0,0.0,-9999.0,118.01189,24.906858,0.0,0.0,1.0,2363


In [3]:
len(clean_data) # Sample size: 1890571

1890571

### spli train and test
- Train, data located in Off-island area of Xiamen, label = 0
- Test,  data located in Island area of Xiamen, label = 1

In [5]:
Train = clean_data[clean_data['Train_Test'] == 0]
Test = clean_data[clean_data['Train_Test'] == 1]
# Train.to_csv(r'data\Train.csv')
# Test.to_csv(r'data\Test.csv')
# Train.head()
# Test.head()

In [6]:
len(Test)

171791

## Sensitivity analysis of bandwidth

In [3]:
def get_name(name_pref):
    '''
    input: name_pref, name prefixs of lyer name list
    output: a lyer name list
    '''
    ly_name = [name_pref + str(i) for i in range(250,2750,250)]
    return ly_name
# get_name('poi_all_')  # test example 1
get_name('RN_') # test example 2

['RN_250',
 'RN_500',
 'RN_750',
 'RN_1000',
 'RN_1250',
 'RN_1500',
 'RN_1750',
 'RN_2000',
 'RN_2250',
 'RN_2500']

In [2]:
from sklearn.model_selection import cross_val_score

def get_name(name_pref):
    '''
    input: name_pref, name prefixs of lyer name list
    output: a lyer name list
    '''
    ly_name = [name_pref + str(i) for i in range(250,2750,250)]
    return ly_name
# get_name('poi_all_')  # test example 1
get_name('RN_') # test example 2
# get_name('poi_all_') + get_name('RN_') + ['Target','Train_Test','XM_Boundary']

def opti_band(names, clf):
    '''input: names, a list made up of different band width name
       output: a string, a band width name with max score
    '''
    score_list = []
    for name in names:
        X = np.array(Train[name]).reshape(-1, 1)
        y = Train['Target']
        scores = cross_val_score(clf, X, y, cv=5)
        score_list.append(scores.mean())
    return score_list

def select_name(names,clf):
    A = {}
    scores = opti_band(names, clf)
    ind = scores.index(max(scores))
    name = names[ind]
    A[name] = scores
    return A
# input_name = 'poi_all_' # 
# names = get_name(input_name)[0:2]
# name = select_name(names)
# name

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier

names = ["LogisticRegression",
         "Decision Tree", 
         "Random Forest", 
         "AdaBoost",
         "GradientBoosting"]

classifiers = [
    LogisticRegression(random_state=0, solver='lbfgs', max_iter =150),
    DecisionTreeClassifier(random_state=0, max_depth=2),
    RandomForestClassifier(n_estimators=10, random_state=0, max_depth=2),
    AdaBoostClassifier(n_estimators=10, random_state=0),
    GradientBoostingClassifier(n_estimators=100, random_state=0)]

list1 = []
for name, clf in list(zip(names, classifiers))[5:]:
    for input_name in ['poi_sel_', 'RN_']:
        names = get_name(input_name)[0:8]
        name = select_name(names,clf)
        print(name)
        list1.append(name)

{'poi_sel_750': [0.8875387488088566, 0.890418669984753, 0.8918958601815452, 0.8893609002583232, 0.8668036183400524, 0.8549993470831492, 0.8625530037806393, 0.8576501071542906]}
{'RN_250': [0.8350212943631554, 0.8296034518019795, 0.8306128904054318, 0.8288098146822026, 0.8292048688247233, 0.8275618438799466, 0.8248471098005687, 0.8234868432588511]}
{'poi_sel_500': [0.8883125815544191, 0.8982155132961005, 0.8930321690121877, 0.8854570172655682, 0.8794562478978378, 0.8797227158978685, 0.872141768356191, 0.8647242889915333]}
{'RN_1250': [0.8396937826843999, 0.8455211435736503, 0.8511629173226114, 0.8537013465738028, 0.8548754306914967, 0.8526756166296728, 0.8524516187250724, 0.8503646761465044]}


In [9]:
score_list = [{'poi_sel_750': [0.8801062160559965,
   0.8867527770787383,
   0.8871036003829122,
   0.88501723456701,
   0.8810242833197899,
   0.878080327222498,
   0.8766112574295093,
   0.8748780477656201]},
 {'RN_1750': [0.8407899344127374,
   0.8488049048455742,
   0.8535896834056246,
   0.8559721743507032,
   0.856960657622183,
   0.8573423179653057,
   0.8577001285138349,
   0.8572247863462765]},
 {'poi_sel_500': [0.8780075974432119,
   0.9035198512662044,
   0.8835318123221562,
   0.8625925726315403,
   0.8516888788466208,
   0.8693148178666711,
   0.8469715763557708,
   0.8376492541726485]},
 {'RN_750': [0.8460674913289254,
   0.8527024349863552,
   0.8588882041349454,
   0.846455513089673,
   0.8438501720565036,
   0.8429728054711323,
   0.8449323214324895,
   0.8581172937545791]},
 {'poi_sel_500': [0.8786912204900327,
   0.9036239949351265,
   0.8849328071571725,
   0.8671527731376203,
   0.8529327826371931,
   0.8689610771140028,
   0.8479333041995387,
   0.8418842339979002]},
 {'RN_750': [0.8453786294051039,
   0.8524132746047602,
   0.8594868808268341,
   0.8464223502623358,
   0.8451126929524561,
   0.8434830499358881,
   0.8455600924692419,
   0.8538759009698118]},
 {'poi_sel_750': [0.8875387488088566,
   0.890418669984753,
   0.8918958601815452,
   0.8893609002583232,
   0.8668036183400524,
   0.8549993470831492,
   0.8625530037806393,
   0.8576501071542906]},
 {'RN_250': [0.8350212943631554,
   0.8296034518019795,
   0.8306128904054318,
   0.8288098146822026,
   0.8292048688247233,
   0.8275618438799466,
   0.8248471098005687,
   0.8234868432588511]},
 {'poi_sel_500': [0.8883125815544191,
   0.8982155132961005,
   0.8930321690121877,
   0.8854570172655682,
   0.8794562478978378,
   0.8797227158978685,
   0.872141768356191,
   0.8647242889915333]},
 {'RN_1250': [0.8396937826843999,
   0.8455211435736503,
   0.8511629173226114,
   0.8537013465738028,
   0.8548754306914967,
   0.8526756166296728,
   0.8524516187250724,
   0.8503646761465044]}]

In [11]:
# score_list

## Ensemble methods
- RandomForest
- AdaBoost
- Gradient Tree Boosting
- Voting Classifier

### cross validation

In [8]:
import copy
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

names = ["LogisticRegression",
         "Naive Bayes", 
         "QDA",
         "Decision Tree", 
         "Random Forest", 
         "AdaBoost",
         "GradientBoosting"]

params = [{'param': 'default'},
          {'param': 'default'},
          {'param': 'default'},
          {'max_depth': list(range(1, 6))},
          {'max_depth': list(range(1, 6))},
          {'learning_rate': [0.001,0.1,1]}, # 
          {'learning_rate': [0.001,0.1,1]}]

classifiers = [
    LogisticRegression(random_state=0, solver='lbfgs', max_iter =150),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(n_estimators=100, random_state=0),
    AdaBoostClassifier(n_estimators=100, random_state=0),
    GradientBoostingClassifier(n_estimators=100, random_state=0)]


In [6]:
X_Train =  np.array(Train[['poi_sel_250', 'RN_500','NTL']])
Y_Train = Train['Target'].copy()
X_Test = np.array(Test[['poi_sel_250', 'RN_500','NTL']])
Y_Test = Test['Target'].copy() # get sub dataframe

params_cv = [] # params selcted by cross validation
for name, params, clf in list(zip(names, params, classifiers)):
    print(name)
    # for "Decision Tree", "Random Forest", "AdaBoost", "GradientBoosting", conduct cross validation.
    if name in [ "Decision Tree", "Random Forest", "AdaBoost", "GradientBoosting"]:
        grid = GridSearchCV(clf, params, cv=10, iid=False)
        grid.fit(X_Train, Y_Train)
        # use the best estimator to compute the kernel density estimate
        clf_best = grid.best_estimator_
        if name in [ "Decision Tree", "Random Forest"]:
            params_cv.append(clf_best.max_depth)
        else:
            params_cv.append(clf_best.learning_rate)
    else:
        pass

LogisticRegression
Naive Bayes
QDA
Decision Tree
Random Forest
AdaBoost
GradientBoosting


In [7]:
params_cv

[2, 4, 1, 0.1]

### example 1 

In [18]:
import warnings
warnings.filterwarnings("ignore")

import copy
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

names = ["LogisticRegression",
         "Naive Bayes", 
         "QDA",
         "Decision Tree", 
         "Random Forest", 
         "AdaBoost",
         "GradientBoosting"]

classifiers = [
    LogisticRegression(random_state=0, solver='lbfgs', max_iter =150),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    DecisionTreeClassifier(random_state=0, max_depth=2),
    RandomForestClassifier(n_estimators=100, random_state=0, max_depth=4),
    AdaBoostClassifier(n_estimators=100, random_state=0, learning_rate=1),
    GradientBoostingClassifier(n_estimators=100, random_state=0, learning_rate=0.1)]

In [22]:
X_Train =  np.array(Train[['poi_sel_250', 'RN_500','NTL']])
Y_Train = Train['Target'].copy()
X_Test = np.array(Test[['poi_sel_250', 'RN_500','NTL']])
Y_Test = Test['Target'].copy() # get sub dataframe

for name, clf in list(zip(names, classifiers)):
    clf.fit(X_Train, Y_Train)
    score = clf.score(X_Test, Y_Test) # overall_accuracy
    predictions = clf.predict(X_Test)
    f1 = f1_score(Y_Test, predictions, average='macro') # f1_score
    print('name',name,'overall_accuracy:', score,'f1_score',f1)

name LogisticRegression overall_accuracy: 0.8665180364512692 f1_score 0.8337140537687726
name Naive Bayes overall_accuracy: 0.8718500969200947 f1_score 0.8332470621988753
name QDA overall_accuracy: 0.8691258564185551 f1_score 0.8304654569234581
name Decision Tree overall_accuracy: 0.8479023930240815 f1_score 0.8164341698383304
name Random Forest overall_accuracy: 0.8751506190661908 f1_score 0.8421709530922891
name AdaBoost overall_accuracy: 0.8917987554644888 f1_score 0.8583320927696501
name GradientBoosting overall_accuracy: 0.8672864119773446 f1_score 0.830651426089562


### example2-reverse Train and test

In [23]:
X_Train =  np.array(Train[['poi_sel_250', 'RN_500','NTL']])
Y_Train = Train['Target'].copy()
X_Test = np.array(Test[['poi_sel_250', 'RN_500','NTL']])
Y_Test = Test['Target'].copy() # get sub dataframe

for name, clf in list(zip(names, classifiers)):
    clf.fit(X_Test, Y_Test)
    score = clf.score(X_Train, Y_Train) # overall_accuracy
    predictions = clf.predict(X_Train)
    f1 = f1_score(Y_Train, predictions, average='macro') # f1_score
    print('name',name,'overall_accuracy:', score,'f1_score',f1)

name LogisticRegression overall_accuracy: 0.8903710771593805 f1_score 0.8114116045903234
name Naive Bayes overall_accuracy: 0.8865549983127567 f1_score 0.774790872902696
name QDA overall_accuracy: 0.8845564877413049 f1_score 0.7746017106062489
name Decision Tree overall_accuracy: 0.850520718183828 f1_score 0.7913918960582623
name Random Forest overall_accuracy: 0.8913351330594957 f1_score 0.8298344998507784
name AdaBoost overall_accuracy: 0.8959413072062742 f1_score 0.8287294092126196
name GradientBoosting overall_accuracy: 0.8953757898043961 f1_score 0.8298787225599409


### write to CSV

In [11]:
import copy
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import f1_score
from itertools import combinations
def get_comb(input_list,n):
    '''input: input_list, a list.
       input: n，an integer; the number of combinations.
       output: a list, the combinations of input_list.
    '''
    comb_all = []
    m = 1
    while m<n+1:
        a = [list(i) for i in list(combinations(input_list, m))]
        comb_all += a
        m += 1
    return comb_all
# comb_all = get_comb([1,2,3], 3)
# comb_all

In [12]:
comb_all = get_comb(['poi_sel_250', 'RN_500','NTL'], 3)
comb_all

[['poi_sel_250'],
 ['RN_500'],
 ['NTL'],
 ['poi_sel_250', 'RN_500'],
 ['poi_sel_250', 'NTL'],
 ['RN_500', 'NTL'],
 ['poi_sel_250', 'RN_500', 'NTL']]

In [13]:
def clf_pre(data_name,names, classifiers):
    '''input: data_name,a list consist of the names or name of data to be predicted by different classfiers. 
       names: the name of different classifiers.
       classifiers: the classifiers to predict data.
    '''
    X_Train =  np.array(Train[data_name])
    Y_Train = Train['Target'].copy()
    X_Test = np.array(Test[data_name])
    Y_Test = Test['Target'].copy() # get sub dataframe

    score_list = []
    f1_list = []
    pred = []
    pred_proba = []
    for name, clf in list(zip(names, classifiers)):
        clf.fit(X_Train, Y_Train)
        score = clf.score(X_Test, Y_Test) # overall_accuracy
        predictions = clf.predict(X_Test)
        proba = clf.predict_proba(X_Test)
        f1 = f1_score(Y_Test, predictions, average='macro') # f1_score
        score_list.append(score)
        f1_list.append(f1)
        pred.append(predictions)
        pred_proba.append(proba)
        if len(data_name) > 0:
            dname = '_'.join(data_name)
        else:
            dname = data_name[0]
        print('data_name', dname, 'clf_name', name,'overall_accuracy:', score,'f1_score',f1)
    data1 = {'data_name': dname, 'clf_name': names, 
             'accuracy': score_list, 'f1_score': f1_list, 
             'pred': pred, 'pred_proba': pred_proba}
    DF1 = pd.DataFrame(data1)
    return DF1

def clf_pres(data_names,names, classifiers):
    '''input: data_name,a list consist of the names or name of data to be predicted by different classfiers. 
       names: the name of different classifiers.
       classifiers: the classifiers to predict data.
    '''
    list1 = []
    for data_name in data_names:
        DF1 = clf_pre(data_name,names, classifiers)
        list1.append(DF1)
    DF2 = pd.concat(list1)
    return DF2


In [14]:
data_names = get_comb(['poi_sel_250', 'RN_500','NTL'], 3)

names = ["LogisticRegression",
#          "Naive Bayes", 
#          "QDA",
         "Decision Tree", 
         "Random Forest", 
         "AdaBoost",
         "GradientBoosting"
        ]

classifiers = [
    LogisticRegression(random_state=0, solver='lbfgs', max_iter =150),
#     GaussianNB(),
#     QuadraticDiscriminantAnalysis(),
    DecisionTreeClassifier(random_state=0, max_depth=2),
    RandomForestClassifier(n_estimators=100, random_state=0, max_depth=4),
    AdaBoostClassifier(n_estimators=100, random_state=0, learning_rate=1),
    GradientBoostingClassifier(n_estimators=100, random_state=0, learning_rate=0.1)
]

DF = clf_pres(data_names,names, classifiers)

data_name poi_sel_250 clf_name LogisticRegression overall_accuracy: 0.7854253133167628 f1_score 0.7589776106003601
data_name poi_sel_250 clf_name Decision Tree overall_accuracy: 0.8591486166330017 f1_score 0.8223580957429071
data_name poi_sel_250 clf_name Random Forest overall_accuracy: 0.8360158564767654 f1_score 0.8038865818876129
data_name poi_sel_250 clf_name AdaBoost overall_accuracy: 0.8363010867856873 f1_score 0.8041293936322809
data_name poi_sel_250 clf_name GradientBoosting overall_accuracy: 0.8361031718774558 f1_score 0.8039302086491127
data_name RN_500 clf_name LogisticRegression overall_accuracy: 0.7648421628606854 f1_score 0.7273925302782609
data_name RN_500 clf_name Decision Tree overall_accuracy: 0.7835218375817127 f1_score 0.7420077435911363
data_name RN_500 clf_name Random Forest overall_accuracy: 0.7963921276434738 f1_score 0.752070056352489
data_name RN_500 clf_name AdaBoost overall_accuracy: 0.7934816142871279 f1_score 0.7497431090565381
data_name RN_500 clf_name Gr

In [16]:
DF.head()

Unnamed: 0,data_name,clf_name,accuracy,f1_score,pred,pred_proba
0,poi_sel_250,LogisticRegression,0.785425,0.758978,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.910496367441936, 0.08950363255806403], [0...."
1,poi_sel_250,Decision Tree,0.859149,0.822358,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.9717330338158109, 0.028266966184189148], [..."
2,poi_sel_250,Random Forest,0.836016,0.803887,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.9717454135548619, 0.028254586445138345], [..."
3,poi_sel_250,AdaBoost,0.836301,0.804129,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.5088365902716536, 0.4911634097283464], [0...."
4,poi_sel_250,GradientBoosting,0.836103,0.80393,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.9716079959379437, 0.028392004062056376], [..."


In [15]:
np.vstack(DF['pred']).T.shape

(171791, 35)

In [17]:
pred = pd.DataFrame(np.vstack(DF['pred']).T)
pred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
pred_proba_0 = pd.DataFrame(np.vstack([i[:,0] for i in np.array(DF['pred_proba'])]).T)
pred_proba_0.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0.910496,0.971733,0.971745,0.508837,0.971608,0.953646,0.964993,0.992685,0.512226,0.992413,...,0.942517,0.981514,0.961411,0.508591,0.965393,0.95662,0.974762,0.973065,0.509622,0.97344
1,0.910496,0.971733,0.971745,0.508837,0.971608,0.953646,0.964993,0.992685,0.512226,0.992413,...,0.942517,0.981514,0.961411,0.508591,0.965393,0.95662,0.974762,0.973065,0.509622,0.97344
2,0.910496,0.971733,0.971745,0.508837,0.971608,0.953646,0.964993,0.992685,0.512226,0.992413,...,0.942517,0.981514,0.961411,0.508591,0.965393,0.95662,0.974762,0.973065,0.509622,0.97344
3,0.910496,0.971733,0.971745,0.508837,0.971608,0.953646,0.964993,0.992685,0.512226,0.992413,...,0.942517,0.981514,0.961411,0.508591,0.965393,0.95662,0.974762,0.973065,0.509622,0.97344
4,0.910496,0.971733,0.971745,0.508837,0.971608,0.953646,0.964993,0.992685,0.512226,0.992413,...,0.942517,0.981514,0.961411,0.508591,0.965393,0.95662,0.974762,0.973065,0.509622,0.97344


In [78]:
pred_proba_1 = pd.DataFrame(np.vstack([i[:,1] for i in np.array(DF['pred_proba'])]).T)
pred_proba_1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0.089504,0.028267,0.028255,0.491163,0.028392,0.046354,0.035007,0.007315,0.487774,0.007587,...,0.057483,0.018486,0.038589,0.491409,0.034607,0.04338,0.025238,0.026935,0.490378,0.02656
1,0.089504,0.028267,0.028255,0.491163,0.028392,0.046354,0.035007,0.007315,0.487774,0.007587,...,0.057483,0.018486,0.038589,0.491409,0.034607,0.04338,0.025238,0.026935,0.490378,0.02656
2,0.089504,0.028267,0.028255,0.491163,0.028392,0.046354,0.035007,0.007315,0.487774,0.007587,...,0.057483,0.018486,0.038589,0.491409,0.034607,0.04338,0.025238,0.026935,0.490378,0.02656
3,0.089504,0.028267,0.028255,0.491163,0.028392,0.046354,0.035007,0.007315,0.487774,0.007587,...,0.057483,0.018486,0.038589,0.491409,0.034607,0.04338,0.025238,0.026935,0.490378,0.02656
4,0.089504,0.028267,0.028255,0.491163,0.028392,0.046354,0.035007,0.007315,0.487774,0.007587,...,0.057483,0.018486,0.038589,0.491409,0.034607,0.04338,0.025238,0.026935,0.490378,0.02656


In [103]:
DF.to_csv("data/pred_OA_F1.csv",index=False)
pred.to_csv("data/pred_test.csv",index=False)
pred_proba_0.to_csv("data/pred_proba0_test.csv",index=False)
pred_proba_1.to_csv("data/pred_proba1_test.csv",index=False)

## AUC

In [95]:
from sklearn.metrics import roc_auc_score

In [85]:
clean_data = pd.read_csv(r'data\ly_df_clean.csv')

In [86]:
Test = clean_data[clean_data['Train_Test'] == 1]['Target']

In [91]:
proba1 = pd.read_csv(r"data\pred_proba1_test.csv")

In [92]:
Test.head()

1471886    0.0
1471887    0.0
1471888    0.0
1471889    0.0
1471890    0.0
Name: Target, dtype: float64

In [93]:
proba1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,0.089504,0.028267,0.028255,0.491163,0.028392,0.046354,0.035007,0.007315,0.487774,0.007587,...,0.057483,0.018486,0.038589,0.491409,0.034607,0.04338,0.025238,0.026935,0.490378,0.02656
1,0.089504,0.028267,0.028255,0.491163,0.028392,0.046354,0.035007,0.007315,0.487774,0.007587,...,0.057483,0.018486,0.038589,0.491409,0.034607,0.04338,0.025238,0.026935,0.490378,0.02656
2,0.089504,0.028267,0.028255,0.491163,0.028392,0.046354,0.035007,0.007315,0.487774,0.007587,...,0.057483,0.018486,0.038589,0.491409,0.034607,0.04338,0.025238,0.026935,0.490378,0.02656
3,0.089504,0.028267,0.028255,0.491163,0.028392,0.046354,0.035007,0.007315,0.487774,0.007587,...,0.057483,0.018486,0.038589,0.491409,0.034607,0.04338,0.025238,0.026935,0.490378,0.02656
4,0.089504,0.028267,0.028255,0.491163,0.028392,0.046354,0.035007,0.007315,0.487774,0.007587,...,0.057483,0.018486,0.038589,0.491409,0.034607,0.04338,0.025238,0.026935,0.490378,0.02656


In [96]:
y_true = Test
auc_list = []
for col in range(35):
    y_scores = proba1.iloc[:,col]
    auc = roc_auc_score(y_true, y_scores)
    list1.append(auc)

In [97]:
auc_list

[0.9187349985227917,
 0.89844707883568,
 0.9179978037426867,
 0.9175090981832759,
 0.9175387130063574,
 0.8669460710983833,
 0.82806811930277,
 0.8666903698188313,
 0.866587006149796,
 0.8666306471544971,
 0.9133999722542718,
 0.872653844959824,
 0.862067973051339,
 0.8766273205533109,
 0.872167228425904,
 0.9369236535102515,
 0.8753095952087578,
 0.9322329006074048,
 0.9318336950831304,
 0.9323878129404481,
 0.9518255531882956,
 0.9108525723863624,
 0.947761905590733,
 0.9472477511191131,
 0.9386209085984448,
 0.912449473718523,
 0.8632689566046489,
 0.9148313848088777,
 0.9060742278549949,
 0.8910381821955634,
 0.9454405720693422,
 0.9108525723863624,
 0.9469477161999336,
 0.9490215553408677,
 0.9355738772582822]

In [103]:
auc = pd.DataFrame(auc_list, columns=["auc"])

In [108]:
auc.to_csv('data/AUC.csv',index=False)