In [1]:
# NumPy
import numpy as np

# Dataframe operations
import pandas as pd

# XGBoost
import xgboost as xgb

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Scalers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Models
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.linear_model import Perceptron
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import cross_validate

# GridSearchCV
from sklearn.model_selection import GridSearchCV

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
# CSVを読み込む
train= pd.read_csv("train.csv")
test= pd.read_csv("test.csv")

# データの統合
data = pd.concat([train, test], ignore_index = True)

# 提出用に
row_id = test["row_id"]

data

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70.0
1,1,1991-04-01 00:00:00,0,0,NB,49.0
2,2,1991-04-01 00:00:00,0,0,SB,24.0
3,3,1991-04-01 00:00:00,0,1,EB,18.0
4,4,1991-04-01 00:00:00,0,1,NB,60.0
...,...,...,...,...,...,...
851170,851170,1991-09-30 23:40:00,2,3,NB,
851171,851171,1991-09-30 23:40:00,2,3,NE,
851172,851172,1991-09-30 23:40:00,2,3,SB,
851173,851173,1991-09-30 23:40:00,2,3,SW,


In [3]:

# 全体の欠損データの個数確認
data_null = train.fillna(np.nan)
data_null.isnull().sum()

row_id        0
time          0
x             0
y             0
direction     0
congestion    0
dtype: int64

In [4]:
data.describe()

Unnamed: 0,row_id,x,y,congestion
count,851175.0,851175.0,851175.0,848835.0
mean,425587.0,1.138462,1.630769,47.815305
std,245713.202025,0.801478,1.089379,16.799392
min,0.0,0.0,0.0,0.0
25%,212793.5,0.0,1.0,35.0
50%,425587.0,1.0,2.0,47.0
75%,638380.5,2.0,3.0,60.0
max,851174.0,2.0,3.0,100.0


In [5]:
data1 = data[['x','y','direction','congestion']]
data1_dummies=pd.get_dummies(data1)
data1_dummies.head(3)

Unnamed: 0,x,y,congestion,direction_EB,direction_NB,direction_NE,direction_NW,direction_SB,direction_SE,direction_SW,direction_WB
0,0,0,70.0,1,0,0,0,0,0,0,0
1,0,0,49.0,0,1,0,0,0,0,0,0
2,0,0,24.0,0,0,0,0,1,0,0,0


In [6]:
model_train = data1_dummies[:848834]
model_test = data1_dummies[848834:]

X = model_train.drop('congestion', axis=1)
Y = pd.DataFrame(model_train['congestion'])
x_test = model_test.drop('congestion', axis=1)
Y

Unnamed: 0,congestion
0,70.0
1,49.0
2,24.0
3,18.0
4,60.0
...,...
848829,48.0
848830,54.0
848831,28.0
848832,68.0


In [7]:
params = {'objective':'binary:logistic',
          'max_depth':5,
          'eta': 0.1, 
          'min_child_weight':1.0,
          'gamma':0.0,
          'colsample_bytree':0.8,
          'subsample':0.8}

num_round = 1000

logloss = []
accuracy = []

kf = KFold(n_splits=4, shuffle=True, random_state=0)
for train_index, valid_index in kf.split(X):
    x_train, x_valid = X.iloc[train_index], X.iloc[valid_index] 
    y_train, y_valid = Y.iloc[train_index], Y.iloc[valid_index]
    #データフレームをxgboostに適した形に変換
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_valid, label=y_valid)
    dtest = xgb.DMatrix(x_test)
    #xgboostで学習
    watchlist = [(dtrain,'train'),(dvalid,'eval')]
    model = xgb.train(params, dtrain, num_round, evals=watchlist, early_stopping_rounds=50)

    valid_pred_proba = model.predict(dvalid)
    #loglossを求める
    score = log_loss(y_valid, valid_pred_proba)
    logloss.append(score)
    #accuracyを求める
    #valid_pred_probaは確率値なので0と1に変換
    valid_pred = np.where(valid_pred_proba >0.5,1,0)
    acc = accuracy_score(y_valid, valid_pred)
    accuracy.append(acc)

print(f'log_loss:{np.mean(logloss)}')
print(f'accuracy:{np.mean(accuracy)}')

XGBoostError: [22:19:08] C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/objective/regression_obj.cu:138: label must be in [0,1] for logistic regression