In [50]:
# NumPy
import numpy as np

# Dataframe operations
import pandas as pd

# XGBoost
import xgboost as xgb

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Scalers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

#dataset
from sklearn.datasets import load_boston

# Models
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.linear_model import Perceptron
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

# GridSearchCV
from sklearn.model_selection import GridSearchCV

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix
import warnings
import math
warnings.filterwarnings('ignore')

In [51]:
# CSVを読み込む
train= pd.read_csv("train.csv")
test= pd.read_csv("test.csv")

# データの統合
data = pd.concat([train, test], ignore_index = True)

# 提出用に
row_id = test["row_id"]

data

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70.0
1,1,1991-04-01 00:00:00,0,0,NB,49.0
2,2,1991-04-01 00:00:00,0,0,SB,24.0
3,3,1991-04-01 00:00:00,0,1,EB,18.0
4,4,1991-04-01 00:00:00,0,1,NB,60.0
...,...,...,...,...,...,...
851170,851170,1991-09-30 23:40:00,2,3,NB,
851171,851171,1991-09-30 23:40:00,2,3,NE,
851172,851172,1991-09-30 23:40:00,2,3,SB,
851173,851173,1991-09-30 23:40:00,2,3,SW,


In [52]:

# 全体の欠損データの個数確認
data_null = train.fillna(np.nan)
data_null.isnull().sum()

row_id        0
time          0
x             0
y             0
direction     0
congestion    0
dtype: int64

In [53]:
data.describe()

Unnamed: 0,row_id,x,y,congestion
count,851175.0,851175.0,851175.0,848835.0
mean,425587.0,1.138462,1.630769,47.815305
std,245713.202025,0.801478,1.089379,16.799392
min,0.0,0.0,0.0,0.0
25%,212793.5,0.0,1.0,35.0
50%,425587.0,1.0,2.0,47.0
75%,638380.5,2.0,3.0,60.0
max,851174.0,2.0,3.0,100.0


In [69]:
data1 = data[['x','y','direction','congestion']]
data1_dummies=pd.get_dummies(data1)
data1_dummies.head(3)

MemoryError: Unable to allocate 10.4 GiB for an array with shape (13095, 851175) and data type uint8

In [66]:
model_train = data1_dummies[:848835]
model_test = data1_dummies[848835:]

X = model_train.drop('congestion', axis=1)
Y = pd.DataFrame(model_train['congestion'])
x_test = model_test.drop('congestion', axis=1)
Y

Unnamed: 0,congestion
0,70.0
1,49.0
2,24.0
3,18.0
4,60.0
...,...
848830,54.0
848831,28.0
848832,68.0
848833,17.0


In [67]:
params = {'objective':'binary:logistic',
          'max_depth':5,
          'eta': 0.1, 
          'min_child_weight':1.0,
          'gamma':0.0,
          'colsample_bytree':0.8}

num_round = 1000

logloss = []
accuracy = []

kf = KFold(n_splits=4, shuffle=True, random_state=0)
for train_index, valid_index in kf.split(X):
    x_train, x_valid = X.iloc[train_index], X.iloc[valid_index] 
    y_train, y_valid = Y.iloc[train_index], Y.iloc[valid_index]
    
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_valid, label=y_valid)
    
    xgb_params = {
        # 回帰問題
        'objective': 'reg:linear',
        # 学習用の指標 (RMSE)
        'eval_metric': 'rmse',
    }
    evals = [(dtrain, 'train'), (dtest, 'eval')]
    evals_result = {}
    bst = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=50000,
                    early_stopping_rounds=15,
                    evals=evals,
                    evals_result=evals_result,
                    )

    y_pred = bst.predict(dtest)
    mse = mean_squared_error(y_valid, y_pred)
    print('RMSE:', math.sqrt(mse))
    
    

[0]	train-rmse:36.52369	eval-rmse:36.57156
[1]	train-rmse:27.28302	eval-rmse:27.33112
[2]	train-rmse:21.33323	eval-rmse:21.37965
[3]	train-rmse:17.67190	eval-rmse:17.71465
[4]	train-rmse:15.19884	eval-rmse:15.24315
[5]	train-rmse:13.94846	eval-rmse:13.99160
[6]	train-rmse:12.91028	eval-rmse:12.94637
[7]	train-rmse:12.25898	eval-rmse:12.28980
[8]	train-rmse:11.91123	eval-rmse:11.93742
[9]	train-rmse:11.68406	eval-rmse:11.70842
[10]	train-rmse:11.56637	eval-rmse:11.58984
[11]	train-rmse:11.49800	eval-rmse:11.52051
[12]	train-rmse:11.42048	eval-rmse:11.44062
[13]	train-rmse:11.37296	eval-rmse:11.39223
[14]	train-rmse:11.34836	eval-rmse:11.36790
[15]	train-rmse:11.32157	eval-rmse:11.33989
[16]	train-rmse:11.31229	eval-rmse:11.33087
[17]	train-rmse:11.29802	eval-rmse:11.31547
[18]	train-rmse:11.29614	eval-rmse:11.31401
[19]	train-rmse:11.29329	eval-rmse:11.31110
[20]	train-rmse:11.28077	eval-rmse:11.29803
[21]	train-rmse:11.27501	eval-rmse:11.29188
[22]	train-rmse:11.27130	eval-rmse:11.2879

[70]	train-rmse:11.26829	eval-rmse:11.26004
[71]	train-rmse:11.26829	eval-rmse:11.26004
[72]	train-rmse:11.26829	eval-rmse:11.26004
[73]	train-rmse:11.26829	eval-rmse:11.26004
[74]	train-rmse:11.26829	eval-rmse:11.26004
[75]	train-rmse:11.26829	eval-rmse:11.26004
[76]	train-rmse:11.26829	eval-rmse:11.26004
[77]	train-rmse:11.26829	eval-rmse:11.26004
[78]	train-rmse:11.26829	eval-rmse:11.26004
[79]	train-rmse:11.26829	eval-rmse:11.26004
[80]	train-rmse:11.26829	eval-rmse:11.26004
[81]	train-rmse:11.26829	eval-rmse:11.26004
[82]	train-rmse:11.26829	eval-rmse:11.26004
[83]	train-rmse:11.26829	eval-rmse:11.26004
[84]	train-rmse:11.26829	eval-rmse:11.26004
[85]	train-rmse:11.26829	eval-rmse:11.26004
[86]	train-rmse:11.26829	eval-rmse:11.26004
[87]	train-rmse:11.26829	eval-rmse:11.26004
[88]	train-rmse:11.26829	eval-rmse:11.26004
[89]	train-rmse:11.26829	eval-rmse:11.26004
[90]	train-rmse:11.26829	eval-rmse:11.26004
[91]	train-rmse:11.26829	eval-rmse:11.26004
[92]	train-rmse:11.26829	eval-rm

In [68]:
#predictで予測
dtest = xgb.DMatrix(x_test)
y_pred = bst.predict(dtest)
submission = pd.DataFrame({'row_id':test['row_id'], 'congestion':y_pred})
submission.to_csv('submission1.csv', index=False)