In [1]:
# NumPy
import numpy as np

# Dataframe operations
import pandas as pd

# XGBoost
import xgboost as xgb

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Scalers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

#dataset
from sklearn.datasets import load_boston

# Models
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.linear_model import Perceptron
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

# GridSearchCV
from sklearn.model_selection import GridSearchCV

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix
import warnings
import math
# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
# Read the competition CSV files (the two reads are independent of each other).
train = pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv")
test = pd.read_csv("../input/tabular-playground-series-mar-2022/test.csv")

# Keep the test row ids around for the submission file.
row_id = test["row_id"]

# Stack train on top of test under a fresh 0..n-1 index so both parts go
# through the same preprocessing.
data = pd.concat([train, test], ignore_index=True)

# Replace each "time" string by its [-8:-3] slice.
# Presumably the values look like "YYYY-MM-DD HH:MM:SS", which leaves "HH:MM"
# (time of day only) -- TODO confirm against the raw CSV.
data["time"] = data["time"].str[-8:-3]
data

In [3]:
# Columns fed to the model; 'congestion' is the target and is split off later.
selected_columns = ['time', 'x', 'y', 'direction', 'congestion']
data1 = data.loc[:, selected_columns]

# One-hot encode the string-valued columns; numeric columns pass through
# unchanged (presumably 'time' and 'direction' are the encoded ones here).
data1_dummies = pd.get_dummies(data1)
data1_dummies.head(3)

In [4]:
# Split the encoded frame back into its train and test parts.
# `data` was built as concat([train, test], ignore_index=True) and the
# encoding step keeps row order, so the first len(train) rows are the training
# rows. Deriving the boundary from `train` avoids a hard-coded row count that
# would silently mis-split if the input files change.
n_train = len(train)
model_train = data1_dummies.iloc[:n_train]
model_test = data1_dummies.iloc[n_train:]

# Features / target for training. Y is kept as a single-column DataFrame.
X = model_train.drop('congestion', axis=1)
Y = model_train[['congestion']]

# Test features; the 'congestion' column is dropped here as well so the
# columns match X.
x_test = model_test.drop('congestion', axis=1)

In [6]:
# 4-fold cross-validated XGBoost regression on `congestion`.

# Booster settings shared by every fold (loop-invariant, so defined once).
xgb_params = {
    # Squared-error regression. This is the current spelling of the objective
    # that used to be called 'reg:linear', which XGBoost has deprecated.
    'objective': 'reg:squarederror',
    # Metric reported for the eval sets and used for early stopping (RMSE).
    'eval_metric': 'rmse',
    # GPU histogram tree builder; needs a CUDA-enabled XGBoost build.
    'tree_method': 'gpu_hist',
}

# Validation RMSE of each fold, in fold order.
fold_rmse = []

kf = KFold(n_splits=4, shuffle=True, random_state=0)
for train_index, valid_index in kf.split(X):
    x_train, x_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = Y.iloc[train_index], Y.iloc[valid_index]

    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_valid, label=y_valid)

    # Early stopping monitors the LAST entry of `evals`, so the validation
    # fold has to come after the training set.
    evals = [(dtrain, 'train'), (dvalid, 'eval')]
    evals_result = {}
    bst = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=50000,
                    early_stopping_rounds=15,
                    evals=evals,
                    evals_result=evals_result,
                    verbose_eval=200)

    # xgb.train returns the booster as of the last round trained, which is
    # `early_stopping_rounds` rounds past the best one. Restrict prediction
    # to the best iteration so the fold score reflects the selected model.
    y_pred = bst.predict(dvalid, iteration_range=(0, bst.best_iteration + 1))
    rmse = math.sqrt(mean_squared_error(y_valid, y_pred))
    fold_rmse.append(rmse)
    print('RMSE:', rmse)

print('Mean RMSE:', sum(fold_rmse) / len(fold_rmse))

# NOTE: after the loop `bst` is the model trained on the final fold only.
    
    

In [8]:
# Predict congestion for the test rows and write the submission file.
dtest = xgb.DMatrix(x_test)

# `bst` was trained with early stopping, so it carries rounds added after the
# best validation score. Limit prediction to the best iteration explicitly;
# not every XGBoost version does this automatically for Booster.predict.
# NOTE(review): `bst` is whatever model the training cell left behind
# (the last CV fold), not an ensemble of all folds.
y_pred = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))

submission = pd.DataFrame({'row_id': test['row_id'], 'congestion': y_pred})
submission.to_csv('submission2.csv', index=False)