In [8]:
import pandas as pd
import pycaret
from missingpy import MissForest
import warnings
warnings.filterwarnings(action='ignore')
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.combine import SMOTETomek,SMOTEENN
from imblearn.over_sampling import ADASYN,SMOTE,RandomOverSampler

In [9]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## id 제거 및 컬럼명 변경

In [10]:
del train['id']
del test['id']

train.columns = ['시간', '기온', '비', '풍속', '습도', '가시성', '오존', '미세먼지', '초미세먼지', 'count']
test.columns = ['시간', '기온', '비', '풍속', '습도', '가시성', '오존', '미세먼지', '초미세먼지']

## 데이터 확인

In [255]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   시간      1459 non-null   int64  
 1   기온      1457 non-null   float64
 2   비       1457 non-null   float64
 3   풍속      1450 non-null   float64
 4   습도      1457 non-null   float64
 5   가시성     1457 non-null   float64
 6   오존      1383 non-null   float64
 7   미세먼지    1369 non-null   float64
 8   초미세먼지   1342 non-null   float64
 9   count   1459 non-null   float64
dtypes: float64(9), int64(1)
memory usage: 114.1 KB


In [256]:
train.describe()

Unnamed: 0,시간,기온,비,풍속,습도,가시성,오존,미세먼지,초미세먼지,count
count,1459.0,1457.0,1457.0,1450.0,1457.0,1457.0,1383.0,1369.0,1342.0,1459.0
mean,11.493489,16.717433,0.031572,2.479034,52.231297,1405.216884,0.039149,57.168736,30.327124,108.5634
std,6.92279,5.23915,0.174917,1.378265,20.370387,583.131708,0.019509,31.771019,14.713252,82.631733
min,0.0,3.1,0.0,0.0,7.0,78.0,0.003,9.0,8.0,1.0
25%,5.5,12.8,0.0,1.4,36.0,879.0,0.0255,36.0,20.0,37.0
50%,11.0,16.6,0.0,2.3,51.0,1577.0,0.039,51.0,26.0,96.0
75%,17.5,20.1,0.0,3.4,69.0,1994.0,0.052,69.0,37.0,150.0
max,23.0,30.0,1.0,8.0,99.0,2000.0,0.125,269.0,90.0,431.0


In [257]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 715 entries, 0 to 714
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   시간      715 non-null    int64  
 1   기온      714 non-null    float64
 2   비       714 non-null    float64
 3   풍속      714 non-null    float64
 4   습도      714 non-null    float64
 5   가시성     714 non-null    float64
 6   오존      680 non-null    float64
 7   미세먼지    678 non-null    float64
 8   초미세먼지   679 non-null    float64
dtypes: float64(8), int64(1)
memory usage: 50.4 KB


In [258]:
test.describe()

Unnamed: 0,시간,기온,비,풍속,습도,가시성,오존,미세먼지,초미세먼지
count,715.0,714.0,714.0,714.0,714.0,714.0,680.0,678.0,679.0
mean,11.472727,23.263305,0.051821,2.388515,56.668067,1359.494398,0.041196,36.930678,24.939617
std,6.928427,4.039645,0.22182,1.17127,19.66188,614.17187,0.020656,12.641503,10.075857
min,0.0,14.6,0.0,0.0,16.0,126.0,0.003,9.0,7.0
25%,5.5,20.3,0.0,1.5,42.0,807.25,0.027,28.0,17.0
50%,11.0,22.9,0.0,2.3,55.0,1489.0,0.039,35.0,24.0
75%,17.0,26.375,0.0,3.275,73.75,2000.0,0.05225,45.0,31.0
max,23.0,33.8,1.0,5.9,100.0,2000.0,0.138,94.0,69.0


## 시간 빼고 다 결측치인 두 행 제거

In [269]:
train.drop([934,1035], inplace = True)
train = train.reset_index(drop = True)

## 결측치 MissForest로 채우기

In [316]:
train['count'] = np.log1p(train['count'])

In [317]:
train['비'] = train['비'].astype('category')
test['비'] = test['비'].astype('category')
train_cols = list(train.columns)
test_cols = list(test.columns)
train_cat_cols = [train.columns.get_loc(col) for col in train.select_dtypes(['category']).columns.tolist()]
test_cat_cols = [test.columns.get_loc(col) for col in test.select_dtypes(['category']).columns.tolist()]

imputer = MissForest(n_estimators=1000)
train_imputed = imputer.fit_transform(train, cat_vars=[2])
test_imputed = imputer.fit_transform(test, cat_vars=[2])

train = pd.DataFrame(train_imputed, columns=train_cols)
test = pd.DataFrame(test_imputed, columns=test_cols)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6


In [318]:
from pycaret.regression import *
exp = setup(train,'count')
a = compare_models(fold=30,cross_validation=True)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.2979,0.2012,0.4326,0.8016,0.1032,0.0955,0.332
et,Extra Trees Regressor,0.2986,0.2099,0.4418,0.7925,0.1043,0.0956,0.047
lightgbm,Light Gradient Boosting Machine,0.3137,0.2283,0.4626,0.7718,0.1098,0.0999,0.0077
rf,Random Forest Regressor,0.3162,0.2458,0.4763,0.7574,0.1133,0.1025,0.0663
gbr,Gradient Boosting Regressor,0.3219,0.2548,0.4852,0.75,0.1172,0.1046,0.017
xgboost,Extreme Gradient Boosting,0.3204,0.2484,0.4835,0.745,0.1176,0.1014,0.0187
ridge,Ridge Regression,0.4732,0.437,0.6487,0.5575,0.1497,0.1501,0.0023
br,Bayesian Ridge,0.4734,0.4377,0.6491,0.5572,0.1498,0.1504,0.003
lr,Linear Regression,0.4733,0.4362,0.6487,0.5566,0.1493,0.1494,0.0027
dt,Decision Tree Regressor,0.4081,0.4425,0.6421,0.5525,0.1535,0.1274,0.0033


# 비

In [243]:
test.loc[test.비.isna(),'비'] = 0
train.비 = train.비.astype('int64')
test.비 = test.비.astype('int64')

## 시간

In [11]:
train.groupby('시간')['count'].agg('mean').sort_values()

시간
5      13.114754
4      13.524590
3      21.377049
6      24.557377
2      31.409836
1      47.606557
7      62.360656
0      71.766667
10     78.803279
11     88.327869
9      93.540984
23    103.852459
12    111.901639
13    120.050000
14    134.590164
8     136.688525
22    148.245902
15    152.967213
20    164.868852
21    168.816667
16    169.100000
17    187.133333
19    201.606557
18    262.163934
Name: count, dtype: float64

In [251]:
t = train.groupby('시간')['count'].agg('mean').sort_values()
for i, d in enumerate(list(t.index)):
    train.loc[train['시간'] == d, '시간'] = chr(i + 97)
    test.loc[test['시간'] == d, '시간'] = chr(i + 97)
def time(x):
    return ord(x) - 96
train.시간 = train.시간.apply(time)
test.시간 = test.시간.apply(time)

In [103]:
def time2(x):
    if x == 1 or x == 2:
        x = 1
    elif x == 3 or x == 4:
        x = 2
    elif x == 8 or x == 9:
        x = 8
    elif x == 10 or x == 11:
        x = 10
    elif x == 15 or x == 16:
        x = 15
    elif x == 17 or x == 18:
        x = 17
    elif x == 19 or x == 20 or x == 21:
        x = 19
    return x
train.시간 = train.시간.apply(time2)
test.시간 = test.시간.apply(time2)

In [104]:
le = LabelEncoder()

In [105]:
train.시간 = le.fit_transform(train.시간)
test.시간 = le.transform(test.시간)

In [106]:
train.groupby('시간')['count'].agg('mean').sort_values()

시간
0      13.319672
1      22.967213
2      31.409836
3      47.606557
4      62.360656
5      75.616667
6      90.934426
7     103.852459
8     111.901639
9     120.050000
10    135.639344
11    150.606557
12    167.580110
13    187.133333
14    201.606557
15    266.516667
Name: count, dtype: float64

## 기온

In [107]:
def temp(x):
    if x <= 14.6:
        x = 14.6
    return x
train.기온 = train.기온.apply(temp)

In [108]:
def temp1(x):
    x //= 2.5
    return x
train.기온 = train.기온.apply(temp1)
test.기온 = test.기온.apply(temp1)

In [109]:
def temp2(x):
    if x <= 3:
        x = 2
    elif x >= 9:
        x = 9
    return x
train.기온 = train.기온.apply(temp2)
test.기온 = test.기온.apply(temp2)

In [110]:
train.기온 = le.fit_transform(train.기온)
test.기온 = le.transform(test.기온)

In [111]:
train.groupby('기온')['count'].agg('mean').sort_values()

기온
0     59.784965
1     90.045267
2    121.164179
3    167.108974
4    200.619266
Name: count, dtype: float64

## 풍속

In [113]:
def wind(x):
    if x >= 5.9:
        x = 5.9
    return x
train.풍속 = train.풍속.apply(wind)

In [114]:
def wind1(x):
    x //= 1
    return x
train.풍속 = train.풍속.apply(wind1)
test.풍속 = test.풍속.apply(wind1)

In [115]:
t = train.groupby('풍속')['count'].agg('mean').sort_values()
for i, d in enumerate(list(t.index)):
    train.loc[train['풍속'] == d, '풍속'] = chr(i + 97)
    test.loc[test['풍속'] == d, '풍속'] = chr(i + 97)
train.풍속 = train.풍속.apply(time)
test.풍속 = test.풍속.apply(time)

In [116]:
def wind2(x):
    if x == 6:
        x -= 1
    return x
train.풍속 = train.풍속.apply(wind2)
test.풍속 = test.풍속.apply(wind2)

In [117]:
train.groupby('풍속')['count'].agg('mean').sort_values()

풍속
1     53.408867
2     74.550964
3    108.151282
4    143.575439
5    172.925926
Name: count, dtype: float64

## 습도

In [119]:
def humi(x):
    if x <= 16:
        x = 16
    return x
train.습도 = train.습도.apply(humi)

In [120]:
def humi1(x):
    x //= 10.1
    return x
train.습도 = train.습도.apply(humi1)
test.습도 = test.습도.apply(humi1)

In [121]:
t = train.groupby('습도')['count'].agg('mean').sort_values()
for i, d in enumerate(list(t.index)):
    train.loc[train['습도'] == d, '습도'] = chr(i + 97)
    test.loc[test['습도'] == d, '습도'] = chr(i + 97)
train.습도 = train.습도.apply(time)
test.습도 = test.습도.apply(time)

In [122]:
def humi2(x):
    if x == 1:
        x += 1
    elif x == 9:
        x -= 1
    return x
train.습도 = train.습도.apply(humi2)
test.습도 = test.습도.apply(humi2)

In [123]:
train.습도 = le.fit_transform(train.습도)
test.습도 = le.transform(test.습도)

## 가시성

In [124]:
def see(x):
    if x <= 126:
        x = 126
    return x
train.가시성 = train.가시성.apply(see)

In [125]:
def see1(x):
    x //= 100
    return x
train.가시성 = train.가시성.apply(see1)
test.가시성 = test.가시성.apply(see1)

In [126]:
t = train.groupby('가시성')['count'].agg('mean').sort_values()
for i, d in enumerate(list(t.index)):
    train.loc[train['가시성'] == d, '가시성'] = chr(i + 97)
    test.loc[test['가시성'] == d, '가시성'] = chr(i + 97)
train.가시성 = train.가시성.apply(time)
test.가시성 = test.가시성.apply(time)

## 오존

## 미세먼지, 초미세먼지

In [127]:
def dust(x):
    x //= 10
    return x
train.미세먼지 = train.미세먼지.apply(dust)
test.미세먼지 = test.미세먼지.apply(dust)
train.초미세먼지 = train.초미세먼지.apply(dust)
test.초미세먼지 = test.초미세먼지.apply(dust)

In [128]:
t = train.groupby('미세먼지')['count'].agg('mean').sort_values()
for i, d in enumerate(list(t.index)):
    train.loc[train['미세먼지'] == d, '미세먼지'] = chr(i + 97)
    test.loc[test['미세먼지'] == d, '미세먼지'] = chr(i + 97)
train.미세먼지 = train.미세먼지.apply(time)
test.미세먼지 = test.미세먼지.apply(time)

t1 = train.groupby('초미세먼지')['count'].agg('mean').sort_values()
for i, d in enumerate(list(t1.index)):
    train.loc[train['초미세먼지'] == d, '초미세먼지'] = chr(i + 97)
    test.loc[test['초미세먼지'] == d, '초미세먼지'] = chr(i + 97)
train.초미세먼지 = train.초미세먼지.apply(time)
test.초미세먼지 = test.초미세먼지.apply(time)

In [244]:
for df in [train,test]:
    for i in cat:
        df[i] = df[i].astype('int64')
train['count'] = train['count'].astype('int64')

In [262]:
x = train.drop(['count'], axis = 1)
y = train['count']

In [256]:
sub = pd.read_csv('submission.csv')

In [269]:
n_est = 2000
seed = 42
n_fold = 5

target = 'count'
X = train.drop(target, axis=1)
y = train[target]
X_test = test

In [273]:
kfold = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
folds=[]
for train_idx, valid_idx in kfold.split(X, y):
        folds.append((train_idx, valid_idx))

cat_pred = np.zeros((X.shape[0]))
cat_pred_test = np.zeros((X_test.shape[0]))

cat = ['시간', '기온', '비', '풍속', '습도', '가시성', '미세먼지', '초미세먼지']

In [275]:
for fold in range(n_fold):
    print(f'\n----------------- Fold {fold} -----------------\n')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y[train_idx], y[valid_idx]
    train_data = Pool(data=X_train, label=y_train, cat_features=cat)
    valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat)

    model_cat = CatBoostRegressor()
    model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=100)

    cat_pred[valid_idx] = model_cat.predict(X_valid)
    cat_pred_test += model_cat.predict(X_test) / n_fold


----------------- Fold 0 -----------------

Learning rate set to 0.052157
0:	learn: 79.6201771	test: 81.8679719	best: 81.8679719 (0)	total: 30.2ms	remaining: 30.2s
100:	learn: 37.9568995	test: 39.7618480	best: 39.7618480 (100)	total: 3.87s	remaining: 34.4s
200:	learn: 33.2198195	test: 38.4726662	best: 38.4641053 (195)	total: 8.33s	remaining: 33.1s
300:	learn: 30.0540917	test: 38.2625171	best: 38.1884238 (282)	total: 13s	remaining: 30.1s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 38.1884238
bestIteration = 282

Shrink model to first 283 iterations.

----------------- Fold 1 -----------------

Learning rate set to 0.052157
0:	learn: 80.7070346	test: 78.5884243	best: 78.5884243 (0)	total: 33.7ms	remaining: 33.6s
100:	learn: 38.3495193	test: 42.7581307	best: 42.7462947 (97)	total: 4.7s	remaining: 41.8s
200:	learn: 33.4274112	test: 42.2820443	best: 42.1469004 (150)	total: 9.4s	remaining: 37.4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 42.146

In [277]:
sub['count'] = cat_pred_test

In [279]:
sub.to_csv('cat.csv', index = False)