In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [2]:
import numpy as np
import random
import os
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed=42
seed_everything(seed) # Seed 고정

#### 데이터 불러오기 및 전처리

In [3]:
train = pd.read_csv('/content/drive/MyDrive/웹 로그 기반 조회수 예측 해커톤/data/processed_train.csv')
test = pd.read_csv('/content/drive/MyDrive/웹 로그 기반 조회수 예측 해커톤/data/processed_test.csv')

In [4]:
train

Unnamed: 0,TARGET,browser,OS,device,new,quality,duration,bounced,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword,referral_path
0,2.890372,Chrome,Macintosh,desktop,0,0.803566,0.722274,not_bounced,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,NAN
1,1.386294,Chrome,Windows,desktop,1,0.000000,0.395696,not_bounced,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,NAN
2,0.693147,Samsung Internet,Android,mobile,1,0.000000,0.000000,bounced,0.0,0.0,Asia,Southeast Asia,Malaysia,(direct),(none),NAN,NAN
3,0.693147,Chrome,Macintosh,desktop,1,0.000000,0.000000,bounced,0.0,0.0,Americas,Northern America,United States,Partners,affiliate,NAN,NAN
4,0.693147,Chrome,iOS,mobile,0,0.000000,0.000000,bounced,0.0,0.0,Americas,Northern America,United States,groups.google.com,referral,NAN,Category6_Path_0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252284,0.693147,Chrome,Android,mobile,1,0.000000,0.000000,bounced,0.0,0.0,Europe,Northern Europe,United Kingdom,youtube.com,referral,NAN,Category5_Path_0032
252285,0.693147,Chrome,Macintosh,desktop,0,0.000000,0.000000,bounced,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,NAN
252286,1.791759,Chrome,Macintosh,desktop,0,0.103913,0.455725,not_bounced,0.0,0.0,Americas,Northern America,United States,(direct),(none),NAN,Category1
252287,0.693147,Android Webview,Android,mobile,1,0.000000,0.361201,not_bounced,0.0,0.0,Africa,Northern Africa,Egypt,youtube.com,referral,NAN,Category2_Path_0018


In [5]:
categorical_features = [
"browser",
"OS",
"device",
"continent",
"subcontinent",
"country",
"traffic_source",
"traffic_medium",
"keyword",
"referral_path",
]
for i in categorical_features:
    train[i] = train[i].astype('category')
    test[i] = test[i].astype('category')

train_X = train.drop('TARGET', axis=1)
train_y = train['TARGET']

In [6]:
train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y, test_size=0.3, random_state=seed)

In [7]:
train_X['TARGET'] = train_y
train_X = train_X[train['bounced']=='not_bounced']
train_y = train_X['TARGET']
train_X = train_X.drop(['TARGET', 'bounced'], axis=1)

  train_X = train_X[train['bounced']=='not_bounced']


#### 모델 학습

##### CatBoost



In [None]:
train_pool = Pool(data=train_X, label=train_y, cat_features=categorical_features)
clf = CatBoostRegressor(random_state=seed, verbose=False)
clf.fit(train_pool)

<catboost.core.CatBoostRegressor at 0x7993fa122200>

In [None]:
feature_importance = clf.feature_importances_
for idx, importance in zip(train_X.columns, feature_importance):
  print("{}: {:.2f}%".format(idx, importance))

browser: 2.42%
OS: 3.95%
device: 1.32%
new: 3.69%
quality: 44.61%
duration: 22.71%
transaction: 2.09%
continent: 1.61%
subcontinent: 9.39%
country: 1.09%
traffic_source: 2.89%
traffic_medium: 1.95%
keyword: 0.96%
referral_path: 1.31%


##### XGBoost

In [None]:
from xgboost import XGBRegressor
xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=6, random_state = seed, enable_categorical=True)
xgb_model.fit(train_X, train_y)

In [None]:
pd.Series(xgb_model.feature_importances_, index= train_X.columns).sort_values(ascending=False)

quality                0.298025
subcontinent           0.130162
country                0.081401
transaction            0.076456
new                    0.067740
duration               0.066007
traffic_source         0.051885
referral_path          0.039004
browser                0.038629
OS                     0.036516
keyword                0.031124
traffic_medium         0.026789
transaction_revenue    0.025077
device                 0.021018
continent              0.010168
dtype: float32

#### KFold

##### CatBoost

In [8]:
kf = KFold(n_splits=5)
models = []
for train_index, test_index in kf.split(train_X):
    models.append(CatBoostRegressor(random_state=seed, verbose=False))
    kfold_train_X, kfold_train_y = train_X.iloc[train_index], train_y.iloc[train_index]
    kfold_test_X,  kfold_test_y  = train_X.iloc[test_index],  train_y.iloc[test_index]
    train_pool = Pool(data=kfold_train_X, label=kfold_train_y, cat_features=categorical_features)
    models[-1].fit(train_pool)

In [16]:
valid_pool = Pool(data=valid_X.drop('bounced',axis=1), cat_features=categorical_features)
# pred = np.array([models[0].predict(valid_pool)])
pred = np.array(np.expm1([models[0].predict(valid_pool)]))

for i in range(1, 5):
  # pred = np.append(pred, np.array([models[i].predict(valid_pool)]),axis=0)
  pred = np.append(pred, np.array(np.expm1([models[i].predict(valid_pool)])), axis=0)

pred = np.mean(pred, axis=0)
valid_y = np.expm1(valid_y)
pred

array([14.65358598,  1.73163479,  2.54625223, ...,  1.31879937,
        7.20684984,  2.5881186 ])

In [17]:
test_pool = Pool(data=test.drop('bounced', axis=1), cat_features=categorical_features)
#pred = np.array([models[0].predict(test_pool)])
pred = np.array(np.expm1([models[0].predict(test_pool)]))
for i in range(1, 5):
  #pred = np.append(pred, np.array([models[i].predict(test_pool)]),axis=0)
  pred = np.append(pred, np.expm1(np.array([models[i].predict(test_pool)])),axis=0)
pred = np.mean(pred, axis=0)
pred

array([24.49875595,  1.26824789,  2.51775257, ...,  2.57899611,
        4.50505456,  1.85625631])

#### 모델 평가

In [10]:
def RMSE(y, pred):
  return np.sqrt(mean_squared_error(y, pred))

In [None]:
### CatBoost
valid_pool = Pool(data=valid_X.drop('bounced',axis=1), cat_features=categorical_features)
pred = clf.predict(valid_pool)
pred

array([14.30296553,  1.74810435,  2.56110448, ...,  1.6537653 ,
        6.83618826,  2.72301469])

In [None]:
### XGBoost
pred = xgb_model.predict(valid_X.drop('bounced',axis=1))
pred

array([10.990601 ,  1.9393274,  2.740711 , ...,  1.4078379,  7.7134166,
        2.6377053], dtype=float32)

In [None]:
### TARGET 값 log 변환 적용 시
pred = np.expm1(pred)
valid_y = np.expm1(valid_y)

In [14]:
pred = [1 if i < 1 else i for i in pred]
for i, bounced in enumerate(valid_X.bounced):
  if bounced == 'bounced':
    pred[i] = 1

In [15]:
RMSE(valid_y, pred)

2.5136340574794604

#### 예측값 출력

In [None]:
### CatBoost
test_pool = Pool(data=test.drop('bounced', axis=1), cat_features=categorical_features)
pred = clf.predict(test_pool)
pred

In [None]:
### XGBoost
pred = xgb_model.predict(test.drop('bounced', axis=1))
pred

In [None]:
### TARGET 값 log 변환 적용 시
pred = np.expm1(pred)

In [None]:
pred = [1 if i < 1 else i for i in pred]
for i, bounced in enumerate(test.bounced):
  if bounced == 'bounced':
    pred[i] = 1

In [None]:
submit = pd.read_csv('/content/drive/MyDrive/웹 로그 기반 조회수 예측 해커톤/data/sample_submission.csv')
submit['TARGET'] = pred

In [None]:
submit.to_csv("sample_submission.csv", index=False)