In [35]:
# Mount Google Drive inside the Colab runtime so that the dataset paths under
# /content/drive used by the later cells can be resolved.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
import numpy as np
import os
import pandas as pd
import random
from sklearn.preprocessing import MinMaxScaler


def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed``, and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Note: Python reads ``PYTHONHASHSEED`` at interpreter start-up, so setting
    it here does not change hashing in the already-running kernel; it is only
    visible to processes started afterwards.

    Args:
        seed: Seed value passed to ``random.seed`` and ``np.random.seed``.
    """
    hash_seed = str(seed)
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=hash_seed)
    np.random.seed(seed)


# Single seed value used for the whole notebook.
random_seed = 42
seed_everything(random_seed) # Fix the seed

In [37]:
# Read the train and test CSV files from the mounted Drive data folder.
train_csv_path = '/content/drive/MyDrive/웹 로그 기반 조회수 예측 해커톤/data/train.csv'
test_csv_path = '/content/drive/MyDrive/웹 로그 기반 조회수 예측 해커톤/data/test.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [38]:
# Replace missing values with the literal category label 'NAN', but only in
# non-numeric columns. Filling the whole frame would also write the string into
# any numeric column that happens to contain a gap, silently turning it into an
# object column and breaking the np.log1p / MinMaxScaler cells further down.
# The frames are still updated in place, as before.
for frame in (train, test):
    non_numeric_cols = frame.select_dtypes(exclude='number').columns
    frame[non_numeric_cols] = frame[non_numeric_cols].fillna('NAN')

#### 범주형 데이터 처리

In [39]:
# Turn the 0/1 `bounced` flag into string labels in both splits.
# `replace` leaves any value other than 0 or 1 untouched.
bounce_labels = {0: 'not_bounced', 1: 'bounced'}
for frame in (train, test):
    frame['bounced'] = frame['bounced'].replace(bounce_labels)

In [40]:
# train['new']=train['new'].replace({0:'not_visited',1:'visited'})
# test['new']=test['new'].replace({0:'not_visited',1:'visited'})

In [41]:
# def get_transaction_status(transaction):
#   return 'N' if transaction == 0 else 'Y'
# train['transaction_status'] = train['transaction'].map(get_transaction_status)
# test['transaction_status'] = test['transaction'].map(get_transaction_status)

In [42]:
# def keyword_trim(keyword):
#   if pd.isna(keyword):
#     return 'None'
#   else:
#     return keyword.split('_')[0]
# train['keyword']=train['keyword'].map(keyword_trim)
# test['keyword']=test['keyword'].map(keyword_trim)

In [43]:
# def referral_path_trim(referral_path):
#   if pd.isna(referral_path):
#     return 'None'
#   else:
#     return referral_path.split('_')[0]
# train['referral_path']=train['referral_path'].map(referral_path_trim)
# test['referral_path']=test['referral_path'].map(referral_path_trim)

In [44]:
# def edit_traffic_medimum(data):
#   if data['traffic_medium'] != 'referral' and data['referral_path'] != 'NAN':
#     data['traffic_medium'] = 'referral'
#   if data['traffic_medium'] == '(not set)':
#     data['traffic_medium'] = '(none)'
#   return data

# train = train.apply(edit_traffic_medimum, axis=1)
# test = test.apply(edit_traffic_medimum, axis=1)

In [45]:
# def traffic_source_integrate(traffic_source):
#   keywords = ['google','youtube','(direct)']
#   for keyword in keywords:
#     if keyword in traffic_source:
#       return keyword
#   return 'etc'
# train['traffic_source']=train['traffic_source'].map(traffic_source_integrate)
# test['traffic_source']=test['traffic_source'].map(traffic_source_integrate)

In [52]:
# def keyword_referral_path_integrate(data):
#   data['keyword_or_referral_path'] = 'NAN'
#   if data['keyword'] != 'NAN':
#     data['keyword_or_referral_path'] = data['keyword']
#   if data['referral_path'] != 'NAN':
#     data['keyword_or_referral_path'] = data['referral_path']
#   return data
# train = train.apply(keyword_referral_path_integrate, axis=1)
# test = test.apply(keyword_referral_path_integrate, axis=1)

In [47]:
def log_numeric_features(data, features=('quality', 'duration')):
  """Return a copy of ``data`` with ``np.log1p`` applied to selected columns.

  The input frame is left untouched: the transform is applied to a copy, so
  other references to the same DataFrame are not silently changed.

  Args:
    data: DataFrame containing every column named in ``features``.
    features: Column names to transform. Defaults to ``quality`` and
      ``duration``, the columns this notebook log-transforms.

  Returns:
    A new DataFrame equal to ``data`` except that each listed column holds
    ``log1p`` of its original values.

  Raises:
    KeyError: If a name in ``features`` is not a column of ``data``.
  """
  transformed = data.copy()
  for column in features:
    transformed[column] = np.log1p(transformed[column])
  return transformed
# Log-transform the numeric inputs of both splits.
train = train.pipe(log_numeric_features)
test = test.pipe(log_numeric_features)
# Only the training split has a TARGET column to transform; it gets the same
# log1p transform as the inputs.
train['TARGET'] = train['TARGET'].pipe(np.log1p)

#### 수치형 변수 스케일링

In [48]:
# Numeric columns to rescale.
features = ['quality','duration','transaction','transaction_revenue']

# Learn the per-column min/max from the training split only, then apply the
# same mapping to both splits (test values may therefore fall outside the
# range seen in train).
scaler = MinMaxScaler().fit(train[features])
train[features] = scaler.transform(train[features])
test[features] = scaler.transform(test[features])

#### 불필요 컬럼 제거

In [49]:
# Columns removed from both splits before saving.
drop_table = ['sessionID','userID']
train = train.drop(columns=drop_table)
test = test.drop(columns=drop_table)

In [50]:
# Display the training frame after preprocessing as a visual sanity check.
train

Unnamed: 0,TARGET,browser,OS,device,new,quality,duration,bounced,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword,referral_path
0,2.890372,Chrome,Macintosh,desktop,0,0.803566,0.722274,not_bounced,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,NAN
1,1.386294,Chrome,Windows,desktop,1,0.000000,0.395696,not_bounced,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,NAN
2,0.693147,Samsung Internet,Android,mobile,1,0.000000,0.000000,bounced,0.0,0.0,Asia,Southeast Asia,Malaysia,(direct),(none),NAN,NAN
3,0.693147,Chrome,Macintosh,desktop,1,0.000000,0.000000,bounced,0.0,0.0,Americas,Northern America,United States,Partners,affiliate,NAN,NAN
4,0.693147,Chrome,iOS,mobile,0,0.000000,0.000000,bounced,0.0,0.0,Americas,Northern America,United States,groups.google.com,referral,NAN,Category6_Path_0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252284,0.693147,Chrome,Android,mobile,1,0.000000,0.000000,bounced,0.0,0.0,Europe,Northern Europe,United Kingdom,youtube.com,referral,NAN,Category5_Path_0032
252285,0.693147,Chrome,Macintosh,desktop,0,0.000000,0.000000,bounced,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,NAN
252286,1.791759,Chrome,Macintosh,desktop,0,0.103913,0.455725,not_bounced,0.0,0.0,Americas,Northern America,United States,(direct),(none),NAN,Category1
252287,0.693147,Android Webview,Android,mobile,1,0.000000,0.361201,not_bounced,0.0,0.0,Africa,Northern Africa,Egypt,youtube.com,referral,NAN,Category2_Path_0018


#### 전처리 완료 후 csv 형태로 저장

In [51]:
# Write both preprocessed splits next to the raw data, without the row index.
for frame, csv_path in (
    (train, '/content/drive/MyDrive/웹 로그 기반 조회수 예측 해커톤/data/processed_train.csv'),
    (test, '/content/drive/MyDrive/웹 로그 기반 조회수 예측 해커톤/data/processed_test.csv'),
):
    frame.to_csv(csv_path, index=False)