In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor, early_stopping
from scipy.special import inv_boxcox
from scipy.stats import boxcox, zscore
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from xgboost import XGBRegressor

# Notebook-wide display configuration: plain white seaborn background and no
# cap on the number of DataFrame columns shown.
sns.set_style('white')
pd.options.display.max_columns = None

In [2]:
# Raw competition files. The untouched frames are kept because
# testing_data.row_id is read again when the submission file is written.
training_data = pd.read_csv('Dataset/train.csv')
testing_data = pd.read_csv('Dataset/test.csv')

# Working copies without the row identifier column.
train = training_data.drop(columns = ['row_id'])
test = testing_data.drop(columns = ['row_id'])

for frame in (train, test):
    print(frame.shape)

train.head()

(89197, 9)
(11121, 8)


Unnamed: 0,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,19990,37,128,24,Male,Student,180,1000,4.33
1,5304,32,132,14,Female,Student,330,714,1.79
2,1840,12,24,19,Male,Student,180,138,4.35
3,12597,23,112,19,Male,Student,220,613,3.77
4,13626,23,112,27,Male,Working Professional,220,613,3.13


### Preprocessing

Encoding categorical features as numerical values

In [3]:
# Integer codes for the two string-valued columns.
gender_dict = {'Male':0, 'Female':1}
profession_dict = {'Working Professional':0, 'Other':1, 'Student':2}

# Series.map is used rather than Series.replace: replace leaves labels that are
# missing from the dict in place as strings (which only fails later, when the
# columns are used in arithmetic), and its implicit object -> int downcast is
# deprecated in recent pandas. map always yields a numeric column and turns
# unknown labels into NaN, which is reported explicitly here.
for frame_name, frame in (('train', train), ('test', test)):
    for column, mapping in (('gender', gender_dict), ('profession', profession_dict)):
        if pd.api.types.is_numeric_dtype(frame[column]):
            # Column was already encoded (e.g. the cell was re-run); leave it alone.
            continue

        encoded = frame[column].map(mapping)

        # Values that were present in the data but have no code in the mapping.
        unknown = frame.loc[encoded.isna() & frame[column].notna(), column].unique()
        if len(unknown) > 0:
            raise ValueError(
                f'{frame_name}.{column} contains labels missing from its mapping: {list(unknown)}'
            )

        frame[column] = encoded

Creating new features

In [4]:
def add_engineered_features(frame):
    """Add the hand-crafted interaction features to ``frame`` in place.

    The same formulas are applied to the training and the test frame, so they
    live in one function instead of two copy-pasted listings that can drift
    apart.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the numeric columns ``gender``, ``age``, ``views``,
        ``user_id``, ``category_id``, ``profession``, ``video_id`` and
        ``followers`` (``gender`` and ``profession`` already integer-encoded).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with 16 extra columns.

    Notes
    -----
    NOTE(review): the divisions and ``np.log`` calls produce ``inf``/``nan``
    if a denominator or log argument is zero or negative -- confirm the data
    never contains such values.
    """
    # Ratios involving the encoded gender flag.
    frame['gender/age'] = frame['gender'] / frame['age']
    frame['gender/views'] = frame['gender'] / frame['views']

    # Age-normalised features.
    frame['log(views)/age'] = np.log(frame['views']) / frame['age']
    frame['user_id**2/age'] = (frame['user_id'] ** 2) / frame['age']
    frame['category_id**3/age'] = (frame['category_id'] ** 3) / frame['age']
    frame['age*exp(profession)'] = frame['age'] * (np.exp(frame['profession']))

    # Products and ratios of the identifier and popularity columns.
    frame['user_id**2*views**3'] = (frame['user_id'] ** 2) * (frame['views'] ** 3)
    frame['category_id**2/video_id'] = (frame['category_id'] ** 2) / frame['video_id']
    frame['sqrt(user_id)/followers'] = np.sqrt(frame['user_id']) / frame['followers']
    frame['log(category_id)/followers'] = np.log(frame['category_id']) / frame['followers']
    frame['age*user_id'] = frame['age'] * frame['user_id']
    frame['followers*sqrt(user_id)'] = frame['followers'] * np.sqrt(frame['user_id'])
    frame['age**2*category_id'] = (frame['age'] ** 2) * frame['category_id']

    # Second-order features derived from columns created above, so these
    # assignments must stay after them.
    frame['log(user_id**2/age)'] = np.log(frame['user_id**2/age'])
    frame['1/age**2*category_id'] = 1 / frame['age**2*category_id']
    frame['1/followers*sqrt(user_id)'] = 1 / frame['followers*sqrt(user_id)']

    return frame


add_engineered_features(train)
add_engineered_features(test)

Feature Selection and Train-Test Splitting

In [5]:
# Model inputs: two raw identifier columns followed by the engineered features.
features = [
    'user_id', 'video_id',
    'gender/age', 'gender/views', 'log(views)/age', 'user_id**2/age',
    'category_id**3/age', 'age*exp(profession)', 'user_id**2*views**3',
    'category_id**2/video_id', 'sqrt(user_id)/followers', 'log(category_id)/followers',
    'age*user_id', 'followers*sqrt(user_id)', 'age**2*category_id', 'log(user_id**2/age)',
    '1/age**2*category_id', '1/followers*sqrt(user_id)',
]

# Target: Box-Cox transform (fixed lambda 2.85009) of engagement_score + 1.
# The prediction cells invert it with inv_boxcox(..., 2.85009) - 1.
shifted_score = train['engagement_score'] + 1
y = boxcox(shifted_score, 2.85009)

X = train[features]
test = test[features]

# NOTE(review): this 85/15 hold-out split is not consumed by any cell visible
# in this notebook; the cross-validation loops rebind X_train and y_train.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 89, test_size = .15)

### Model 1

In [None]:
# LightGBM hyper-parameters shared by both cross-validation runs below.
param = {'reg_alpha': 0.5071016819855272,
         'reg_lambda': 0.09575110521583012,
         'colsample_bytree': 0.31709249417149765,
         'subsample': 0.46186005802999774,
         'learning_rate': 0.017892817849788113,
         'num_leaves': 505,
         'min_child_samples': 1,
         # Was spelled 'min_data_per_groups', which LightGBM does not recognise
         # and therefore ignored. The real parameter is integer-valued, so the
         # tuned value 36.6295891093262 is rounded. It only influences
         # categorical features.
         'min_data_per_group': 37}

In [None]:
# 6-fold cross-validation: one LightGBM model per fold, each contributing an
# out-of-fold R2 score and a full set of test-set predictions.
score_r2 = []
pred_list = []

fold = KFold(n_splits = 6, shuffle = True, random_state = 42)

for train_index, test_index in fold.split(X):
    # X is a DataFrame (positional .iloc), y is the NumPy array returned by boxcox.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = LGBMRegressor(**param, random_state = 34, metric = 'rmse', n_estimators = 30000)
    # Early stopping is requested through the callback API: the
    # early_stopping_rounds argument of fit() was removed in LightGBM 4.0,
    # while the callback works on both older and newer releases.
    model.fit(X_train, y_train, eval_set = [(X_test, y_test)],
              callbacks = [early_stopping(stopping_rounds = 100)])
    y_pred = model.predict(X_test)

    # NOTE(review): this R2 is on the Box-Cox-transformed target and on the
    # same fold that drove early stopping, so it is likely optimistic.
    fold_r2 = r2_score(y_test, y_pred)
    print('R2 Score:', fold_r2)

    score_r2.append(fold_r2)
    predict = model.predict(test)
    pred_list.append(predict)

In [None]:
# Mean R2 across the cross-validation folds.
np.asarray(score_r2).mean(axis = 0)

In [None]:
# Average the per-fold test predictions, then undo the target transform
# (inverse Box-Cox followed by removing the +1 shift).
mean_fold_pred = np.mean(pred_list, axis = 0)
pred4 = inv_boxcox(mean_fold_pred, 2.85009) - 1

submission = pd.DataFrame({'row_id' : testing_data['row_id'], 'engagement_score': pred4})
submission.to_csv('prediction.csv', index = False)

### Model 2

In [None]:
# 8-fold cross-validation with a different shuffle seed; same model settings
# as the 6-fold run. score_r2 and pred_list are reset for this run.
score_r2 = []
pred_list = []

fold = KFold(n_splits = 8, shuffle = True, random_state = 13)

for train_index, test_index in fold.split(X):
    # X is a DataFrame (positional .iloc), y is the NumPy array returned by boxcox.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = LGBMRegressor(**param, random_state = 34, metric = 'rmse', n_estimators = 30000)
    # Early stopping is requested through the callback API: the
    # early_stopping_rounds argument of fit() was removed in LightGBM 4.0,
    # while the callback works on both older and newer releases.
    model.fit(X_train, y_train, eval_set = [(X_test, y_test)],
              callbacks = [early_stopping(stopping_rounds = 100)])
    y_pred = model.predict(X_test)

    # NOTE(review): this R2 is on the Box-Cox-transformed target and on the
    # same fold that drove early stopping, so it is likely optimistic.
    fold_r2 = r2_score(y_test, y_pred)
    print('R2 Score:', fold_r2)

    score_r2.append(fold_r2)
    predict = model.predict(test)
    pred_list.append(predict)

In [None]:
# Mean R2 across the cross-validation folds.
np.asarray(score_r2).mean(axis = 0)

In [None]:
# Average the per-fold test predictions, then undo the target transform
# (inverse Box-Cox followed by removing the +1 shift).
mean_fold_pred = np.mean(pred_list, axis = 0)
pred5 = inv_boxcox(mean_fold_pred, 2.85009) - 1

submission = pd.DataFrame({'row_id' : testing_data['row_id'], 'engagement_score': pred5})
submission.to_csv('prediction.csv', index = False)

### Averaging

In [None]:
# Final blend: equal-weight average of the two cross-validated predictions.
# This write replaces the prediction.csv produced by the earlier cells.
p = 0.5 * (pred4 + pred5)

submission = pd.DataFrame({'row_id' : testing_data['row_id'], 'engagement_score': p})
submission.to_csv('prediction.csv', index = False)