In [None]:
import functools
import json
import pandas as pd
import numpy as np
import datetime

# Load and preprocess

In [None]:
# Download the two input JSON files with gdown.
# NOTE(review): the <ID_...> values look like placeholders for the real file ids — confirm before running.
!gdown <ID_data.json>
!gdown <ID_cover-vgg.json>

In [None]:
# Read the cover table from the working directory, which is where the
# `!gdown` cell writes its downloads. A relative path keeps the notebook
# runnable outside Colab (the previous '/content/...' path only exists there).
covers = pd.read_json('cover-vgg.json')
covers

In [None]:
# Read the main table from the working directory, which is where the
# `!gdown` cell writes its downloads. A relative path keeps the notebook
# runnable outside Colab (the previous '/content/...' path only exists there).
full = pd.read_json('data.json')
full

In [None]:
# Attach the cover table to the main table by matching on 'id'.
# how='inner' is the pandas default, spelled out here: rows without a
# matching id on both sides are dropped.
full = pd.merge(full, covers, on='id', how='inner')

In [None]:
# Keep only the rows whose 'cover' value is present; the copy detaches the
# result from the pre-filter frame.
full = full.dropna(subset=['cover']).copy()

In [None]:
# Number of rows left after the merge and the cover filter.
len(full)

In [None]:
# Collect every distinct category that appears in any row. Each cell of
# full['categories'] is iterated, so it must hold an iterable of labels.
categories = {cat for row_cats in full['categories'] for cat in row_cats}

# One 0/1 indicator column per category, named 'cat-<category>'.
# Iterating in sorted order keeps the column order (and so the feature order
# handed to the model) the same on every kernel start; plain set iteration
# order for strings can change between Python processes.
cat_indicators = pd.DataFrame({
    'cat-' + cat: full['categories'].apply(lambda row_cats: 1 if cat in row_cats else 0)
    for cat in sorted(categories)
})

# Attach all indicators with a single concat instead of inserting columns one
# at a time, which fragments the frame. Indicator columns left over from an
# earlier run of this cell are dropped first so a re-run cannot duplicate them.
full = pd.concat(
    [full.drop(columns=cat_indicators.columns, errors='ignore'), cat_indicators],
    axis=1,
)

In [None]:
import warnings

# Column-by-column inserts elsewhere in the notebook can still trigger pandas'
# fragmentation warning, so it stays silenced.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Collect every distinct tag that appears in any row. Each cell of
# full['tags'] is iterated, so it must hold an iterable of labels.
tags = {tag for row_tags in full['tags'] for tag in row_tags}

# One 0/1 indicator column per tag, named 'tag-<tag>'.
# Iterating in sorted order keeps the column order (and so the feature order
# handed to the model) the same on every kernel start; plain set iteration
# order for strings can change between Python processes.
tag_indicators = pd.DataFrame({
    'tag-' + tag: full['tags'].apply(lambda row_tags: 1 if tag in row_tags else 0)
    for tag in sorted(tags)
})

# Attach all indicators with a single concat instead of inserting columns one
# at a time. Indicator columns left over from an earlier run of this cell are
# dropped first so a re-run cannot duplicate them.
full = pd.concat(
    [full.drop(columns=tag_indicators.columns, errors='ignore'), tag_indicators],
    axis=1,
)

In [None]:
# Rows that carry a label form the supervised dataset; copied so later edits
# to df do not touch full.
df = full.dropna(subset=['label']).copy()

In [None]:
# Number of labelled rows available for training and evaluation.
len(df)

# Train

In [None]:
# Peek at the first twenty column names.
df.columns[:20]

In [None]:
# Hand-picked base features.
regularColumns = ['views', 'pages', 'chapters', 'score', 'votes', 'uploaded', 'cover']
# Indicator columns are recognised by the prefixes used when they were created.
catColumns = [col for col in df.columns if col.startswith('cat-')]
tagColumns = [col for col in df.columns if col.startswith('tag-')]
# Full feature list, in the order: base features, category flags, tag flags.
X = regularColumns + catColumns + tagColumns
# Target column, kept as a one-element list so df[y] is a one-column frame.
y = ['label']
# No feature is declared as categorical or as an embedding.
cat_features = []
embedding_features = []

In [None]:
from sklearn.model_selection import train_test_split
# 60% of the labelled rows go to training and the rest to the hold-out set;
# the fixed random_state makes the split repeatable.
train, test = train_test_split(df, train_size=0.6, random_state=42)

In [None]:
! pip install catboost
from catboost import CatBoostClassifier
from catboost import cv, Pool

In [None]:
# Feature-role options shared by both pools.
pool_options = dict(
    cat_features=cat_features,
    embedding_features=embedding_features,
)
# CatBoost datasets for the training and hold-out splits.
train_data = Pool(data=train[X], label=train[y], **pool_options)
test_data = Pool(data=test[X], label=test[y], **pool_options)

In [None]:
# Training settings shared by cross-validation and both model fits.
parameters = dict(
    cat_features=cat_features,
    embedding_features=embedding_features,
    eval_metric='AUC',
    loss_function='Logloss',
    verbose=100,
    random_seed=42,
    learning_rate=0.01,
)

In [None]:
# 5-fold cross-validation on the training pool with a fixed fold partition;
# stops once the metric has not improved for 200 rounds.
cv_data = cv(
    pool=train_data,
    params=parameters,
    fold_count=5,
    partition_random_seed=42,
    early_stopping_rounds=200,
    verbose=False,
)

In [None]:
# Find the first cross-validation row with the highest mean test AUC.
best_auc = cv_data['test-AUC-mean'].max()
best_iteration = cv_data.loc[cv_data['test-AUC-mean'] == best_auc, 'iterations'].iloc[0]
# The 'iterations' column is a zero-based iteration index, while the
# `iterations` argument of the classifier is a tree count. Add one so the
# model includes the best iteration (and never asks for zero trees).
n_iters = int(best_iteration) + 1
n_iters

In [None]:
# Train on the training pool only, using the iteration count picked from CV.
model = CatBoostClassifier(iterations=n_iters, **parameters)
model.fit(train_data)

# Analyze

In [None]:
import sklearn.metrics as metrics

def uplift(df, score, pct):
  """Return the lift in the top `pct` share of rows ranked by `score`.

  Rows are sorted by the `score` column, highest first. The sum of 'label'
  over the first round(len(df) * pct) rows is divided by the sum of 'label'
  over all rows, and that ratio is divided by `pct`.
  """
  total_positives = df['label'].sum()
  top_n = round(len(df) * pct)
  ranked_labels = df.sort_values(score, ascending=False)['label']
  captured_positives = ranked_labels.iloc[:top_n].sum()
  capture_rate = captured_positives / total_positives
  return capture_rate / pct

def auc(df, score):
  """Return the area under the ROC curve of the `score` column against 'label'."""
  false_pos_rate, true_pos_rate, _ = metrics.roc_curve(df['label'], df[score])
  return metrics.auc(false_pos_rate, true_pos_rate)

def print_metrics(df, score):
  """Print log loss, uplift at the top 20% and ROC AUC for the `score` column.

  Each metric is computed right before it is printed, in that order.
  """
  reporters = (
    ("log_loss", lambda: metrics.log_loss(df['label'], df[score])),
    ("uplift", lambda: uplift(df, score, 0.2)),
    ("auc", lambda: auc(df, score)),
  )
  for metric_name, compute in reporters:
    print(metric_name, compute())

In [None]:
# Score the hold-out rows with the probability of the positive class.
# `test` comes out of train_test_split as a selection from df, and assigning
# a column straight into it can raise SettingWithCopyWarning; assign() builds
# a fresh frame instead.
test = test.assign(label_pred_score=model.predict_proba(test[X])[:, 1])
print_metrics(test, 'label_pred_score')

In [None]:
# Show the first twenty rows of the model's feature-importance table.
model.get_feature_importance(prettified=True).head(20)

In [None]:
# make_classification, LogisticRegression and train_test_split are not used in
# this cell; the imports are kept so the notebook namespace stays unchanged.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from matplotlib import pyplot

# ROC curve of the hold-out predictions. fpr, tpr and thresholds stay
# notebook-level names because the threshold-selection cell reads them.
fpr, tpr, thresholds = roc_curve(test['label'], test['label_pred_score'])

fig, ax = pyplot.subplots()
# Diagonal reference line of a classifier with no skill.
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
# The model being evaluated is the CatBoost classifier, not a logistic model.
ax.plot(fpr, tpr, marker='.', label='CatBoost')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curve on the hold-out split')
ax.legend()
pyplot.show()

In [None]:
from numpy import argmax, sqrt

# Geometric mean of the true-positive rate and the true-negative rate
# (1 - fpr) at every ROC threshold.
gmeans = sqrt(tpr * (1 - fpr))
# Position of the threshold with the largest geometric mean.
ix = argmax(gmeans)
best_threshold, best_gmean = thresholds[ix], gmeans[ix]
print('Best Threshold=%f, G-Mean=%.3f' % (best_threshold, best_gmean))

# Predict

In [None]:
# Refit with the same settings on every labelled row (train and hold-out
# together) before scoring the unlabelled rows.
model = CatBoostClassifier(iterations=n_iters, **parameters)
model.fit(X=df[X], y=df[y])

In [None]:
# Rows without a label are the ones to predict; copied so new columns do not
# touch full.
is_unlabelled = full['label'].isna()
test_df = full.loc[is_unlabelled].copy()
len(test_df)

In [None]:
# Column 1 of predict_proba is stored as the prediction score of each
# unlabelled row.
class_probabilities = model.predict_proba(test_df[X])
test_df['score_pred'] = class_probabilities[:, 1]
# Distribution of the predicted scores.
test_df['score_pred'].hist()

In [None]:
# The twenty unlabelled rows with the highest predicted score, best first.
test_df.sort_values(by=['score_pred'], ascending=False).head(20)

In [None]:
# Save the names of the twenty highest-scoring unlabelled rows as a JSON list.
with open("result.json", "w") as outfile:
    ranked = test_df.sort_values(by=['score_pred'], ascending=False)
    top_names = ranked.head(20)['name']
    json.dump(list(top_names), outfile)