# YouTube trending videos

In [None]:
# import sys
# sys.path.append("..")

from numpy import nan
%matplotlib inline
import random
import xgboost as xgb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from lime.lime_tabular import LimeTabularExplainer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import recall_score, f1_score, precision_score, plot_roc_curve, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from helpers.files import load_csv

# Background colour applied to every matplotlib figure created in this notebook.
plt.rcParams["figure.facecolor"] = "#a9a9a9"
# Seed the stdlib and the NumPy global random generators.
random.seed(0)
np.random.seed(0)


def print_stats(clf, x_train, y_train, x_test, y_test):
    """Print classification metrics and draw ROC curves for both data splits.

    For the training split and then the test split, prints accuracy,
    precision, recall and F1 of ``clf``'s predictions and adds the split's
    ROC curve to a shared axes. The figure is shown once both curves are drawn.

    Args:
        clf: Fitted classifier exposing ``predict`` (and whatever
            ``plot_roc_curve`` needs to score samples).
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
    """
    # Both ROC curves share the current axes so they can be compared directly.
    roc_ax = plt.gca()
    splits = (("Train", x_train, y_train), ("Test", x_test, y_test))
    for split, features, labels in splits:
        predicted = clf.predict(features)
        print(f"{split} accuracy: {accuracy_score(labels, predicted)}")
        print(f"{split} precision: {precision_score(labels, predicted)}")
        print(f"{split} recall: {recall_score(labels, predicted)}")
        print(f"{split} F1: {f1_score(labels, predicted)}\n")
        plot_roc_curve(clf, features, labels, name=split, ax=roc_ax)
    plt.show()

### Wczytanie danych

In [None]:
# Switch between the pre-filtered dataset and the unfiltered one.
load_filtered = True
dataset_name = "ped6_filtered_data" if load_filtered else "ped6_data"
# Only the first element of load_csv's result is used here.
videos = load_csv(dataset_name)[0]

# Shuffle the rows. The explicit random_state makes this cell give the same
# order on every re-run, independent of how much global RNG state was consumed.
videos = videos.sample(frac=1, random_state=0).reset_index(drop=True)
print(len(videos))
# Last expression of the cell, so the preview is actually rendered.
videos.head(5)

### Przygotowanie danych

In [None]:
# Target column and a quick look at its class distribution.
y = videos["trending"]
plt.hist(y)
# Every remaining column is used as a feature.
x = videos.drop(columns="trending")

# Stratified 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=0
)

# Column means are computed on the training split only and reused to fill
# missing values in both splits.
x_train_stats = x_train.mean()
x_train, x_test = (part.fillna(x_train_stats) for part in (x_train, x_test))

# The scaler is fitted on the training split only. Wrapping the resulting
# arrays in new DataFrames also gives both splits a fresh 0..n-1 index.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train), columns=x.columns)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x.columns)

for split_name, split_frame in (("Train", x_train), ("Test", x_test)):
    print(f"{split_name}: {split_frame.shape}")
x_train.head(5)

### Wybór miar oceny klasyfikatorów

Accuracy
TODO wyjaśnij wybór.

### Wybór pierwszego klasyfikatora

RandomForest
TODO uzasadnij wybór.

In [None]:
# Search space: number of chi2-selected features plus the main forest
# hyper-parameters (5 * 3 * 2 * 4 = 120 candidates, each evaluated with 10 folds).
param_grid = {
    "reduce_dim__k": [5, 10, 20, 25, 30],
    "classifier__max_depth": [4, 8, 10],
    "classifier__min_samples_leaf": [1, 2],
    "classifier__n_estimators": [10, 50, 100, 200],
}

# Feature selection is part of the pipeline, so it is re-fitted inside every CV fold.
pipeline = Pipeline([
    ("reduce_dim", SelectKBest(chi2)),
    # Fixed seed: the fits run in parallel workers (n_jobs=8), so the forest is
    # seeded explicitly rather than relying on the notebook-level np.random.seed.
    ("classifier", RandomForestClassifier(random_state=0)),
])

# Candidates are ranked by F1.
grid_search = GridSearchCV(pipeline, n_jobs=8, param_grid=param_grid, cv=10, verbose=1, scoring="f1")
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)


### Uczenie pierwszego klasyfikatora

In [None]:
# Best hyper-parameters found by the grid search.
k = grid_search.best_params_["reduce_dim__k"]
max_depth = grid_search.best_params_["classifier__max_depth"]
min_samples_leaf = grid_search.best_params_["classifier__min_samples_leaf"]
n_estimators = grid_search.best_params_["classifier__n_estimators"]

# Re-fit the chi2 selector outside the pipeline so the selected columns can be
# kept as named DataFrames for the explainers.
select = SelectKBest(chi2, k=k)

x_train_selected = select.fit_transform(x_train, y_train)
x_test_selected = select.transform(x_test)

# Boolean mask over the input columns -> names of the columns that were kept.
mask = select.get_support()
new_features = x_train.columns[mask].tolist()

x_train_selected = pd.DataFrame(x_train_selected, columns=new_features)
x_test_selected = pd.DataFrame(x_test_selected, columns=new_features)

# Fixed seed so re-running this cell rebuilds the same forest.
rf = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    random_state=0,
)
rf.fit(x_train_selected, y_train)


### Testowanie pierwszego klasyfikatora

In [None]:
# Train/test metrics and ROC curves for the tuned random forest.
print_stats(rf, x_train_selected, y_train, x_test_selected, y_test)


def rf_pred(x):
    """Return the forest's class probabilities for ``x`` cast to float."""
    return rf.predict_proba(x).astype(float)


### Interpretacja predykcji pierwszego klasyfikatora

In [None]:
# SHAP: tree explainer for the random forest, evaluated on both splits.
# The result is indexed with [1] in the summary-plot cells further down.
explainerRF = shap.TreeExplainer(rf)

shap_values_RF_test = explainerRF.shap_values(x_test_selected, approximate=True)
shap_values_RF_train = explainerRF.shap_values(x_train_selected, approximate=True)

# LIME: tabular explainer built from the selected training features.
# training_labels must be the per-row labels of the training data, not the
# list of distinct classes.
explainer = LimeTabularExplainer(
    x_train_selected.values,
    feature_names=x_train_selected.columns.values.tolist(),
    training_labels=y_train.to_numpy(),
    class_names=['Non trending', "Trending"],
    verbose=True,
)

# Index of the test row to explain.
j = 0

exp = explainer.explain_instance(x_test_selected.loc[[j]].values[0], rf_pred)
exp.save_to_file("lime.html")
fig = exp.as_pyplot_figure()
# plt.show() takes no figure argument; it renders the current figure(s).
plt.show()

In [None]:
# Index label of the test row explained in the next cell; the row is displayed as a one-row frame.
j = 123
x_test_selected.loc[[j]]

In [None]:
# One-row frame for the currently selected test sample.
selected_row = x_test_selected.loc[[j]]

# Label of that sample, looked up by position in y_test.
print(y_test.to_numpy()[j])
shap_val = explainerRF.shap_values(selected_row, approximate=True)
print(np.array(shap_val).shape)

# initialize js for SHAP
shap.initjs()
# Force plot built from index 1 of the explainer output
# (presumably the "Trending" class - confirm against the label encoding).
shap.force_plot(explainerRF.expected_value[1], shap_val[1], selected_row)


In [None]:
# Index label of the test row explained in the next cell; the row is displayed as a one-row frame.
j = 2000
x_test_selected.loc[[j]]

In [None]:
# One-row frame for the currently selected test sample.
selected_row = x_test_selected.loc[[j]]

# Label of that sample, looked up by position in y_test.
print(y_test.to_numpy()[j])
shap_val = explainerRF.shap_values(selected_row, approximate=True)
print(np.array(shap_val).shape)

# initialize js for SHAP
shap.initjs()
# Force plot built from index 1 of the explainer output
# (presumably the "Trending" class - confirm against the label encoding).
shap.force_plot(explainerRF.expected_value[1], shap_val[1], selected_row)


In [None]:
# Summary plot of the SHAP output computed above for the single selected test row.
shap.summary_plot(shap_val, x_test_selected.loc[[j]])

### Interpretacja treningowych danych

In [None]:
# Summary plot over the training split, using index 1 of the SHAP output
# (presumably the "Trending" class - confirm against the label encoding).
shap.summary_plot(shap_values_RF_train[1], x_train_selected)

### Interpretacja testowych danych

In [None]:
# Summary plot over the test split, using index 1 of the SHAP output
# (presumably the "Trending" class - confirm against the label encoding).
shap.summary_plot(shap_values_RF_test[1], x_test_selected)

# TODO consider filtering the data
# TODO consider filtering out other languages

### Wybór drugiego klasyfikatora

Jako drugi klasyfikator został wybrany XGBoost, ponieważ jest znany z tego, że osiąga dobre wyniki (nawet na niezbalansowanych danych), a także jest bardzo dobrze wyjaśnialny dzięki temu, że w swojej mechanice ma zawarte ważności cech, oraz przez zastosowanie boostingu i regularyzacji wewnątrz modelu.

### Testowanie klasyfikatora

In [None]:
# Baseline XGBoost model trained on all features with default hyper-parameters.
xgbclf_base = xgb.XGBClassifier(n_estimators=1000, use_label_encoder=False, verbosity=0)

print(xgbclf_base.fit(x_train, y_train))

y_pred = xgbclf_base.predict(x_test)
# target_names are matched to the labels in sorted order, so the name for
# label 0 comes first. Assumes 1 == trending, consistent with the LIME
# class_names used in this notebook.
print(classification_report(y_test, y_pred, target_names=['non-trending', 'trending']))

### Uczenie drugiego klasyfikatora

**Selekcja cech**

Selekcja cech została przeprowadzona poprzez analizę spadku trafności wraz z ograniczaniem liczby cech modelu.

In [None]:
from sklearn.feature_selection import SelectFromModel

# Every feature importance of the baseline model, ascending. The full list
# (duplicates included) is kept because a later cell indexes into it.
thresholds = sorted(xgbclf_base.feature_importances_)

# Reference score: a small model trained on all features.
selection_model = xgb.XGBClassifier(n_estimators=10, use_label_encoder=False, verbosity=0)
selection_model.fit(x_train, y_train)
print("Base accuracy: %.2f%%" % (accuracy_score(y_test, selection_model.predict(x_test)) * 100))

# Equal importance values select exactly the same feature subset, so each
# distinct threshold is evaluated only once.
# NOTE(review): accuracy is measured on the test split, so the feature count
# chosen from this sweep is tuned on test data.
for thresh in sorted(set(thresholds)):
    # Keep the features whose baseline importance is at least `thresh`.
    selection = SelectFromModel(xgbclf_base, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)

    selection_model = xgb.XGBClassifier(n_estimators=10, use_label_encoder=False, verbosity=0)
    selection_model.fit(select_x_train, y_train)

    y_pred = selection_model.predict(select_x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_x_train.shape[1], accuracy * 100.0))

Jako rozsądną wartość przyjęto 51 – do tylu została ograniczona liczba cech.

### Interpretacja predykcji

In [None]:
# Number of features to keep, chosen from the threshold sweep above.
N_FEATURES = 51

# `thresholds` is sorted ascending, so the N-th value from the end is the
# importance of the N-th most important feature. Clamped so a dataset with
# fewer features does not raise IndexError.
thresh = thresholds[-min(N_FEATURES, len(thresholds))]
selection = SelectFromModel(xgbclf_base, threshold=thresh, prefit=True)
x_train_selected = selection.transform(x_train)

# `verbosity` is the XGBoost parameter used everywhere else in this notebook;
# `verbose` is not the same setting.
selection_model = xgb.XGBClassifier(n_estimators=10, use_label_encoder=False, verbosity=0)
selection_model.fit(x_train_selected, y_train)

x_test_selected = selection.transform(x_test)
y_pred = selection_model.predict(x_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, x_train_selected.shape[1], accuracy * 100.0))

# Boolean mask over the input columns -> names of the columns that were kept.
mask = selection.get_support()
new_features = [feature for supported, feature in zip(mask, x.columns.values) if supported]

# Wrap the selected arrays in DataFrames so the explainers see column names.
x_train_selected = pd.DataFrame(x_train_selected, columns=new_features)
x_test_selected = pd.DataFrame(x_test_selected, columns=new_features)

**Strojenie parametrów**

Strojenie odbywa się na modelu o mniejszej liczbie estymatorów (ze względu na czas obliczeń).

In [None]:
# Hyper-parameter grid for XGBoost (3 * 3 * 4 = 36 candidates).
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0, 2, 5],
    max_depth=[3, 6, 9, 15],
)

# Each model runs single-threaded (nthread=1); the grid search itself runs
# with n_jobs=8.
xgbclf_par = xgb.XGBClassifier(
    n_estimators=10,
    use_label_encoder=False,
    nthread=1,
    verbosity=0,
)
grid_search = GridSearchCV(
    xgbclf_par,
    param_grid=params,
    cv=3,
    n_jobs=8,
    verbose=1,
)
grid_search.fit(x_train_selected, y_train)
print(grid_search.best_params_)

In [None]:
# Mean cross-validated test score of every parameter combination, then the best score.
print(grid_search.cv_results_['mean_test_score'])
print(grid_search.best_score_)

### Uczenie drugiego klasyfikatora

In [None]:
# Pull the tuned values out of the grid search result.
min_child_weight, max_depth, gamma = (
    grid_search.best_params_[param_name]
    for param_name in ("min_child_weight", "max_depth", "gamma")
)

# Final model: the tuned parameters with the full number of estimators.
xgbclf = xgb.XGBClassifier(
    n_estimators=1000,
    use_label_encoder=False,
    nthread=-1,
    verbosity=0,
    objective='binary:logistic',
    min_child_weight=min_child_weight,
    max_depth=max_depth,
    gamma=gamma,
)

xgbclf.fit(x_train_selected, y_train)

In [None]:
# Train/test metrics and ROC curves for the tuned XGBoost model.
print_stats(xgbclf, x_train_selected, y_train, x_test_selected, y_test)
y_pred = xgbclf.predict(x_test_selected)
# target_names are matched to the labels in sorted order, so the name for
# label 0 comes first. Assumes 1 == trending, consistent with the LIME
# class_names used in this notebook.
print("Test results: \n", classification_report(y_test, y_pred, target_names=['non-trending', 'trending']))

Ważność atrybutów wg miary weight pokazuje, jak często dane cechy pojawiają się w drzewach.

In [None]:
# Feature importance by "weight", limited to 50 features.
fig, ax = plt.subplots(figsize=(10, 12))
xgb.plot_importance(
    xgbclf,
    ax=ax,
    importance_type='weight',
    max_num_features=50,
    height=0.5,
)
plt.show()

Ważność atrybutów wg miary gain oznacza, jak dużą część trafności wnosi dana cecha.

In [None]:
# Feature importance by "gain", limited to 50 features.
fig, ax = plt.subplots(figsize=(10, 12))
xgb.plot_importance(
    xgbclf,
    ax=ax,
    importance_type='gain',
    max_num_features=50,
    height=0.5,
)
plt.show()

In [None]:
# NOTE(review): this cell draws a second "gain" importance plot with the same
# arguments as the preceding cell - confirm whether another importance_type
# was intended.
fig, ax = plt.subplots(figsize=(10, 12))
xgb.plot_importance(
    xgbclf,
    ax=ax,
    importance_type='gain',
    max_num_features=50,
    height=0.5,
)
plt.show()

In [None]:
# Small 10-estimator model trained on all features.
# NOTE(review): this rebinds `xgbclf_base`, which earlier cells use as the
# 1000-estimator baseline for feature selection; re-running those cells after
# this one will pick up this smaller model instead.
xgbclf_base = xgb.XGBClassifier(n_estimators=10, use_label_encoder=False, verbosity=0)

print(xgbclf_base.fit(x_train, y_train))

In [37]:
# Draw the first tree of the model on its own large figure. The size is set on
# this figure only: a global 500x500-inch rcParams default would apply to every
# later figure and needs a multi-gigabyte canvas to render.
fig, ax = plt.subplots(figsize=(50, 50))
xgb.plot_tree(xgbclf_base, num_trees=0, ax=ax)
plt.show()

### LIME i SHAP

In [None]:
# Default figure size for the plots that follow.
plt.rcParams['figure.figsize'] = [8, 6]


def xgb_pred(x):
    """Return the XGBoost model's class probabilities for ``x`` cast to float."""
    return xgbclf.predict_proba(x).astype(float)


# LIME explainer for the XGBoost feature subset. The feature names must come
# from the same frame as the training data: `x_train` still holds all columns,
# so using its names would mislabel the selected features.
# training_labels must be the per-row labels of the training data.
explainer = LimeTabularExplainer(
    x_train_selected.values,
    feature_names=x_train_selected.columns.values.tolist(),
    training_labels=y_train.to_numpy(),
    class_names=['Non trending', "Trending"],
    verbose=True)

# Index of the test row to explain.
j = 0

exp = explainer.explain_instance(x_test_selected.loc[[j]].values[0], xgb_pred)
exp.save_to_file("lime2.html")
fig = exp.as_pyplot_figure()
# plt.show() takes no figure argument; it renders the current figure(s).
plt.show()

In [None]:
# SHAP values are computed on a fixed random sample of 100 test rows.
x_sampled = x_test_selected.sample(100, random_state=0)
explainerXgb = shap.TreeExplainer(xgbclf)
shap_values = explainerXgb.shap_values(x_sampled)

In [None]:
# Force plot pairing shap_values[1] with the sampled row at position 1
# (presumably shap_values is indexed by row here - confirm its shape).
shap.force_plot(explainerXgb.expected_value, shap_values[1], x_sampled.iloc[1])

In [None]:
# Explain only the training row that is plotted (position 1) instead of
# computing SHAP values for the whole training set. Local names are used so
# the `shap_values` computed for the test sample is not overwritten.
# Assumes shap_values() returns one row of values per input row, as the
# row-wise indexing in the neighbouring cells implies.
train_row = x_train_selected.iloc[[1]]
train_row_shap = explainerXgb.shap_values(train_row)
shap.force_plot(explainerXgb.expected_value, train_row_shap[0], train_row.iloc[0])

In [None]:
# Recompute SHAP values for the sampled test rows (so `shap_values` matches
# `x_sampled` again) and draw the summary plot.
shap_values = explainerXgb.shap_values(x_sampled)
shap.summary_plot(shap_values, x_sampled)

In [None]:
# Same SHAP values drawn as a bar chart.
shap.summary_plot(shap_values, x_sampled, plot_type="bar")

### Testowanie drugiego klasyfikatora

### Interpretacja predykcji drugiego klasyfikatora

### Porównanie wyników klasyfikatorów

### Wiedza dla klienta

Profil charakterystycznych wartości atrybutów dla klasy trending

Co trzeba robić?

Czego się wystrzegać?