# YouTube trending videos

In [1]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


import sys
sys.path.append("..")


import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
from itertools import product
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.cluster import KMeans
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import accuracy_score, f1_score, precision_score, silhouette_score, roc_auc_score, recall_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.semi_supervised import LabelSpreading

from helpers.categories import get_categories_dict
from helpers.files import load_csv

random.seed(0)
np.random.seed(0)

## Etap 4 - Wykorzystanie uczenia pół-nadzorowanego


Deklaracja potrzebnych funkcji

In [2]:

def show_stats(y_hat, y_hat_nans, y, mapped: bool = False):
    if not mapped:
        mapping_dict = get_mapping_dict(y_hat_nans, y)
        print(mapping_dict)
        y_mapped = pd.Series(y).apply(lambda x: mapping_dict[x])
    else:
        y_mapped = pd.Series(y.astype(np.int))
        y_hat = y_hat.astype(np.int)
    print(f"Accuracy: {accuracy_score(y_hat, y_mapped)}")
    print(f"F1: {f1_score(y_hat, y_mapped, average='macro')}")
    print(f"Precision: {precision_score(y_hat, y_mapped, average='macro')}")
    print(f"Recall: {recall_score(y_hat, y_mapped, average='macro')}")
    print(f"Hat unique:{np.unique(y_hat.to_numpy())}")
    print(f"labeled unique: {np.unique(y_mapped.to_numpy())}")
    print(f"y_hat_nans : {np.unique(y_hat_nans[y_hat_nans.notna()].to_numpy())}")
    y_hat = y_hat.to_numpy().reshape(-1, 1)
    y_mapped = y_mapped.to_numpy().reshape(-1, 1)
    encoder = OneHotEncoder(sparse=False)
    one_hot_hat = encoder.fit_transform(y_hat)
    y_hot = encoder.transform(y_mapped)
    print(f"ROC AUC Score OVR: {roc_auc_score(one_hot_hat, y_hot, average='micro', multi_class='ovr')}")
    print(f"ROC AUC Score OVO: {roc_auc_score(one_hot_hat, y_hot, multi_class='ovo')}")

### Wyczytanie danych

In [3]:
gb_data, us_data = load_csv("clustering_data")

videos = pd.concat([gb_data, us_data])
# inconsistency in data
videos["category_id"] = videos["category_id"].replace(43.0, 24.0)

categories_ids = videos["new_category_id"].dropna().unique().tolist()

### Wczytanie nazw kategorii

In [4]:
categories_dict = get_categories_dict()
categories = [categories_dict[cat] for cat in categories_ids]

## Usunięcie danych tekstowych

In [5]:
not_nan_bool = videos["new_category_id"].notna().reset_index(drop=True)

videos = videos[videos["new_category_id"].notna()]
videos = videos.reset_index(drop=True)
y_hat = videos["new_category_id"]  # oczekiwane kategorie z api
y_hat_nans = videos["category_id"]  # oczekiwane kategorie z nanami (z oryginalnego zbioru)
videos_no_nan_categories = videos[videos["category_id"].notna()]
selected_columns = [
    "views", "likes", "dislikes", "comment_count", "description_len", "title_len", "channel_title_len",
    "publish_time_day_of_week", "publish_time_hour_of_day",
    "gray_mean_score", "color_mean_score", "gray_hist_score",
    "red_hist_score", "green_hist_score", "blue_hist_score", "edges_score", "entropy_score",
]

for cat in categories_ids:
    selected_columns.append(f"freq_channel_titles_{categories_dict[cat]}")
    selected_columns.append(f"freq_titles_{categories_dict[cat]}")
    selected_columns.append(f"freq_tags_{categories_dict[cat]}")
    selected_columns.append(f"freq_descriptions_{categories_dict[cat]}")

videos = videos[selected_columns]

### Uzupełnienie wartości pustych wartościami średnimi


In [6]:
videos = videos.fillna(videos.mean())

### Skalowanie danych

In [7]:
scaler = MinMaxScaler()
videos = scaler.fit_transform(videos)

### Przygotowanie zbioru z oznaczonymi kategoriami

In [8]:
x_not_nan = videos[y_hat_nans.notna()]
y_not_nan = y_hat_nans[y_hat_nans.notna()]

x_only_nan = videos[y_hat_nans.isna()]
y_only_nan = y_hat[y_hat_nans.isna()]

### Constrained K-means

In [9]:
def get_mapping_dict(y_hat_nans, y_pred) -> dict:
    values_counts = y_hat_nans.value_counts().reset_index(name="count")
    # print(values_counts)
    # print(f"Index: {values_counts.index}")
    most_frequent_class = values_counts["index"].iloc[0]
    # print(most_frequent_class)
    mapping_dict = {}
    y_data = pd.DataFrame()
    y_data["y_hat_nans"] = y_hat_nans
    y_data["y"] = y_pred
    y_data["c"] = 1
    y_data = y_data.groupby(["y", "y_hat_nans"])["c"].sum()  #.max(level=[0])#.sort_values().groupby(level=0)
    # print(y_data)
    out = y_data.loc[y_data.groupby(level=0).idxmax()]
    # print(out)
    for row in out.index:
        mapping_dict[row[0]] = row[1]
    for i in range(np.unique(y_pred).shape[0]):
        if i not in mapping_dict.keys():
            mapping_dict[i] = most_frequent_class
    return mapping_dict



In [10]:

groups_index = np.unique(y_not_nan)
groups_dict = {e: i for i, e in enumerate(groups_index)}
num_groups = len(groups_index)

groups = [[] for _ in range(num_groups)]
for xp, yp in zip(x_not_nan, y_not_nan):
    groups[groups_dict[yp]].append(xp)

centroids = []
for g in groups:
    centroids.append(np.mean(g, axis=0))

centroids = np.array(centroids)


Losowe K-means z wykorzystaniem wiedzy o przybliżonych pozycjach centroidów

In [165]:
best_model = None
best_score = 0.0
for itr in range(300):
    centroids_prim = centroids + np.random.normal(scale = (np.max(centroids) - np.min(centroids))/100,size = centroids.shape)
    model = KMeans(n_clusters=num_groups, init=centroids_prim, n_init=1)
    y = model.fit_predict(videos)
    mapping_dict = get_mapping_dict(y_hat_nans, y)
    y_mapped = pd.Series(y).apply(lambda x: mapping_dict[x])
    score = f1_score(y_hat, y_mapped, average='macro')
    # print(score)
    if best_score < score:
        best_score = score
        best_model = model
labels = best_model.predict(videos)
show_stats(y_hat, y_hat_nans, labels)

{0: 1.0, 1: 19.0, 2: 10.0, 3: 15.0, 4: 17.0, 5: 25.0, 6: 10.0, 7: 24.0, 8: 24.0, 9: 24.0, 10: 25.0, 11: 24.0, 12: 27.0, 13: 28.0, 14: 24.0}
Accuracy: 0.43319977103606183
F1: 0.2925408018857068
Precision: 0.38351892318903374
Recall: 0.2752732104847086
Hat unique:[ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
labeled unique: [ 1. 10. 15. 17. 19. 24. 25. 27. 28.]
y_hat_nans : [ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
ROC AUC Score OVR: 0.6963570201978903
ROC AUC Score OVO: 0.6132347296313955


Semi supervised Constrained KMeans (punkt 3.1 z https://arxiv.org/pdf/1806.01547v2.pdf)

In [164]:
new_centroids = centroids[:]
for itr in range(300):
    model = KMeans(n_clusters=num_groups, init=new_centroids, n_init=1, max_iter=1)
    labels = model.fit_predict(x_only_nan)
    new_groups = [[] for _ in range(num_groups)]
    for i, l in enumerate(labels):
        new_groups[l].append(x_only_nan[i])

    new_centroids = []
    for i, g in enumerate(new_groups):
        g = g + groups[i]
        new_centroids.append(np.mean(g, axis=0))
    new_centroids = np.array(new_centroids)

y = model.predict(videos)
for i,e in zip(range(len(y)),y_hat_nans.notna()):
    if e == True:
        y[i] = y_hat_nans[i]
show_stats(y_hat, y_hat_nans, y)


{1: 1.0, 2: 2.0, 10: 10.0, 15: 15.0, 17: 17.0, 19: 19.0, 20: 20.0, 22: 22.0, 23: 23.0, 24: 24.0, 25: 25.0, 26: 26.0, 27: 27.0, 28: 28.0, 29: 29.0, 0: 10.0, 3: 10.0, 4: 10.0, 5: 10.0, 6: 10.0, 7: 10.0, 8: 10.0, 9: 10.0, 11: 10.0, 12: 10.0, 13: 10.0, 14: 10.0, 16: 10.0, 18: 10.0, 21: 10.0}
Accuracy: 0.38946765884373213
F1: 0.4171949647412629
Precision: 0.8526385183407648
Recall: 0.3277318087400635
Hat unique:[ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
labeled unique: [ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
y_hat_nans : [ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
ROC AUC Score OVR: 0.6729291029519995
ROC AUC Score OVO: 0.639332942495129


### Label Spreading

Zastosowanie tej metody jest uargumentowane występowaniem więcej niż jednego elementu
posiadającego przypisaną kategorię dla każdej z nich. W przeciwieństwie do algorytmu
K-means tutaj przypisywanie kategorii kolejnym elementom opiera się na propagacji
ich na podstawie najbliższych sąsiadów (jest to zmodyfikowana metoda *Label Propagation*).

**Przygotowanie parametrów**

In [166]:
parameters = {'k': np.arange(5, x_not_nan.shape[1], 5), 'kernel': ["knn", "rbf"], 'gamma': [1, 10, 20, 30, 40],
              'n_neighbors': [3, 5, 7, 11]}
params = list(product(*parameters.values()))

**Funkcja do wyświetlania statystyk**

In [167]:
x_not_nan_train, x_not_nan_test, y_not_nan_train, y_not_nan_test = train_test_split(x_not_nan, y_not_nan.to_numpy())

def show_stats_labels(y_hat, y):
    print(f"Accuracy: {accuracy_score(y_hat, y)}")
    print(f"F1: {f1_score(y_hat, y, average='macro')}")
    print(f"Precision: {precision_score(y_hat, y, average='macro')}")
    print(f"Recall: {recall_score(y_hat, y, average='macro')}")
    y_hat = y_hat.reshape(-1, 1)
    y = y.reshape(-1, 1)
    encoder = OneHotEncoder(sparse=False)
    one_hot_hat = encoder.fit_transform(y_hat)
    y_hot = encoder.transform(y)
    print(f"ROC AUC Score OVR: {roc_auc_score(one_hot_hat, y_hot, multi_class='ovr')}")
    print(f"ROC AUC Score OVO: {roc_auc_score(one_hot_hat, y_hot, multi_class='ovo')}")

**Strojenie parametrów**

Parametry stroimy przy użyciu walidacji krzyżowej. Jako danych
używamy pełnego zbioru filmów, które w oryginalnym zestawie
danych miały przypisane kategorie filmów. Miara F1 jest
używana do oceny wytrenowanych modeli, która następnie
uśredniana jest po 10 przebiegach walidacji.


In [168]:
best_score = 0
best_params = None
scores = []

k_splits = 10
cv = StratifiedKFold(n_splits=k_splits)

for p in tqdm(params):
    k, kernel, g, n = p
    select = SelectKBest(chi2, k=k)
    select.fit(x_not_nan_train, y_not_nan_train)
    x = select.transform(x_not_nan_train)
    mean_score = 0
    for i, (train, val) in enumerate(cv.split(x, y_not_nan_train)):
        x_train = x[train]
        y_train = y_not_nan_train[train]
        x_val = x[val]
        y_val = y_not_nan_train[val]
        model = LabelSpreading(kernel=kernel, gamma=g, n_neighbors=n)
        model.fit(x_train, y_train)
        y = model.predict(x_val)
        score = f1_score(y_val, y, average="macro")
        mean_score += score
    mean_score /= k_splits
    scores.append((mean_score, p))
    if mean_score > best_score:
        best_score = mean_score
        best_params = p


print(f"Best params: {best_params}")
print(f"BestScore: {best_score}")
print(f"Scores: {scores}")


HBox(children=(FloatProgress(value=0.0, max=600.0), HTML(value='')))


Best params: (30, 'rbf', 40, 3)
BestScore: 0.6221276527777467
Scores: [(0.21386724473288896, (5, 'knn', 1, 3)), (0.20636088164238328, (5, 'knn', 1, 5)), (0.21996867326510383, (5, 'knn', 1, 7)), (0.21752790721077703, (5, 'knn', 1, 11)), (0.21386724473288896, (5, 'knn', 10, 3)), (0.20636088164238328, (5, 'knn', 10, 5)), (0.21996867326510383, (5, 'knn', 10, 7)), (0.21752790721077703, (5, 'knn', 10, 11)), (0.21386724473288896, (5, 'knn', 20, 3)), (0.20636088164238328, (5, 'knn', 20, 5)), (0.21996867326510383, (5, 'knn', 20, 7)), (0.21752790721077703, (5, 'knn', 20, 11)), (0.21386724473288896, (5, 'knn', 30, 3)), (0.20636088164238328, (5, 'knn', 30, 5)), (0.21996867326510383, (5, 'knn', 30, 7)), (0.21752790721077703, (5, 'knn', 30, 11)), (0.21386724473288896, (5, 'knn', 40, 3)), (0.20636088164238328, (5, 'knn', 40, 5)), (0.21996867326510383, (5, 'knn', 40, 7)), (0.21752790721077703, (5, 'knn', 40, 11)), (0.02731757517926724, (5, 'rbf', 1, 3)), (0.02731757517926724, (5, 'rbf', 1, 5)), (0.02

**Trenowanie i testowanie ostatecznego modelu**

In [169]:
k, kernel, g, n = best_params

select = SelectKBest(chi2, k=k)
select.fit(x_not_nan_train, y_not_nan_train)
x_not_nan_train = select.transform(x_not_nan_train)
x_not_nan_test = select.transform(x_not_nan_test)
x_only_nan_unlabeled = select.transform(x_only_nan)
model = LabelSpreading(kernel=kernel, gamma=g, n_neighbors=n)
model.fit(x_not_nan_train, y_not_nan_train)

print("Test results")
y_test = model.predict(x_not_nan_test)
show_stats_labels(y_not_nan_test, y_test)
print("\n\n")
print("Unlabeled results")
y_unlabeled = model.predict(x_only_nan_unlabeled)
show_stats_labels(y_only_nan.to_numpy(), y_unlabeled)

Test results
Accuracy: 0.6681286549707602
F1: 0.6760567080177909
Precision: 0.8646619676174947
Recall: 0.6009032279172944
ROC AUC Score OVR: 0.7864588394322053
ROC AUC Score OVO: 0.7864588394322053



Unlabeled results
Accuracy: 0.6221259580139953
F1: 0.5068458786290019
Precision: 0.6905857610968378
Recall: 0.4465149363170359
ROC AUC Score OVR: 0.707268400974978
ROC AUC Score OVO: 0.707268400974978


Model uzyskał trafność na poziomie 62%. Podobny wynik został
osiągnięty na kryterium precyzji (około 69%). Wyniki te wskazują, że
klasyfikator nie przydziela po prostu najliczniejszej z kategorii.
Niestety wynik na kryterium *recall* wyniósł tylko około 44%.

**Dodatnie TfIdf**

In [170]:
from ped4.tfidf import load_videos_with_tf_idf
videos = load_videos_with_tf_idf()

videos_not_nan = videos[y_hat_nans.notna()]
y_not_nan = y_hat_nans[y_hat_nans.notna()]

videos_only_nan = videos[y_hat_nans.isna()]

videos_not_nan_train, videos_not_nan_test, y_not_nan_train, y_not_nan_test \
    = train_test_split(videos_not_nan, y_not_nan.to_numpy())

parameters = {'k': np.arange(5, 100, 5), 'kernel': ["knn", "rbf"], 'gamma': [1, 10, 20, 30, 40],
              'n_neighbors': [3, 5, 7, 11]}

params = list(product(*parameters.values()))

best_score = 0
best_params = None
scores = []

k_splits = 10
cv = StratifiedKFold(n_splits=k_splits)

for p in tqdm(params):
    k, kernel, g, n = p
    select = SelectKBest(chi2, k=k)
    select.fit(videos_not_nan_train, y_not_nan_train)
    x = select.transform(videos_not_nan_train)
    mean_score = 0
    for i, (train, val) in enumerate(cv.split(x, y_not_nan_train)):
        x_train = x_not_nan_train[train]
        y_train = y_not_nan_train[train]

        x_val = x_not_nan_train[val]
        y_val = y_not_nan_train[val]

        model = LabelSpreading(kernel=kernel, gamma=g, n_neighbors=n)
        model.fit(x_train, y_train)
        y = model.predict(x_val)

        score = f1_score(y_val, y, average="macro")
        mean_score += score
    mean_score /= k_splits
    scores.append((mean_score, p))
    if mean_score > best_score:
        best_score = mean_score
        best_params = p

print(f"Best params: {best_params}")
print(f"BestScore: {best_score}")
print(f"Scores: {scores}")


All: 9623
Tfidf: (9623, 200)
All: 9623
Tfidf: (9623, 200)
All: 9623
Tfidf: (9623, 200)
(8735, 200)
(8735, 200)
(8735, 200)


HBox(children=(FloatProgress(value=0.0, max=760.0), HTML(value='')))


Best params: (5, 'knn', 1, 3)
BestScore: 0.0658480257515066
Scores: [(0.0658480257515066, (5, 'knn', 1, 3)), (0.05797094087387562, (5, 'knn', 1, 5)), (0.05631689353257572, (5, 'knn', 1, 7)), (0.05575569235785878, (5, 'knn', 1, 11)), (0.0658480257515066, (5, 'knn', 10, 3)), (0.05797094087387562, (5, 'knn', 10, 5)), (0.05631689353257572, (5, 'knn', 10, 7)), (0.05575569235785878, (5, 'knn', 10, 11)), (0.0658480257515066, (5, 'knn', 20, 3)), (0.05797094087387562, (5, 'knn', 20, 5)), (0.05631689353257572, (5, 'knn', 20, 7)), (0.05575569235785878, (5, 'knn', 20, 11)), (0.0658480257515066, (5, 'knn', 30, 3)), (0.05797094087387562, (5, 'knn', 30, 5)), (0.05631689353257572, (5, 'knn', 30, 7)), (0.05575569235785878, (5, 'knn', 30, 11)), (0.0658480257515066, (5, 'knn', 40, 3)), (0.05797094087387562, (5, 'knn', 40, 5)), (0.05631689353257572, (5, 'knn', 40, 7)), (0.05575569235785878, (5, 'knn', 40, 11)), (0.02684443301069962, (5, 'rbf', 1, 3)), (0.02684443301069962, (5, 'rbf', 1, 5)), (0.026844433

**Trenowanie modelu z TfIdf**

In [171]:
k, kernel, g, n = best_params

select = SelectKBest(chi2, k=k)
select.fit(videos_not_nan_train, y_not_nan_train)
x_not_nan_train = select.transform(videos_not_nan_train)
x_not_nan_test = select.transform(videos_not_nan_test)
x_only_nan_unlabeled = select.transform(videos_only_nan)
model = LabelSpreading(kernel=kernel, gamma=g, n_neighbors=n)
model.fit(x_not_nan_train, y_not_nan_train)

print("Test results")
y_test = model.predict(x_not_nan_test)
show_stats_labels(y_not_nan_test, y_test)
print("\n\n")
print("Unlabeled results")
y = model.predict(x_only_nan_unlabeled)
show_stats_labels(y_only_nan.to_numpy(), y)

Test results
Accuracy: 0.33187134502923976
F1: 0.22345390428182255
Precision: 0.29737273015050797
Recall: 0.23332677737149282
ROC AUC Score OVR: 0.5879608704626896
ROC AUC Score OVO: 0.5879608704626896



Unlabeled results
Accuracy: 0.3112295901366211
F1: 0.21248858493451167
Precision: 0.31678929401710293
Recall: 0.21325481832596696
ROC AUC Score OVR: 0.5789400490264088
ROC AUC Score OVO: 0.5789400490264088


W przeciwieństwie do pierwszego modelu wytrenowanego
tym algorytmem tutaj można zauważyć znaczne przeuczenie się
modelu do danych, które posiadają przypisane kategorie.
Brak zdolności generalizacji przekłada się na znaczne pogorszenie
wyników na zbiorze, w którym kategorie są nieznane.