# YouTube trending videos

In [5]:
%matplotlib inline

import warnings

warnings.filterwarnings('ignore')

# import sys
# sys.path.append('..')


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from itertools import product
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import accuracy_score, f1_score, precision_score, silhouette_score, roc_auc_score, recall_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.semi_supervised import LabelSpreading

from helpers.categories import get_categories_dict
from helpers.files import load_csv

## Etap 4 - Wykorzystanie uczenia pół-nadzorowanego


Deklaracja potrzebnych funkcji

In [6]:

def show_stats(y_hat, y_hat_nans, y, mapped: bool = False):
    if not mapped:
        mapping_dict = get_mapping_dict(y_hat_nans, y)
        print(mapping_dict)
        y_mapped = pd.Series(y).apply(lambda x: mapping_dict[x])
    else:
        y_mapped = pd.Series(y.astype(np.int))
        y_hat = y_hat.astype(np.int)
    print(f"Accuracy: {accuracy_score(y_hat, y_mapped)}")
    print(f"F1: {f1_score(y_hat, y_mapped, average='macro')}")
    print(f"Precision: {precision_score(y_hat, y_mapped, average='macro')}")
    print(f"Recall: {recall_score(y_hat, y_mapped, average='macro')}")
    print(f"Hat unique:{np.unique(y_hat.to_numpy())}")
    print(f"labeled unique: {np.unique(y_mapped.to_numpy())}")
    print(f"y_hat_nans : {np.unique(y_hat_nans[y_hat_nans.notna()].to_numpy())}")
    y_hat = y_hat.to_numpy().reshape(-1, 1)
    y_mapped = y_mapped.to_numpy().reshape(-1, 1)
    encoder = OneHotEncoder(sparse=False)
    one_hot_hat = encoder.fit_transform(y_hat)
    y_hot = encoder.transform(y_mapped)
    print(f"ROC AUC Score OVR: {roc_auc_score(one_hot_hat, y_hot, average='micro', multi_class='ovr')}")
    print(f"ROC AUC Score OVO: {roc_auc_score(one_hot_hat, y_hot, multi_class='ovo')}")

### Wyczytanie danych

In [7]:
gb_data, us_data = load_csv("clustering_data")

videos = pd.concat([gb_data, us_data])
# inconsistency in data
videos["category_id"] = videos["category_id"].replace(43.0, 24.0)

categories_ids = videos["new_category_id"].dropna().unique().tolist()

### Wczytanie nazw kategorii

In [8]:
categories_dict = get_categories_dict()
categories = [categories_dict[cat] for cat in categories_ids]

## Usunięcie danych tekstowych

In [9]:
not_nan_bool = videos["new_category_id"].notna().reset_index(drop=True)

videos = videos[videos["new_category_id"].notna()]
videos = videos.reset_index(drop=True)
y_hat = videos["new_category_id"]  # oczekiwane kategorie z api
y_hat_nans = videos["category_id"]  # oczekiwane kategorie z nanami (z oryginalnego zbioru)
videos_no_nan_categories = videos[videos["category_id"].notna()]
selected_columns = [
    "views", "likes", "dislikes", "comment_count", "description_len", "title_len", "channel_title_len",
    "publish_time_day_of_week", "publish_time_hour_of_day",
    "gray_mean_score", "color_mean_score", "gray_hist_score",
    "red_hist_score", "green_hist_score", "blue_hist_score", "edges_score", "entropy_score",
]

for cat in categories_ids:
    selected_columns.append(f"freq_channel_titles_{categories_dict[cat]}")
    selected_columns.append(f"freq_titles_{categories_dict[cat]}")
    selected_columns.append(f"freq_tags_{categories_dict[cat]}")
    selected_columns.append(f"freq_descriptions_{categories_dict[cat]}")

videos = videos[selected_columns]

### Uzupełnienie wartości pustych wartościami średnimi


In [10]:
videos = videos.fillna(videos.mean())

### Skalowanie danych

In [11]:
scaler = MinMaxScaler()
videos = scaler.fit_transform(videos)

### Przygotowanie zbioru z oznaczonymi kategoriami

In [12]:
x_not_nan = videos[y_hat_nans.notna()]
y_not_nan = y_hat_nans[y_hat_nans.notna()]

x_only_nan = videos[y_hat_nans.isna()]
y_only_nan = y_hat[y_hat_nans.isna()]

### K-means

Dobre acc (przypisuje wszyskim jedną klasę ale słaba reszta metryk)
można zastosować bo znamy liczbę klas, do których będziemy przypisywać.

In [13]:
def get_mapping_dict(y_hat_nans, y_pred) -> dict:
    values_counts = y_hat_nans.value_counts().reset_index(name="count")
    # print(values_counts)
    # print(f"Index: {values_counts.index}")
    most_frequent_class = values_counts["index"].iloc[0]
    # print(most_frequent_class)
    mapping_dict = {}
    y_data = pd.DataFrame()
    y_data["y_hat_nans"] = y_hat_nans
    y_data["y"] = y_pred
    y_data["c"] = 1
    y_data = y_data.groupby(["y", "y_hat_nans"])["c"].sum()  #.max(level=[0])#.sort_values().groupby(level=0)
    # print(y_data)
    out = y_data.loc[y_data.groupby(level=0).idxmax()]
    # print(out)
    for row in out.index:
        mapping_dict[row[0]] = row[1]
    for i in range(np.unique(y_pred).shape[0]):
        if i not in mapping_dict.keys():
            mapping_dict[i] = most_frequent_class
    return mapping_dict

In [240]:
# k_list = np.arange(2, 20)
# inertias = np.zeros_like(k_list, dtype=np.float)
# silhouettes = np.zeros_like(k_list, dtype=np.float)
# for i, k in enumerate(k_list):
#     model = KMeans(k)
#     # model.fit(x)
#     labels = model.fit_predict(x)
#     inertias[i] = model.inertia_
#     silhouettes[i] = silhouette_score(x, labels)
#
# plt.plot(k_list, inertias)
# plt.title("Interias")
# plt.show()
# plt.plot(k_list, silhouettes)
# plt.title("Silhouette")
# print(f"Cat len: {len(categories)}")
# model = KMeans(len(categories))
# model.fit(x)
# y = model.predict(x)
# print(y)
# show_stats(y_hat, y_hat_nans, y)


Semi supervised Constrained KMeans (punkt 3.1 z https://arxiv.org/pdf/1806.01547v2.pdf)

In [62]:
select = SelectKBest(k=45)
select.fit(x_not_nan, y_not_nan)

groups_index = np.unique(y_not_nan)
groups_dict = {e: i for i, e in enumerate(groups_index)}
num_groups = len(groups_index)

groups = [[] for _ in range(num_groups)]
for xp, yp in zip(x_not_nan, y_not_nan):
    groups[groups_dict[yp]].append(xp)

centroids = []
for g in groups:
    centroids.append(np.mean(g, axis=0))

centroids = np.array(centroids)
print(centroids.shape)


(15, 77)


In [63]:
new_centroids = centroids[:]
for itr in range(300):
    model = KMeans(n_clusters=num_groups, init=new_centroids, n_init=1, max_iter=1)
    labels = model.fit_predict(x_only_nan)
    new_groups = [[] for _ in range(num_groups)]
    for i, l in enumerate(labels):
        new_groups[l].append(x_only_nan[i])

    new_centroids = []
    for i, g in enumerate(new_groups):
        g = g + groups[i]
        new_centroids.append(np.mean(g, axis=0))
    new_centroids = np.array(new_centroids)
# TODO
# y_pred = model.predict(x_not_nan)
# y = model.predict(x_only_nan)
y = model.predict(videos)
for i,e in zip(range(len(y)),y_hat_nans.notna()):
    if e == True:
        y[i] = y_hat_nans[i]
show_stats(y_hat, y_hat_nans, y)


{1: 1.0, 2: 2.0, 10: 10.0, 15: 15.0, 17: 17.0, 19: 19.0, 20: 20.0, 22: 22.0, 23: 23.0, 24: 24.0, 25: 25.0, 26: 26.0, 27: 27.0, 28: 28.0, 29: 29.0, 0: 10.0, 3: 10.0, 4: 10.0, 5: 10.0, 6: 10.0, 7: 10.0, 8: 10.0, 9: 10.0, 11: 10.0, 12: 10.0, 13: 10.0, 14: 10.0, 16: 10.0, 18: 10.0, 21: 10.0}
Accuracy: 0.38946765884373213
F1: 0.4171949647412629
Precision: 0.8526385183407648
Recall: 0.3277318087400635
Hat unique:[ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
labeled unique: [ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
y_hat_nans : [ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
ROC AUC Score OVR: 0.6729291029519995
ROC AUC Score OVO: 0.639332942495129


Losowe K-means z wykorzystaniem wiedzy o przybliżonych pozycjach centroidów

In [65]:
best_model = None
best_score = 0.0
for itr in range(300):
    centroids_prim = centroids + np.random.normal(scale = (np.max(centroids) - np.min(centroids))/100,size = centroids.shape)
    model = KMeans(n_clusters=num_groups, init=centroids_prim, n_init=1)
    y = model.fit_predict(videos)
    mapping_dict = get_mapping_dict(y_hat_nans, y)
    y_mapped = pd.Series(y).apply(lambda x: mapping_dict[x])
    score = f1_score(y_hat, y_mapped, average='macro')
    # print(score)
    if best_score < score:
        best_score = score
        best_model = model
labels = best_model.predict(videos)
show_stats(y_hat, y_hat_nans, labels)

{0: 1.0, 1: 19.0, 2: 10.0, 3: 15.0, 4: 17.0, 5: 25.0, 6: 10.0, 7: 24.0, 8: 24.0, 9: 24.0, 10: 25.0, 11: 24.0, 12: 27.0, 13: 28.0, 14: 24.0}
Accuracy: 0.43239839725243273
F1: 0.29155819778977404
Precision: 0.3825734818986265
Recall: 0.27438792859025246
Hat unique:[ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
labeled unique: [ 1. 10. 15. 17. 19. 24. 25. 27. 28.]
y_hat_nans : [ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
ROC AUC Score OVR: 0.6959277128138033
ROC AUC Score OVO: 0.612760537139009


AHC - bo mamy zdefiniowaną liczbę grup?

In [244]:
# model = AgglomerativeClustering(len(categories), compute_full_tree=True)
# # model = DBSCAN(eps=0.6, min_samples=5)
# model.fit(x)
# y = model.labels_
#
# show_stats(y_hat, y_hat_nans, y)

DBSCAN - bo mamy grupy różnej wielkości ?

In [245]:
# model = DBSCAN(eps=0.1, min_samples=5)
# model.fit(x)
# y = model.labels_
# show_stats(y_hat, y_hat_nans, y)

### DBSCAN eps Tuning

In [246]:
# nn = NearestNeighbors(n_neighbors=11)
# neighbors = nn.fit(x)
# dist, ind = neighbors.kneighbors()
#
# dist = np.sort(dist[:, 10], axis=0)
#
# plt.plot(dist)
# plt.xlabel("Points")
# plt.ylabel("Dist")
# plt.show()

### Label Spreading

TODO add cross validation as in TFidf

**Przygotowanie parametrów**

In [247]:
parameters = {'k': np.arange(5, x_not_nan.shape[1], 5), 'kernel': ["knn", "rbf"], 'gamma': [1, 10, 20, 30, 40],
              'n_neighbors': [3, 5, 7, 11]}
params = list(product(*parameters.values()))

**Funkcja do wyświetlania statystyk**

In [248]:
def show_stats_labels(y_hat, y):
    print(f"Accuracy: {accuracy_score(y_hat, y)}")
    print(f"F1: {f1_score(y_hat, y, average='macro')}")
    print(f"Precision: {precision_score(y_hat, y, average='macro')}")
    print(f"Recall: {recall_score(y_hat, y, average='macro')}")
    y_hat = y_hat.to_numpy().reshape(-1, 1)
    y = y.reshape(-1, 1)
    encoder = OneHotEncoder(sparse=False)
    one_hot_hat = encoder.fit_transform(y_hat)
    y_hot = encoder.transform(y)
    print(f"ROC AUC Score OVR: {roc_auc_score(one_hot_hat, y_hot, multi_class='ovr')}")
    print(f"ROC AUC Score OVO: {roc_auc_score(one_hot_hat, y_hot, multi_class='ovo')}")

**Strojenie parametrów**

Parametry stroimy przy użyciu walidacji krzyżowej. Jako danych
używamy pełnego zbioru filmów, które w oryginalnym zestawie
danych miały przypisane kategorie filmów. Miara F1 jest
używana do oceny wytrenowanych modeli, która następnie
uśredniana jest po 10 przebiegach walidacji.


In [249]:
best_score = 0
best_params = None
scores = []

k_splits = 10
cv = StratifiedKFold(n_splits=k_splits)

for p in tqdm(params):
    k, kernel, g, n = p
    select = SelectKBest(chi2, k=k)
    select.fit(x_not_nan, y_not_nan)
    x = select.transform(x_not_nan)
    mean_score = 0
    for i, (train, test) in enumerate(cv.split(x, y_not_nan)):
        x_train = x[train]
        y_train = y_not_nan.to_numpy()[train]
        x_test = x[test]
        y_test = y_not_nan.to_numpy()[test]
        model = LabelSpreading(kernel=kernel, gamma=g, n_neighbors=n)
        model.fit(x_train, y_train)
        y = model.predict(x_test)
        score = f1_score(y_test, y, average="macro")
        mean_score += score
    mean_score /= k_splits
    scores.append((mean_score, p))
    if mean_score > best_score:
        best_score = mean_score
        best_params = p


print(f"Best params: {best_params}")
print(f"BestScore: {best_score}")
print(f"Scores: {scores}")


HBox(children=(FloatProgress(value=0.0, max=600.0), HTML(value='')))


Best params: (40, 'rbf', 40, 3)
BestScore: 0.6647333160312147
Scores: [(0.2130761026314974, (5, 'knn', 1, 3)), (0.22037556905168976, (5, 'knn', 1, 5)), (0.21890495325520498, (5, 'knn', 1, 7)), (0.22224255656773512, (5, 'knn', 1, 11)), (0.2130761026314974, (5, 'knn', 10, 3)), (0.22037556905168976, (5, 'knn', 10, 5)), (0.21890495325520498, (5, 'knn', 10, 7)), (0.22224255656773512, (5, 'knn', 10, 11)), (0.2130761026314974, (5, 'knn', 20, 3)), (0.22037556905168976, (5, 'knn', 20, 5)), (0.21890495325520498, (5, 'knn', 20, 7)), (0.22224255656773512, (5, 'knn', 20, 11)), (0.2130761026314974, (5, 'knn', 30, 3)), (0.22037556905168976, (5, 'knn', 30, 5)), (0.21890495325520498, (5, 'knn', 30, 7)), (0.22224255656773512, (5, 'knn', 30, 11)), (0.2130761026314974, (5, 'knn', 40, 3)), (0.22037556905168976, (5, 'knn', 40, 5)), (0.21890495325520498, (5, 'knn', 40, 7)), (0.22224255656773512, (5, 'knn', 40, 11)), (0.027137808719677154, (5, 'rbf', 1, 3)), (0.027137808719677154, (5, 'rbf', 1, 5)), (0.02713

**Trenowanie i testowanie ostatecznego modelu**

In [250]:
k, kernel, g, n = best_params

select = SelectKBest(chi2, k=k)
select.fit(x_not_nan, y_not_nan)
x_not_nan = select.transform(x_not_nan)
x_only_nan = select.transform(x_only_nan)
model = LabelSpreading(kernel=kernel, gamma=g, n_neighbors=n)
model.fit(x_not_nan, y_not_nan)
print(f"X not nan {x_not_nan.shape}")
print(f"classes: {model.classes_}")
y = model.predict(x_only_nan)
# show_stats(y_hat, y_hat_nans, y, True)
show_stats_labels(y_only_nan, y)

X not nan (2733, 40)
classes: [ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
Accuracy: 0.6399533488837055
F1: 0.5271166327661649
Precision: 0.6667983671873589
Recall: 0.4706986406569447
ROC AUC Score OVR: 0.7201944223965345
ROC AUC Score OVO: 0.7201944223965345


## Dodatnie TfIdf


In [251]:
from ped4.tfidf import load_videos_with_tf_idf
videos = load_videos_with_tf_idf()

videos_not_nan = videos[y_hat_nans.notna()]
y_not_nan = y_hat_nans[y_hat_nans.notna()]

videos_only_nan = videos[y_hat_nans.isna()]

parameters = {'k': np.arange(5, 100, 5), 'kernel': ["knn", "rbf"], 'gamma': [1, 10, 20, 30, 40],
              'n_neighbors': [3, 5, 7, 11]}

params = list(product(*parameters.values()))

best_score = 0
best_params = None
scores = []

k_splits = 10
cv = StratifiedKFold(n_splits=k_splits)

for p in tqdm(params):
    k, kernel, g, n = p
    select = SelectKBest(chi2, k=k)
    select.fit(videos_not_nan, y_not_nan)
    x = select.transform(videos_not_nan)
    mean_score = 0
    for i, (train, test) in enumerate(cv.split(x, y_not_nan)):
        x_train = x_not_nan[train]
        y_train = y_not_nan.to_numpy()[train]

        x_test = x_not_nan[test]
        y_test = y_not_nan.to_numpy()[test]

        model = LabelSpreading(kernel=kernel, gamma=g, n_neighbors=n)
        model.fit(x_train, y_train)
        y = model.predict(x_test)

        score = f1_score(y_test, y, average="macro")
        mean_score += score
    mean_score /= k_splits
    scores.append((mean_score, p))
    if mean_score > best_score:
        best_score = mean_score
        best_params = p

print(f"Best params: {best_params}")
print(f"BestScore: {best_score}")
print(f"Scores: {scores}")


All: 9623
Tfidf: (9623, 200)
All: 9623
Tfidf: (9623, 200)
All: 9623
Tfidf: (9623, 200)
(8735, 200)
(8735, 200)
(8735, 200)


HBox(children=(FloatProgress(value=0.0, max=760.0), HTML(value='')))


Best params: (5, 'rbf', 40, 3)
BestScore: 0.6647333160312147
Scores: [(0.6288725764724508, (5, 'knn', 1, 3)), (0.6215067861722157, (5, 'knn', 1, 5)), (0.6041758632074359, (5, 'knn', 1, 7)), (0.5866027979463361, (5, 'knn', 1, 11)), (0.6288725764724508, (5, 'knn', 10, 3)), (0.6215067861722157, (5, 'knn', 10, 5)), (0.6041758632074359, (5, 'knn', 10, 7)), (0.5866027979463361, (5, 'knn', 10, 11)), (0.6288725764724508, (5, 'knn', 20, 3)), (0.6215067861722157, (5, 'knn', 20, 5)), (0.6041758632074359, (5, 'knn', 20, 7)), (0.5866027979463361, (5, 'knn', 20, 11)), (0.6288725764724508, (5, 'knn', 30, 3)), (0.6215067861722157, (5, 'knn', 30, 5)), (0.6041758632074359, (5, 'knn', 30, 7)), (0.5866027979463361, (5, 'knn', 30, 11)), (0.6288725764724508, (5, 'knn', 40, 3)), (0.6215067861722157, (5, 'knn', 40, 5)), (0.6041758632074359, (5, 'knn', 40, 7)), (0.5866027979463361, (5, 'knn', 40, 11)), (0.06371850865820872, (5, 'rbf', 1, 3)), (0.06371850865820872, (5, 'rbf', 1, 5)), (0.06371850865820872, (5, 

### Trenowanie modelu z TfIdf

In [252]:
k, kernel, g, n = best_params

select = SelectKBest(chi2, k=k)
select.fit(videos_not_nan, y_not_nan)
x_only_nan = select.transform(videos_only_nan)

model = LabelSpreading(kernel=kernel, gamma=g, n_neighbors=n)
model.fit(x_not_nan, y_not_nan)
print(f"classes: {model.classes_}")
y = model.predict(x_only_nan)
show_stats_labels(y_only_nan, y)
# show_stats(y_hat, y_hat_nans, y, True)

X not nan (2733, 5)
X (8735, 5)
classes: [ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
Accuracy: 0.3370349170005724
F1: 0.23612985247206633
Precision: 0.2888147085850843
Recall: 0.25470750495958455
Hat unique:[ 1  2 10 15 17 19 20 22 23 24 25 26 27 28 29]
labeled unique: [10 17 19 24 25 27 28]
y_hat_nans : [ 1.  2. 10. 15. 17. 19. 20. 22. 23. 24. 25. 26. 27. 28. 29.]
ROC AUC Score OVR: 0.6448401341074496
ROC AUC Score OVO: 0.6004271133434244
