In [65]:
import json
import math 
import os
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from word2vec import get_vector

from eden_ai import get_labels

In [66]:
no_of_bins = 20

videos = {}
with open('videos.json', 'r') as f: 
    videos = json.load(f)

x = []
y = []

for video in videos.values():
    print(type(video), video)

    description = video["video"]["description"].split() or []
    tags = video["video"].get('tags', []) or []
    labels = [inner["label"] for inner in video["video"]["thumbnail_objects"]] or []

    words = description + tags + labels
    x.append(get_vector(words))

    view = video['video']['view_count']
    sub_count = video['channel']['subscriber_count']

    y.append(int(math.tanh(math.log(view) / math.log(sub_count)) * no_of_bins))

<class 'dict'> {'video': {'title': 'Why fake punches in movies look real', 'description': "Punching in movies, explained by someone getting punched repeatedly.\n\nSubscribe and turn on notifications 🔔  so you don't miss any videos: http://goo.gl/0bsAjO \n\nMovie fight scenes at their...", 'duration': '6:46', 'publish_date': '2023-05-29T12:00:06Z', 'thumbnails': {'url': 'https://i.ytimg.com/vi/7DlYBJzAo6k/maxresdefault.jpg', 'width': 1280, 'height': 720}, 'video_id': '7DlYBJzAo6k', 'view_count': 88779, 'tags': ['Vox.com', 'action films', 'action movies', 'atomic blonds', 'cgi', 'cinema', 'explain', 'explainer', 'fight scenes', 'movies', 'stunt doubles', 'vfx', 'visual effects', 'vox', 'fight scenes in movies', 'john wick', 'atomic blond', 'creed', 'bullet train', 'craft', 'choreography', 'fight choreography', 'fighting sequence', 'wade eastwood', 'second coordinator', 'second unit', 'stunt coordinator', 'on-screen', 'film'], 'category': '25', 'thumbnail_objects': [{'label': 'Person', 'c

In [67]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [68]:
# Create an instance of the Random Forest Regression model
model = RandomForestRegressor()

# Train the model on the training data
model.fit(x_train, y_train)

RandomForestRegressor()

In [69]:
# Make predictions on the test data
y_pred = model.predict(x_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

for index in range(len(x_test)):
    print(index, y_pred[index], y_test[index])

Mean Squared Error: 0.7152157303370785
0 13.16 11
1 12.55 13
2 13.55 13
3 12.89 13
4 13.57 14
5 13.09 14
6 12.99 13
7 12.8 14
8 13.01 14
9 13.28 12
10 13.31 13
11 13.43 13
12 13.15 14
13 13.63 14
14 12.7 14
15 12.94 13
16 13.01 12
17 12.8 13
18 13.51 14
19 13.46 14
20 13.28 13
21 13.3 14
22 12.47 12
23 13.05 12
24 13.49 13
25 12.75 12
26 12.83 13
27 13.03 13
28 13.21 12
29 13.21 14
30 12.99 14
31 13.74 14
32 13.08 14
33 12.95 12
34 12.78 12
35 12.95 13
36 13.41 14
37 13.14 14
38 12.8 14
39 13.11 12
40 12.69 13
41 13.0 13
42 12.87 13
43 13.05 13
44 12.97 14
45 13.57 14
46 13.59 14
47 13.08 14
48 12.69 14
49 12.97 13
50 13.23 13
51 13.11 15
52 13.31 13
53 12.67 13
54 13.07 13
55 13.45 14
56 12.7 13
57 13.07 12
58 12.94 14
59 13.18 14
60 12.78 13
61 12.78 12
62 13.2 12
63 13.64 13
64 13.11 13
65 12.41 13
66 13.26 13
67 13.21 13
68 12.82 13
69 12.97 14
70 13.01 14
71 12.68 13
72 13.32 14
73 13.12 14
74 12.65 14
75 13.4 13
76 13.25 14
77 12.9 13
78 13.12 12
79 12.83 14
80 13.05 10
81 12.84 

In [70]:
def predict(tags, description, thumbnail_url):
    labels = [inner['label'] for inner in get_labels(image_url=thumbnail_url)] or []

    words = description + tags + labels
    words_vector = get_vector(words)

    return model.predict([words_vector])[0]

In [71]:
tags = ["duzce", "duzce earthquake", "duzce news", "duzce turkey", "earthquake", 
               "earthquake in turkey", "erdogan", "gdnpfpnewsworld", "golkaya", "golkaya earthquake", 
               "haber", "istanbul", "istanbul earthquake", "istanbul news", "istanbul news anchor", 
               "tgrt", "turkey", "turkey 2022", "turkey earthquake", "turkey earthquake live", "turkey earthquake video", 
               "turkey latest", "turkey latest news", "turkey news", "turkey quake", "turkish anchor", "turkish news reporter", 
               "turkish reporter", "world"]

description = """A magnitude-5.9 earthquake hit a town in northwestern Turkey early Wednesday, causing damage to some buildings and widespread panic. 
Subscribe to Guardian News on YouTube ► http://bit.ly/guardianwiressub

At least 68 people were injured, mostly while trying to flee homes. Footage released by Turkish private broadcaster TGRT shows how night-time anchorman Ersel Hoskara coninued presenting as the quake was felt in Istanbul while the graphic underneath him read 'Breaking news: moment of the earthquake live on air'. The earthquake was centered in the town of Golkaya, in Duzce province, some 200 kilometers (125 miles) east of Istanbul, the Disaster and Emergency Management Presidency said. 

The Guardian publishes independent journalism, made possible by supporters. Contribute to The Guardian today ► https://bit.ly/3uhA7zg

Sign up to the Guardian's free new daily newsletter, First Edition ► http://theguardian.com/first-edition

Website ► https://www.theguardian.com
Facebook ►https://www.facebook.com/theguardian
Twitter ► https://twitter.com/guardian
Instagram ► https://instagram.com/guardian

The Guardian on YouTube:
The Guardian ► https://bit.ly/guardiannewssubs
Guardian Australia ► https://bit.ly/guardianaussubs
Guardian Football ► https://bit.ly/gdnfootballsubs
Guardian Sport ► https://bit.ly/gdnsportsubs
Guardian Live ► https://bit.ly/guardianlivesubs

#Istanbul #News #Earthquake #Istanbul #News""".split()

thumbnail_url = "https://i3.ytimg.com/vi/SP9_USUZn68/maxresdefault.jpg"

y = predict(tags, description, thumbnail_url)
print(y)

13.03
