# Predicting Youtube Video Metrics from Numerical Data

### Importing libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn import linear_model, metrics, tree
from sklearn.model_selection import train_test_split
import string 

### Loading dataset

In [27]:
vid = pd.read_csv('./videos-with-comments-aggr.csv',  index_col=0)
vid.head()

Unnamed: 0,id,title,date,keyword,views,likes,comments,comment,mean_comment_likes,mean_comment_sentiment,likes_per_view,comments_per_view,views_above_mean,likes_above_mean,comments_above_mean,days_old,unicode_title,unicode_comment
0,wAZZ-UWGVHI,Apple Pay Is Killing the Physical Wallet After...,2022-08-23,tech,135612.0,3407.0,672.0,Let's not forget that Apple Pay in 2014 requir...,39.1,1.2,0.025123,0.004955,False,False,False,1,Apple Pay Is Killing the Physical Wallet After...,Let's not forget that Apple Pay in 2014 requir...
1,b3x28s61q3c,The most EXPENSIVE thing I own.,2022-08-24,tech,1758063.0,76779.0,4306.0,"Wow, you really went to town on the PSU test r...",598.2,1.8,0.043672,0.002449,False,False,False,0,The most EXPENSIVE thing I own.,"Wow, you really went to town on the PSU test r..."
2,4mgePWWCAmA,My New House Gaming Setup is SICK!,2022-08-23,tech,1564007.0,63825.0,3338.0,Linus!!! Just turn the key lights 180 and bou...,626.2,1.9,0.040809,0.002134,False,False,False,1,My New House Gaming Setup is SICK!,Linus!!! Just turn the key lights 180 and bou...
3,kXiYSI7H2b0,Petrol Vs Liquid Nitrogen | Freezing Experimen...,2022-08-23,tech,922918.0,71566.0,1426.0,Unstoppable experiments with liquid nitrogen 🎉...,528.8,1.6,0.077543,0.001545,False,False,False,1,Petrol Vs Liquid Nitrogen | Freezing Experimen...,Unstoppable experiments with liquid nitrogen 🎉...
4,ErMwWXQxHp0,Best Back to School Tech 2022!,2022-08-08,tech,1855644.0,96513.0,5155.0,"Guys, a quick note that you do NOT need all th...",2721.7,1.6,0.052011,0.002778,False,False,False,16,Best Back to School Tech 2022!,"Guys, a quick note that you do NOT need all th..."


In [26]:
norm_vid = pd.read_csv('./norm-videos-with-comments-aggr.csv',  index_col=0)
norm_vid.head()

Unnamed: 0,id,title,date,keyword,views,likes,comments,comment,mean_comment_likes,mean_comment_sentiment,likes_per_view,comments_per_view,views_above_mean,likes_above_mean,comments_above_mean,days_old
0,wAZZ-UWGVHI,Apple Pay Is Killing the Physical Wallet After...,2022-08-23,tech,3.4e-05,0.000207,0.000918,Let's not forget that Apple Pay in 2014 requir...,0.000225,0.6,0.115269,0.0598,False,False,False,0.000181
1,b3x28s61q3c,The most EXPENSIVE thing I own.,2022-08-24,tech,0.000436,0.004669,0.005877,"Wow, you really went to town on the PSU test r...",0.003439,0.9,0.200102,0.029772,False,False,False,0.0
2,4mgePWWCAmA,My New House Gaming Setup is SICK!,2022-08-23,tech,0.000388,0.003881,0.004556,Linus!!! Just turn the key lights 180 and bou...,0.0036,0.95,0.187005,0.025998,False,False,False,0.000181
3,kXiYSI7H2b0,Petrol Vs Liquid Nitrogen | Freezing Experimen...,2022-08-23,tech,0.000229,0.004352,0.001947,Unstoppable experiments with liquid nitrogen 🎉...,0.00304,0.8,0.355006,0.018938,False,False,False,0.000181
4,ErMwWXQxHp0,Best Back to School Tech 2022!,2022-08-08,tech,0.00046,0.005869,0.007036,"Guys, a quick note that you do NOT need all th...",0.015645,0.8,0.238235,0.033711,False,False,False,0.0029


## Predicting Total Views from Rate of Likes-per-View and Comments-per-View

# Achieve better fit

### 1) Linear Regression

In [18]:
y = vid[['views']].values
X = vid[['likes_per_view', 'comments_per_view']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=3)

In [19]:
lin = linear_model.LinearRegression()
lin.fit(X_train, y_train)

LinearRegression()

In [20]:
lin.coef_, lin.intercept_

(array([[-2.42979393e+08, -3.97701147e+08]]), array([20472482.18037041]))

In [22]:
lin.predict([[.02, .004]])

array([[14022089.7375879]])

In [23]:
lin.score(X_test, y_test)

0.0037810228731782747

### 2) Decision Tree

In [9]:
tre = tree.DecisionTreeClassifier()

In [10]:
tre.fit(X_train, y_train)

DecisionTreeClassifier()

In [11]:
tre.predict([[.02, .004, 1]])

array([9401.])

In [12]:
tre.predict([[.02, .004, 1]])

array([9401.])

In [18]:
tre.score(X_train, y_train)

1.0

In [13]:
tre.score(X_test, y_test)

0.0035460992907801418

## Other Predictions

### 1) Logistic Regression

In [14]:
log = linear_model.LogisticRegression()

bool_tech = np.array(vid.keyword == 'tech')

log.fit(vid[['views', 'likes', 'comments', 'com_likes', 'com_sentiment']], bool_tech)

LogisticRegression()

In [15]:
pred_tech = log.predict(vid[['views', 'likes', 'comments', 'com_likes', 'com_sentiment']])

In [16]:
log.score(vid[['views', 'likes', 'comments', 'com_likes', 'com_sentiment']], tech)

0.9728579031399681

In [17]:
metrics.precision_recall_fscore_support(bool_tech, pred_tech)

NameError: name 'bool_tech' is not defined