# Predicting Youtube Video Metrics from Textual Data

### Importing libraries

In [104]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

### Loading dataset

In [308]:
vid = pd.read_csv('./videos-with-comments-aggr.csv',  index_col=0)
vid.head()

Unnamed: 0,id,title,date,keyword,views,likes,comments,comment,mean_comment_likes,mean_comment_sentiment,likes_per_view,comments_per_view,views_above_mean,likes_above_mean,comments_above_mean,days_old,unicode_title,unicode_comment
0,wAZZ-UWGVHI,Apple Pay Is Killing the Physical Wallet After...,2022-08-23,tech,135612.0,3407.0,672.0,Let's not forget that Apple Pay in 2014 requir...,39.1,1.2,0.025123,0.004955,False,False,False,1,Apple Pay Is Killing the Physical Wallet After...,Let's not forget that Apple Pay in 2014 requir...
1,b3x28s61q3c,The most EXPENSIVE thing I own.,2022-08-24,tech,1758063.0,76779.0,4306.0,"Wow, you really went to town on the PSU test r...",598.2,1.8,0.043672,0.002449,False,False,False,0,The most EXPENSIVE thing I own.,"Wow, you really went to town on the PSU test r..."
2,4mgePWWCAmA,My New House Gaming Setup is SICK!,2022-08-23,tech,1564007.0,63825.0,3338.0,Linus!!! Just turn the key lights 180 and bou...,626.2,1.9,0.040809,0.002134,False,False,False,1,My New House Gaming Setup is SICK!,Linus!!! Just turn the key lights 180 and bou...
3,kXiYSI7H2b0,Petrol Vs Liquid Nitrogen | Freezing Experimen...,2022-08-23,tech,922918.0,71566.0,1426.0,Unstoppable experiments with liquid nitrogen 🎉...,528.8,1.6,0.077543,0.001545,False,False,False,1,Petrol Vs Liquid Nitrogen | Freezing Experimen...,Unstoppable experiments with liquid nitrogen 🎉...
4,ErMwWXQxHp0,Best Back to School Tech 2022!,2022-08-08,tech,1855644.0,96513.0,5155.0,"Guys, a quick note that you do NOT need all th...",2721.7,1.6,0.052011,0.002778,False,False,False,16,Best Back to School Tech 2022!,"Guys, a quick note that you do NOT need all th..."


## Predicting whether Videos have Above-Mean Views, Likes, & Comments

# add new models, add multiple splits with mean, add parameter tuning

### 1) Using Count Vectorizer & Multinomial Naive Bayes

In [312]:
pipe = Pipeline([('vect', CountVectorizer()), 
                 ('mnb', MultinomialNB())],
                verbose=True)

lX = [vid.unicode_title, vid.unicode_comment]
ly = [vid.views_above_mean, vid.likes_above_mean, vid.comments_above_mean]
    
for X in lX:
    print('Fitting {} to predict\n'.format(X.name))
    for y in ly:
        print(' - {}:'.format(y.name))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
        pipe.fit(X_train, y_train)
        print('   * Mean score is {} \n'.format(pipe.score(X_test, y_test)))
    print('\nFeatures are {}\n\n {} \n'.format(pipe['vect'].get_feature_names_out(), '.'*50))   

Fitting unicode_title to predict

 - views_above_mean:
[Pipeline] .............. (step 1 of 2) Processing vect, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing mnb, total=   0.0s
   * Mean score is 0.9042553191489362 

 - likes_above_mean:
[Pipeline] .............. (step 1 of 2) Processing vect, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing mnb, total=   0.0s
   * Mean score is 0.9148936170212766 

 - comments_above_mean:
[Pipeline] .............. (step 1 of 2) Processing vect, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing mnb, total=   0.0s
   * Mean score is 0.8537234042553191 


Features are ['00' '000' '000th' ... '핵불닭' '햄버거' 'ｓｌｅｅｐｙ']

 .................................................. 

Fitting unicode_comment to predict

 - views_above_mean:
[Pipeline] .............. (step 1 of 2) Processing vect, total=   0.6s
[Pipeline] ............... (step 2 of 2) Processing mnb, total=   0.0s
   * Mean score is 0.8803191489361702 

