In [70]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import plot_roc_curve, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, linear_model
import statsmodels.api as sm
from scipy import stats
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from gingerit.gingerit import GingerIt
import requests
from bs4 import BeautifulSoup
import json
from nltk.tokenize import word_tokenize
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\zxy\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


#### import and merge data

In [38]:
#import data
comment=pd.read_csv("UScomments.csv",error_bad_lines=False) 
v=pd.read_csv("USvideos.csv",error_bad_lines=False) 

b'Skipping line 41589: expected 4 fields, saw 11\nSkipping line 51628: expected 4 fields, saw 7\nSkipping line 114465: expected 4 fields, saw 5\n'
b'Skipping line 142496: expected 4 fields, saw 8\nSkipping line 189732: expected 4 fields, saw 6\nSkipping line 245218: expected 4 fields, saw 7\n'
b'Skipping line 388430: expected 4 fields, saw 5\n'
  interactivity=interactivity, compiler=compiler, result=result)
b'Skipping line 2401: expected 11 fields, saw 21\nSkipping line 2800: expected 11 fields, saw 21\nSkipping line 5297: expected 11 fields, saw 12\nSkipping line 5299: expected 11 fields, saw 12\nSkipping line 5300: expected 11 fields, saw 12\nSkipping line 5301: expected 11 fields, saw 12\n'


In [39]:
# sort by video_id and date by ascending order
v.sort_values(['video_id','date'],ascending=[True, True], inplace=True)

In [40]:
# in video dataset, one vedio may have several channel_title or tags, but in comments dataset we can only know this comments is under which video but can't know from which channel
# only keep the video_id with earliest date
v.drop_duplicates(subset ="video_id",keep='first',inplace=True)

In [42]:
#merge comment and vedio dataset
v_c=v.merge(comment,on='video_id')

In [43]:
# import category data
with open('US_category_id.json', 'r') as fp:
    us_category = json.load(fp) 

In [44]:
t=pd.DataFrame(us_category)
item=pd.DataFrame(t['items'])

In [46]:
item['id']=item['items'].apply(lambda x: x['id'])
item['title']=item['items'].apply(lambda x: x['snippet']['title'])

In [47]:
category=item[['id','title']]

In [48]:
# merge vedio_comments dataset with category dataset
category['id_str']=category.id.astype(str)
v_c['category_id_str']=v_c.category_id.astype(str)
vc_f=pd.merge(v_c, category, how='inner', left_on='category_id_str', right_on='id_str')

In [49]:
vc_f.head()

Unnamed: 0,video_id,title_x,channel_title,category_id,tags,views,likes_x,dislikes,comment_total,thumbnail_link,date,comment_text,likes_y,replies,category_id_str,id,title_y,id_str
0,--JinobXWPk,DANGEROUS Jungle Spider!,Brave Wilderness,15,adventure|adventurous|animals|breaking|breakin...,1319945,38949,533,6768,https://i.ytimg.com/vi/--JinobXWPk/default.jpg,20,I saw this wandering spider in our bathroom se...,0,0,15,15,Pets & Animals,15
1,--JinobXWPk,DANGEROUS Jungle Spider!,Brave Wilderness,15,adventure|adventurous|animals|breaking|breakin...,1319945,38949,533,6768,https://i.ytimg.com/vi/--JinobXWPk/default.jpg,20,"Can't you just stick to small ants, and bees? ...",0,0,15,15,Pets & Animals,15
2,--JinobXWPk,DANGEROUS Jungle Spider!,Brave Wilderness,15,adventure|adventurous|animals|breaking|breakin...,1319945,38949,533,6768,https://i.ytimg.com/vi/--JinobXWPk/default.jpg,20,Brazilian wandering spider is the deadliest sp...,0,0,15,15,Pets & Animals,15
3,--JinobXWPk,DANGEROUS Jungle Spider!,Brave Wilderness,15,adventure|adventurous|animals|breaking|breakin...,1319945,38949,533,6768,https://i.ytimg.com/vi/--JinobXWPk/default.jpg,20,Nothing a can of hairspray and lighter couldn'...,0,0,15,15,Pets & Animals,15
4,--JinobXWPk,DANGEROUS Jungle Spider!,Brave Wilderness,15,adventure|adventurous|animals|breaking|breakin...,1319945,38949,533,6768,https://i.ytimg.com/vi/--JinobXWPk/default.jpg,20,That's a snek,0,0,15,15,Pets & Animals,15


In [50]:
# filter YouTube Category: Music, Entertainment, Comedy, People & Blogs, Film & Animation
vcff=vc_f[vc_f['title_y'].isin( ['Music', 'Entertainment', 'Comedy', 'People & Blogs', 'Film & Animation'])]

In [53]:
vcff.head()

Unnamed: 0,video_id,title_x,channel_title,category_id,tags,views,likes_x,dislikes,comment_total,thumbnail_link,date,comment_text,likes_y,replies,category_id_str,id,title_y,id_str
9341,-3AGlBYyLjo,Best Tom Petty Interview Ever,CrazyLaughAction,24,tom|petty|tom petty|Tom Petty|Tom Petty (Music...,2143,16,2,4,https://i.ytimg.com/vi/-3AGlBYyLjo/default.jpg,6.1,Where are the other interviews Gary Chandling ...,0,0,24,24,Entertainment,24
9342,-3AGlBYyLjo,Best Tom Petty Interview Ever,CrazyLaughAction,24,tom|petty|tom petty|Tom Petty|Tom Petty (Music...,2143,16,2,4,https://i.ytimg.com/vi/-3AGlBYyLjo/default.jpg,6.1,My god now tom has passed away too....totally ...,0,0,24,24,Entertainment,24
9343,-3AGlBYyLjo,Best Tom Petty Interview Ever,CrazyLaughAction,24,tom|petty|tom petty|Tom Petty|Tom Petty (Music...,2143,16,2,4,https://i.ytimg.com/vi/-3AGlBYyLjo/default.jpg,6.1,both gone too soon,1,0,24,24,Entertainment,24
9344,-3AGlBYyLjo,Best Tom Petty Interview Ever,CrazyLaughAction,24,tom|petty|tom petty|Tom Petty|Tom Petty (Music...,2143,16,2,4,https://i.ytimg.com/vi/-3AGlBYyLjo/default.jpg,6.1,Chain smoking is bad for your health.,1,0,24,24,Entertainment,24
9345,-5sCWsLlTCI,SNL Host Kumail Nanjiani and P!nk Share Favori...,Saturday Night Live,24,saturday night live|snl|snl season 43|kumail n...,85052,1458,97,132,https://i.ytimg.com/vi/-5sCWsLlTCI/default.jpg,14.0,Week 1: Kate puts her face in Ryan Gosling's a...,205,6,24,24,Entertainment,24


#### clean data

##### Part1: build an abbreviation map

In [55]:
Abbr_dict={}
for char in 'abcdefghijklmnopqrstuvwxyz':
    URL = 'https://www.noslang.com/dictionary/'+char
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    for i in soup.find_all('div',{'class':'dictionary-word'}):
        title_elem = i.find('abbr', class_='dictonary-abbr')
        key=title_elem.text[:-2]
        value=i.find('abbr')['title']
        Abbr_dict[key]=value

In [59]:
def corit(string):    
    return ' '.join(Abbr_dict.get(word,word) for word in string.split(' '))

In [60]:
sample= 'Bro y didnt u give merch to johannes he is ur boy 2'
print(corit(sample))

Bro why didnt you give merch to johannes he is your boy 2


In [61]:
#replace abbreviation with full word
vcff['clean_comment']=vcff['comment_text'].dropna().apply(corit)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


##### Part2: remove non english words

In [65]:
words = set(nltk.corpus.words.words())
punc='"#$%&\'()*+,-./:;<=>@[\\]^_`{|}~\n?!'

In [66]:
def removef(string):
    tokens=[w for w in word_tokenize(string.lower()) if w in words or w in punc]
    return ' '.join(tokens)

In [67]:
sample2='Zajebistaa...jak zawsze,live super!!!!!'
removef(sample2)

', live super ! ! ! ! !'

In [68]:
#remove non-English words
vcff['clean_Engcomment']=vcff['clean_comment'].dropna().apply(removef)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [69]:
vcff.head()

Unnamed: 0,video_id,title_x,channel_title,category_id,tags,views,likes_x,dislikes,comment_total,thumbnail_link,date,comment_text,likes_y,replies,category_id_str,id,title_y,id_str,clean_comment,clean_Engcomment
9341,-3AGlBYyLjo,Best Tom Petty Interview Ever,CrazyLaughAction,24,tom|petty|tom petty|Tom Petty|Tom Petty (Music...,2143,16,2,4,https://i.ytimg.com/vi/-3AGlBYyLjo/default.jpg,6.1,Where are the other interviews Gary Chandling ...,0,0,24,24,Entertainment,24,Where are the other interviews Gary Chandling ...,where are the other did ?
9342,-3AGlBYyLjo,Best Tom Petty Interview Ever,CrazyLaughAction,24,tom|petty|tom petty|Tom Petty|Tom Petty (Music...,2143,16,2,4,https://i.ytimg.com/vi/-3AGlBYyLjo/default.jpg,6.1,My god now tom has passed away too....totally ...,0,0,24,24,Entertainment,24,My god now tom has passed away too....totally ...,my god now away too totally broken i am .
9343,-3AGlBYyLjo,Best Tom Petty Interview Ever,CrazyLaughAction,24,tom|petty|tom petty|Tom Petty|Tom Petty (Music...,2143,16,2,4,https://i.ytimg.com/vi/-3AGlBYyLjo/default.jpg,6.1,both gone too soon,1,0,24,24,Entertainment,24,both gone too soon,both gone too soon
9344,-3AGlBYyLjo,Best Tom Petty Interview Ever,CrazyLaughAction,24,tom|petty|tom petty|Tom Petty|Tom Petty (Music...,2143,16,2,4,https://i.ytimg.com/vi/-3AGlBYyLjo/default.jpg,6.1,Chain smoking is bad for your health.,1,0,24,24,Entertainment,24,Chain smoking is bad for your health.,chain smoking is bad for your health .
9345,-5sCWsLlTCI,SNL Host Kumail Nanjiani and P!nk Share Favori...,Saturday Night Live,24,saturday night live|snl|snl season 43|kumail n...,85052,1458,97,132,https://i.ytimg.com/vi/-5sCWsLlTCI/default.jpg,14.0,Week 1: Kate puts her face in Ryan Gosling's a...,205,6,24,24,Entertainment,24,Week 1: Kate puts her face in Ryan Gosling's a...,week : her face in gosling : she out with gal ...


#### Sentiment score analysis

In [71]:
#define function to get sentiment score including positive negative neutral compound scores
def setPolarity(a):
    score=sid.polarity_scores(a)
    return score

In [72]:
vcff['P_score']=vcff['clean_Engcomment'].dropna().apply(setPolarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [73]:
def getvalue(d):
    t=d['neg']
    return t
def getvalue_p(d):
    t=d['pos']
    return t
def getvalue_neu(d):
    t=d['neu']
    return t
def getvalue_com(d):
    t=d['compound']
    return t

In [74]:
vcff['P_score_neg']=vcff['P_score'].dropna().apply(getvalue)
vcff['P_score_pos']=vcff['P_score'].dropna().apply(getvalue_p)
vcff['P_score_neu']=vcff['P_score'].dropna().apply(getvalue_neu)
vcff['P_score_com']=vcff['P_score'].dropna().apply(getvalue_com)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [76]:
#calculate video title sentiment score
vcff['video_title_score']=vcff['title_x'].dropna().apply(setPolarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [77]:
#compond score of video title
vcff['video_title_compond_s']=vcff['video_title_score'].dropna().apply(getvalue_com)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [78]:
#calculate number of tag
vcff['num_tags']=vcff['tags'].apply(lambda x: len(x.split('|')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [79]:
reg_d=vcff[['video_id','video_title_compond_s','num_tags','views','likes_x','dislikes','comment_total','replies','P_score_com','P_score_neg', 'P_score_pos', 'P_score_neu']]

In [80]:
reg_d.head()

Unnamed: 0,video_id,video_title_compond_s,num_tags,views,likes_x,dislikes,comment_total,replies,P_score_com,P_score_neg,P_score_pos,P_score_neu
9341,-3AGlBYyLjo,0.53,6,2143,16,2,4,0,0.0,0.0,0.0,1.0
9342,-3AGlBYyLjo,0.53,6,2143,16,2,4,0,-0.32,0.29,0.18,0.52
9343,-3AGlBYyLjo,0.53,6,2143,16,2,4,0,0.0,0.0,0.0,1.0
9344,-3AGlBYyLjo,0.53,6,2143,16,2,4,0,-0.54,0.37,0.0,0.63
9345,-5sCWsLlTCI,0.67,32,85052,1458,97,132,6,0.0,0.0,0.0,1.0


In [81]:
# clean data for poisson regression
reg_d=reg_d[reg_d.replies!='replies']
reg_d['replies']=reg_d.replies.astype(int)

In [82]:
reg_d.dtypes

video_id                  object
video_title_compond_s    float64
num_tags                   int64
views                      int64
likes_x                    int64
dislikes                   int64
comment_total              int64
replies                    int32
P_score_com              float64
P_score_neg              float64
P_score_pos              float64
P_score_neu              float64
dtype: object

In [83]:
reg_d=reg_d.dropna()

In [102]:
reg_d.head()

Unnamed: 0,video_id,video_title_compond_s,num_tags,views,likes_x,dislikes,comment_total,replies,P_score_com,P_score_neg,P_score_pos,P_score_neu
9341,-3AGlBYyLjo,0.53,6,2143,16,2,4,0,0.0,0.0,0.0,1.0
9342,-3AGlBYyLjo,0.53,6,2143,16,2,4,0,-0.32,0.29,0.18,0.52
9343,-3AGlBYyLjo,0.53,6,2143,16,2,4,0,0.0,0.0,0.0,1.0
9344,-3AGlBYyLjo,0.53,6,2143,16,2,4,0,-0.54,0.37,0.0,0.63
9345,-5sCWsLlTCI,0.67,32,85052,1458,97,132,6,0.0,0.0,0.0,1.0


In [85]:
# average the sentiment score (other columns will not change)
r_avg=reg_d.groupby('video_id').mean()

In [97]:
#sum the number replies of comments
t=reg_d.groupby('video_id').sum()

In [98]:
r_avg['replies_sum']=t['replies']

In [104]:
r_avg.drop("replies", axis=1,inplace=True)

In [105]:
r_avg.dtypes

video_title_compond_s    float64
num_tags                   int64
views                      int64
likes_x                    int64
dislikes                   int64
comment_total              int64
P_score_com              float64
P_score_neg              float64
P_score_pos              float64
P_score_neu              float64
replies_sum                int32
dtype: object

In [106]:
r_avg.head()

Unnamed: 0_level_0,video_title_compond_s,num_tags,views,likes_x,dislikes,comment_total,P_score_com,P_score_neg,P_score_pos,P_score_neu,replies_sum
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
-3AGlBYyLjo,0.53,6,2143,16,2,4,-0.21,0.17,0.046,0.79,0
-5sCWsLlTCI,0.67,32,85052,1458,97,132,0.16,0.063,0.2,0.66,39
-6Zc8Co2H3w,0.51,15,1956813,49656,1287,6894,0.22,0.049,0.21,0.72,1
-AJyaVduxCc,0.0,8,313279,1660,56,123,0.073,0.1,0.13,0.71,590
-B9z3az6Axc,0.0,26,106224,10782,224,2154,0.18,0.076,0.21,0.68,79


In [107]:
predictors=['video_title_compond_s','num_tags','likes_x','dislikes','comment_total','replies_sum','P_score_com','P_score_neg', 'P_score_pos', 'P_score_neu']
outcome='views'

X = r_avg[predictors]
y = r_avg[outcome]

#split training and testing dataset
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.2, random_state=1)

#build Poisson regression model
X3=sm.add_constant(valid_X)
est3=sm.GLM(valid_y,X3.astype(float),family=sm.families.Poisson())
est4=est3.fit()
print(est4.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  views   No. Observations:                  268
Model:                            GLM   Df Residuals:                      257
Model Family:                 Poisson   Df Model:                           10
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -7.3098e+07
Date:                Tue, 09 Mar 2021   Deviance:                   1.4619e+08
Time:                        21:59:25   Pearson chi2:                 2.25e+08
No. Iterations:                    16                                         
Covariance Type:            nonrobust                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    16.36

In [108]:
sst_val = sum(map(lambda x: np.power(x,2),y-np.mean(y))) 
sse_val = sum(map(lambda x: np.power(x,2),est4.resid_response)) 
r2 = 1.0 - sse_val/sst_val

In [109]:
r2

0.9545301283378864