In [1]:
from datetime import datetime
from IPython.display import JSON
from datetime import timedelta
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from scipy.stats import norm, skew
from scipy import stats

from Functions import *

import string
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 

import isodate
import nltk
import ast
import re
import langid
import emoji
import nltk

import spacy
nltk.download('punkt')

warnings.filterwarnings('ignore')
pd.pandas.set_option('display.max_columns', None)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Objective

### 1) Top 10 Videos Based on View Count
### 2) Analyze the Trend in Video Views Over the Years
### 3) Explore the Correlation Between Likes, Comments, and Views
### 4) Investigate the Impact of Video Duration on View Count
### 5) Examine Mr Beast's Video Upload Frequency Over 12 Years
### 6) Identify Frequently Used Words in Mr Beast's Video Titles and Explore Trends
### 7) Assess the Influence of Comments on View Count
### 8) Develop a Machine Learning Model to Predict Video View Count

# 1) Load Both Raw data and comment data 

In [2]:
df = pd.read_csv('Dataset/raw_data.csv')
df.head()

Unnamed: 0,video_id,channelTitle,title,description,publishedAt,tags,viewCount,likeCount,commentCount,duration
0,KOEfDvr4DcQ,MrBeast,"Face Your Biggest Fear To Win $800,000",I can’t believe how far he got\nFace your fear...,2024-02-10T17:00:00Z,,101868832,4025540.0,151759.0,PT22M3S
1,krsBRQbOPQ4,MrBeast,"$1 vs $250,000,000 Private Island!",Watch until the end to see how crazy the last ...,2024-01-27T17:00:00Z,,132353095,4380220.0,97258.0,PT16M59S
2,7ESeQBeikKs,MrBeast,"Protect $500,000 Keep It!",He spent so much money lol\nGrow your money wi...,2024-01-13T17:00:00Z,,110379739,4075544.0,75623.0,PT15M34S
3,K_CbgLpvH9E,MrBeast,I Spent 7 Days In Solitary Confinement,I started going insane at the end of this chal...,2023-12-30T17:00:03Z,,125032838,4230264.0,80321.0,PT20M16S
4,lOKASgtr6kU,MrBeast,I Rescued 100 Abandoned Dogs!,I’m so happy all of these dogs will be going t...,2023-12-23T17:00:00Z,,124742129,4924317.0,94035.0,PT15M3S


In [3]:
df.shape

(776, 10)

In [4]:
df_comment = pd.read_csv('Dataset/raw_youtube_comments.csv')
df_comment = df_comment.drop('Unnamed: 0', axis =1)
df_comment.head()

Unnamed: 0,video_id,comments
0,KOEfDvr4DcQ,['The new Feastables bars will take some time ...
1,krsBRQbOPQ4,['I hope you all enjoy how much we’ve been lev...
2,7ESeQBeikKs,['The new Feastables branding and chocolate fo...
3,K_CbgLpvH9E,"['watch until the end for good luck', 'No cred..."
4,lOKASgtr6kU,['Every family who adopted a dog was fully vet...


In [5]:
df_comment.shape

(774, 2)

## The rows in the raw data and comment data are not consistently matched, requiring us to analyze specific instances within the comment dataset where comments are absent or missing

In [6]:
df_merged = pd.merge(df, df_comment, on='video_id', how='outer', indicator=True)

# Apply left join on the outer dataset 
df_merged[df_merged['_merge'] == 'left_only']

Unnamed: 0,video_id,channelTitle,title,description,publishedAt,tags,viewCount,likeCount,commentCount,duration,comments,_merge
311,AS5CxLCWq-Q,MrBeast,Watching Dance Till You're Dead For 10 Hours,I WATCHED THE WHOLE VIDEO\nEnter $500 Giveaway...,2017-03-21T22:40:12Z,"['dance', 'till', 'youre', 'dead', 'remix', 'm...",53017024,2587660.0,,PT10H1S,,left_only
502,-V3-fhJQZkk,MrBeast,Youtube Collabs? Collabbing with Mr.Beast?,This is something I get asked every single day...,2015-08-01T10:00:01Z,"['Mr.Beast', 'mr', 'beast', 'collabbing', 'on'...",86035,1869.0,215.0,PT3M33S,,left_only


## As evident from the absence of comments on these two videos, a more efficient approach would be to exclude them

In [7]:
df_merged = df_merged[df_merged['_merge'] == 'both']

In [8]:
df_merged.head(1)

Unnamed: 0,video_id,channelTitle,title,description,publishedAt,tags,viewCount,likeCount,commentCount,duration,comments,_merge
0,KOEfDvr4DcQ,MrBeast,"Face Your Biggest Fear To Win $800,000",I can’t believe how far he got\nFace your fear...,2024-02-10T17:00:00Z,,101868832,4025540.0,151759.0,PT22M3S,['The new Feastables bars will take some time ...,both


In [9]:
df_merged.shape

(774, 12)

# 2) Missing Values

In [10]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 774 entries, 0 to 775
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   video_id      774 non-null    object  
 1   channelTitle  774 non-null    object  
 2   title         774 non-null    object  
 3   description   746 non-null    object  
 4   publishedAt   774 non-null    object  
 5   tags          539 non-null    object  
 6   viewCount     774 non-null    int64   
 7   likeCount     767 non-null    float64 
 8   commentCount  774 non-null    float64 
 9   duration      774 non-null    object  
 10  comments      774 non-null    object  
 11  _merge        774 non-null    category
dtypes: category(1), float64(2), int64(1), object(8)
memory usage: 73.4+ KB


## Upon examining the df.info() output, it is evident that we should adjust the data type of specific columns, such as comment count and like count, to integer (INT). Additionally, there are null values in some columns that require cleaning such as tags and likeCount 

In [11]:
df_merged.isnull().sum()

video_id          0
channelTitle      0
title             0
description      28
publishedAt       0
tags            235
viewCount         0
likeCount         7
commentCount      0
duration          0
comments          0
_merge            0
dtype: int64

In [12]:
df_merged.isnull().sum()/ len(df_merged) * 100

video_id         0.000000
channelTitle     0.000000
title            0.000000
description      3.617571
publishedAt      0.000000
tags            30.361757
viewCount        0.000000
likeCount        0.904393
commentCount     0.000000
duration         0.000000
comments         0.000000
_merge           0.000000
dtype: float64

## Checking the absence of values in the like count. My hypothesis is that these videos lack likes. To validate this assumption, I have chosen to inspect the respective YouTube videos of these seven entries. Confirming my hypothesis, these videos indeed have zero likes. 

In [13]:
df_merged[df_merged['likeCount'].isnull()]

Unnamed: 0,video_id,channelTitle,title,description,publishedAt,tags,viewCount,likeCount,commentCount,duration,comments,_merge
529,SgMydsOucFg,MrBeast,INSANE TRIPLE LEGENDARY SUPPLY DROP! BEST OPEN...,This is the best triple legendary supply drop ...,2015-07-11T20:08:50Z,"['mrbeast6000', 'advanced warfare', 'supply dr...",47269,,289.0,PT3M29S,['244번째 댓글입니다 지미 댓글 적으면서 몇번 꿈나라에 갔었다 Challen...,both
758,QicO1Gd0kvU,MrBeast,Pokemon online battle #4 [uu] Same old team,"How's it going, I know you are proboly bored o...",2013-03-28T17:22:43Z,"['pokemon', 'online', 'battle', 'number', 'fou...",179216,,1027.0,PT4M39S,"['S', '❤❤', 'Thanks ❤🎉', 'Pokémen is the plura...",both
759,yFfcsmK2TOA,MrBeast,Online pokemon battle #3 [uu] Ice punch,This is my third battle with this team that i ...,2013-03-27T19:57:08Z,"['pokemon', 'online', 'battle', 'ice', 'punch']",153624,,1360.0,PT3M25S,"['A', 'E', 'Thanks 🎉❤', 'Please mr beast', 'Ca...",both
760,Jl0-6tbNGBo,MrBeast,Pokemon online battle #2 [uu] Herracross w/ Da...,Once i realized he didnt have any one who coul...,2013-03-26T16:58:32Z,"['pokemon', 'online', 'battle', 'herracross', ...",224934,,1230.0,PT2M49S,"['Mr beaast', 'B', 'Waoôoh❤🎉', 'My favorite us...",both
761,IAbX0FwX-GY,MrBeast,Why I Haven't Been Uploading,The truth behind it,2013-03-25T20:10:38Z,"['Pokemon', 'online', 'battle', 'first']",665791,,1723.0,PT6M3S,"['Pokémon showdown', 'Hola', 'This comes to my...",both
767,gchqnwXlxJw,MrBeast,Drinks in minecraft (mod),"This mod adds coke, sprite, and sproke. \n\nmo...",2013-01-26T16:26:34Z,"['epic', 'sniper', 'clip', 'must', 'see', 'min...",441705,,1664.0,PT43S,"['Mr beast 🎉', 'Hello', 'Trop cool ❤', 'Wow', ...",both
774,jP82d277Cc8,MrBeast,Harry Potter Mod In Minecraft! EPIC MUST SEE M...,One of the coolest mods i have ever seen\n\nMo...,2012-03-09T23:29:03Z,"['Harry Potter minecraft', 'minecraft', 'minec...",4434195,,8412.0,PT3M59S,['I remember filming this with my horrible lap...,both


## I conducted some research and discovered that YouTube removed public visibility of tags in 2012. Although video owners can still add tags, these tags are now private and cannot be viewed by others

In [14]:
df_merged[df_merged['tags'].isnull()].head()

Unnamed: 0,video_id,channelTitle,title,description,publishedAt,tags,viewCount,likeCount,commentCount,duration,comments,_merge
0,KOEfDvr4DcQ,MrBeast,"Face Your Biggest Fear To Win $800,000",I can’t believe how far he got\nFace your fear...,2024-02-10T17:00:00Z,,101868832,4025540.0,151759.0,PT22M3S,['The new Feastables bars will take some time ...,both
1,krsBRQbOPQ4,MrBeast,"$1 vs $250,000,000 Private Island!",Watch until the end to see how crazy the last ...,2024-01-27T17:00:00Z,,132353095,4380220.0,97258.0,PT16M59S,['I hope you all enjoy how much we’ve been lev...,both
2,7ESeQBeikKs,MrBeast,"Protect $500,000 Keep It!",He spent so much money lol\nGrow your money wi...,2024-01-13T17:00:00Z,,110379739,4075544.0,75623.0,PT15M34S,['The new Feastables branding and chocolate fo...,both
3,K_CbgLpvH9E,MrBeast,I Spent 7 Days In Solitary Confinement,I started going insane at the end of this chal...,2023-12-30T17:00:03Z,,125032838,4230264.0,80321.0,PT20M16S,"['watch until the end for good luck', 'No cred...",both
4,lOKASgtr6kU,MrBeast,I Rescued 100 Abandoned Dogs!,I’m so happy all of these dogs will be going t...,2023-12-23T17:00:00Z,,124742129,4924317.0,94035.0,PT15M3S,['Every family who adopted a dog was fully vet...,both


In [15]:
# For likecount  missing values, we can fill it with 0 since there are likes and no comment. 
# However, I will remove the tags column since there are 30% missing value and replace it using AI

df_merged['likeCount'] = df_merged['likeCount'].fillna('0')
df_merged = df_merged.drop(['tags', 'description', '_merge'] , axis = 1)

In [16]:
df_merged.isnull().sum()

video_id        0
channelTitle    0
title           0
publishedAt     0
viewCount       0
likeCount       0
commentCount    0
duration        0
comments        0
dtype: int64

In [17]:
df_merged.dtypes

video_id         object
channelTitle     object
title            object
publishedAt      object
viewCount         int64
likeCount        object
commentCount    float64
duration         object
comments         object
dtype: object

# 3) Change Data Type

# Change 'likeCount','commentCount' in the column to numerical

In [18]:
numerical = ['likeCount','commentCount']
df_merged[numerical] = df_merged[numerical].astype('int64')

In [19]:
df_merged.dtypes

video_id        object
channelTitle    object
title           object
publishedAt     object
viewCount        int64
likeCount        int64
commentCount     int64
duration        object
comments        object
dtype: object

# 4) Text data pre-processing

In [20]:
df_merged['title'].head(3)

0    Face Your Biggest Fear To Win $800,000
1        $1 vs $250,000,000 Private Island!
2                 Protect $500,000 Keep It!
Name: title, dtype: object

## Need to tokenize, lower case, replace token start with $ to 'money' and any number to 'number'

## Replace any token start with $ to 'money' and any number to 'number'

In [21]:
df_merged['title'].head(4) # Before Transformation

0    Face Your Biggest Fear To Win $800,000
1        $1 vs $250,000,000 Private Island!
2                 Protect $500,000 Keep It!
3    I Spent 7 Days In Solitary Confinement
Name: title, dtype: object

In [22]:
df_merged['title_clean'] = df_merged['title'].apply(convert_title)
df_merged['title_clean'] = df_merged['title_clean'].apply(tokenize)
df_merged['title_clean'] = df_merged['title_clean'].apply(preprocess_text)
df_merged['title_clean'] = df_merged['title_clean'].apply(lemmatize_tokens)
df_merged['title_clean'] = df_merged['title_clean'].apply(lambda x: ' '.join(x))

In [23]:
df_merged['title_clean'].head(4) # After Transformation

0                  face big fear win money
1            money vs money private island
2                       protect money keep
3    spend number day solitary confinement
Name: title_clean, dtype: object

# 5) Feature Engineering

# Separate the 'publishedAt' column into two distinct columns, one for the date and another for the time

In [24]:
df_merged['date'] = df_merged['publishedAt'].apply(lambda x: x[:10])
df_merged['time'] = df_merged['publishedAt'].apply(lambda x: x[11:19])

In [25]:
df_merged['day'] = df_merged['publishedAt'].apply(days)
df_merged = df_merged.drop(['publishedAt', 'time'], axis = 1)

In [26]:
df_merged['duration(s)'] = df_merged['duration'].apply(duration2second)
df_merged = df_merged.drop('duration', axis = 1)

In [27]:
df_merged.head(1)

Unnamed: 0,video_id,channelTitle,title,viewCount,likeCount,commentCount,comments,title_clean,date,day,duration(s)
0,KOEfDvr4DcQ,MrBeast,"Face Your Biggest Fear To Win $800,000",101868832,4025540,151759,['The new Feastables bars will take some time ...,face big fear win money,2024-02-10,Saturday,1323.0


In [28]:
datetime = ['date']
df_merged['date'] = pd.to_datetime(df_merged['date'], errors='coerce')
df_merged['duration(s)'] = df_merged['duration(s)'].astype('int64')

In [29]:
df_merged.dtypes

video_id                object
channelTitle            object
title                   object
viewCount                int64
likeCount                int64
commentCount             int64
comments                object
title_clean             object
date            datetime64[ns]
day                     object
duration(s)              int64
dtype: object

## Create Month and Years column

In [30]:
df_merged['year'] = df_merged['date'].dt.year
df_merged['month'] = df_merged['date'].dt.month

## Day Different

The amount of day uploaded is also important. For example an video just uploaded 3 days will have a higher chance of getting lesser views than those in 2 years

from datetime import datetime

current_date = pd.to_datetime(datetime.today().strftime('%Y-%m-%d'))
df_merged['diff_date'] = (current_date - df_merged['date']).dt.days

In [31]:
df_merged.head(1)

Unnamed: 0,video_id,channelTitle,title,viewCount,likeCount,commentCount,comments,title_clean,date,day,duration(s),year,month
0,KOEfDvr4DcQ,MrBeast,"Face Your Biggest Fear To Win $800,000",101868832,4025540,151759,['The new Feastables bars will take some time ...,face big fear win money,2024-02-10,Saturday,1323,2024,2


# 6) Apply Semantics Analysis for comments

In [32]:
df_merged['comments'][0][200:1000]

'him just made it to where you wouldn\'t lose 800k", \'Дай ему 3шанс\', \'Mack need onemore changees\', \'IN frui si tray de in fruy sutayde de eee\', \'Fede 🎉 wow tu voz 🎉 👏\', \'Mack should at least get a consolation prize. He faced every fear.\', \'Please\', \'Please\', \'Consulta que marca es el microfono que se uso cuando se rompio la galleta... lo quiero..!!! jajajaja\', \'shouldve taken the 300k. greed costed him that\', \'Espero no se me suicide por tremendo fracaso.\', \'انا احب مستر بيست\', \'Super con la voz de Fedeeeee❤\', \'Give him another chance!\', \'Me encanta la vos de fede\', \'Somebody give Mack another cookie 🤧\', \'Biggest you tuber\', \'Nyc\', "It\'s really very terrible 😮", \'Carf\', \'Bring him back please\', \'can howtobasic do this challenge\', \'Bros is organising a lil tomodachi game ☠️\', \'Give him one m'

### The text is filled with unnecessary elements, including non-alphabetic characters, symbols, emojis, punctuation, and stop words.

## To clean the data:
## Removing non-alphabetic words, white spaces, non-English words

In [33]:
df_merged['clean'] = df_merged['comments'].apply(clean_data)
df_merged['clean'] = df_merged['clean'].apply(remove_blank)
df_merged['clean'] = df_merged['clean'].apply(lambda x: [remove_non_english(sentence) for sentence in x]) # It will take a while

In [34]:
df_merged['clean'] = df_merged['clean'].apply(lambda x: [item for item in x if (isinstance(item, list) and len(item) > 1) or (isinstance(item, str) and item.strip() != '')])

In [35]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()
df_merged['senti'] = df_merged['clean'].apply(lambda x: [sia.polarity_scores(sentence) for sentence in x])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [36]:
# Create a new column 'positive_senti'
df_merged['positive_senti'] = df_merged['senti'].apply(lambda x: average_sentiments(x, 'pos'))
df_merged['negative_senti'] = df_merged['senti'].apply(lambda x: average_sentiments(x, 'neg'))
df_merged['neutral_senti'] = df_merged['senti'].apply(lambda x: average_sentiments(x, 'neu'))

In [37]:
df_merged.head(5)

Unnamed: 0,video_id,channelTitle,title,viewCount,likeCount,commentCount,comments,title_clean,date,day,duration(s),year,month,clean,senti,positive_senti,negative_senti,neutral_senti
0,KOEfDvr4DcQ,MrBeast,"Face Your Biggest Fear To Win $800,000",101868832,4025540,151759,['The new Feastables bars will take some time ...,face big fear win money,2024-02-10,Saturday,1323,2024,2,[The new Feastables bars will take some time t...,"[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun...",0.175026,0.070807,0.71992
1,krsBRQbOPQ4,MrBeast,"$1 vs $250,000,000 Private Island!",132353095,4380220,97258,['I hope you all enjoy how much we’ve been lev...,money vs money private island,2024-01-27,Saturday,1019,2024,1,[I hope you all enjoy how much weve been level...,"[{'neg': 0.0, 'neu': 0.497, 'pos': 0.503, 'com...",0.185993,0.031824,0.727717
2,7ESeQBeikKs,MrBeast,"Protect $500,000 Keep It!",110379739,4075544,75623,['The new Feastables branding and chocolate fo...,protect money keep,2024-01-13,Saturday,934,2024,1,[The new Feastables branding and chocolate for...,"[{'neg': 0.12, 'neu': 0.728, 'pos': 0.152, 'co...",0.187616,0.02809,0.70669
3,K_CbgLpvH9E,MrBeast,I Spent 7 Days In Solitary Confinement,125032838,4230264,80321,"['watch until the end for good luck', 'No cred...",spend number day solitary confinement,2023-12-30,Saturday,1216,2023,12,"[watch until the end for good luck, No credits...","[{'neg': 0.0, 'neu': 0.459, 'pos': 0.541, 'com...",0.165002,0.033459,0.733439
4,lOKASgtr6kU,MrBeast,I Rescued 100 Abandoned Dogs!,124742129,4924317,94035,['Every family who adopted a dog was fully vet...,rescue number abandon dog,2023-12-23,Saturday,903,2023,12,[Every family who adopted a dog was fully vett...,"[{'neg': 0.0, 'neu': 0.809, 'pos': 0.191, 'com...",0.213707,0.030198,0.704984


In [38]:
df_merged.to_csv('Dataset/mr_beast_dataset.csv', index = False)