# YouTube NLP

## Ingesting Comments from the YouTube Video

In [None]:
import os

# Read the YouTube Data API key from the environment instead of pasting it
# into the notebook, so the credential never ends up in a saved/shared copy.
# Falls back to an empty string (the previous placeholder) when the
# YOUTUBE_API_KEY variable is not set.
api_key = os.environ.get('YOUTUBE_API_KEY', '')

In [None]:
from googleapiclient.discovery import build

# The string after "v=" in the video URL
video_id = "dQw4w9WgXcQ"

# commentThreads.list documents maxResults as 1 to 100 inclusive, so 100 is
# the most a single request can ask for.
MAX_COMMENT_THREADS = 100

# Create the YouTube Data API v3 client, authenticated with the API key
youtube = build('youtube', 'v3', developerKey=api_key)

# Fetch one page of top-level comment threads for the video, most relevant first
video_response = youtube.commentThreads().list(
    part='snippet',
    maxResults=MAX_COMMENT_THREADS,
    order='relevance',
    videoId=video_id,
).execute()


## Data Preprocessing

### Data Wrangling

#### Assumptions


*   Picking only the top-level comment of each thread, i.e. the comment made by the thread's original author
  *   Replies in the thread under the original author's comment could diverge from the video
  *   Limiting Response to about 10 for testing

*   Using the 'textOriginal' instead of 'textDisplay' for easier cleanup of the comment


*   Analyzing only English comments
  *   Storing the number of non-English comments for metrics








In [None]:
# One row per comment thread:
# [author, comment text, published time, like count, total reply count]
comments = []

for thread in video_response['items']:
  thread_snippet = thread['snippet']
  # Fields of the thread's top-level comment
  top_level = thread_snippet['topLevelComment']['snippet']

  comments.append([
    top_level['authorDisplayName'],
    top_level['textOriginal'],
    top_level['publishedAt'],
    top_level['likeCount'],
    # The reply count is stored on the thread, not on the comment itself
    thread_snippet['totalReplyCount'],
  ])

In [None]:
# Peek at the first five extracted rows
comments[:5]

[['Rick Astley',
  '1 BILLION views for Never Gonna Give You Up!\xa0 Amazing, crazy, wonderful! Rick ♥️',
  '2021-07-28T21:00:32Z',
  1165839,
  497],
 ['Amelia Honey',
  'This has been the most amazing rick roll I have ever gotten. Thank you discord',
  '2022-05-13T22:16:45Z',
  720,
  39],
 ['XIÁN PENA PUZA',
  'Naah... temardo en verdad 😎🤙\n21047218 personas ya lo han escuchado,\nespero que te haya gustado 😉',
  '2023-01-01T14:10:35Z',
  62,
  9],
 ['Rafael Perez',
  'Naah... temardo en verdad 😎🤙\n6025619 personas ya lo han escuchado,\nespero que te haya gustado 😉',
  '2022-12-31T15:25:35Z',
  395,
  11],
 ['Natan Kozłowski',
  'Even when’s it’s 2023 thousands of people still listen to this song everyday,it never gets old❤',
  '2022-12-30T18:57:47Z',
  219,
  19]]

### NLP

#### Install Libraries

In [None]:
pip install Unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install pycountry

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install clean-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### Text Processing

In [None]:
import pandas as pd
# Column names in the same order as the fields of each row in `comments`
column_names = ['Author', 'Comment', 'Timestamp', 'Likes', 'TotalReplies']

# Build the table column by column: column k holds field k of every row
df = pd.DataFrame({
    name: [row[position] for row in comments]
    for position, name in enumerate(column_names)
})

In [None]:
# Preview the first five rows of the raw comment table
df.head(n=5)

Unnamed: 0,Author,Comment,Timestamp,Likes,TotalReplies
0,Rick Astley,1 BILLION views for Never Gonna Give You Up! ...,2021-07-28T21:00:32Z,1165839,497
1,Amelia Honey,This has been the most amazing rick roll I hav...,2022-05-13T22:16:45Z,720,39
2,XIÁN PENA PUZA,Naah... temardo en verdad 😎🤙\n21047218 persona...,2023-01-01T14:10:35Z,62,9
3,Rafael Perez,Naah... temardo en verdad 😎🤙\n6025619 personas...,2022-12-31T15:25:35Z,395,11
4,Natan Kozłowski,Even when’s it’s 2023 thousands of people stil...,2022-12-30T18:57:47Z,219,19


In [None]:
from cleantext import clean

# Work on an independent copy: a plain `clean_df = df` would only alias the
# raw frame, so every cleaning step below would silently overwrite `df` too.
clean_df = df.copy()

# Trim surrounding whitespace, lower-case, and drop the text emoticons
# 'xd' and '<3' (literal substring removal, not regex).
# NOTE(review): this also strips 'xd' when it occurs inside a longer word -
# confirm that is acceptable.
clean_df['Comment'] = (
    clean_df['Comment']
    .str.strip()
    .str.lower()
    .str.replace('xd', '', regex=False)
    .str.replace('<3', '', regex=False)
)

# Clean text from line breaks, unicode issues, emojis and punctuation
clean_df['Comment'] = clean_df['Comment'].apply(
    lambda text: clean(text, no_emoji=True, no_punct=True, no_line_breaks=True, fix_unicode=True)
)
clean_df.head()

Unnamed: 0,Author,Comment,Timestamp,Likes,TotalReplies
0,Rick Astley,1 billion views for never gonna give you up am...,2021-07-28T21:00:32Z,1165839,497
1,Amelia Honey,this has been the most amazing rick roll i hav...,2022-05-13T22:16:45Z,720,39
2,XIÁN PENA PUZA,naah temardo en verdad 21047218 personas ya lo...,2023-01-01T14:10:35Z,62,9
3,Rafael Perez,naah temardo en verdad 6025619 personas ya lo ...,2022-12-31T15:25:35Z,395,11
4,Natan Kozłowski,even whens its 2023 thousands of people still ...,2022-12-30T18:57:47Z,219,19


In [None]:
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect is non-deterministic by default; fixing the seed before the
# first detection makes the results reproducible across runs.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the language code langdetect assigns to `text`.

    Returns 'unknown' when langdetect cannot detect a language (it raises
    LangDetectException for text with no usable features, e.g. a comment
    that is empty after cleaning).
    """
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


clean_df['Language'] = clean_df['Comment'].apply(_detect_language)
clean_df.head()

Unnamed: 0,Author,Comment,Timestamp,Likes,TotalReplies,Language
0,Rick Astley,1 billion views for never gonna give you up am...,2021-07-28T21:00:32Z,1165839,497,en
1,Amelia Honey,this has been the most amazing rick roll i hav...,2022-05-13T22:16:45Z,720,39,en
2,XIÁN PENA PUZA,naah temardo en verdad 21047218 personas ya lo...,2023-01-01T14:10:35Z,62,9,es
3,Rafael Perez,naah temardo en verdad 6025619 personas ya lo ...,2022-12-31T15:25:35Z,395,11,es
4,Natan Kozłowski,even whens its 2023 thousands of people still ...,2022-12-30T18:57:47Z,219,19,en


In [None]:
import pycountry
def _language_name(code):
    """Map a language code to its pycountry language name.

    Only the part before the first '-' is looked up, so regional codes such
    as 'zh-cn' resolve through their base code 'zh'. When no matching
    language is found (or `code` is not a string) the input is returned
    unchanged, which also keeps this cell safe to re-run on values that
    were already converted.
    """
    if not isinstance(code, str):
        return code
    try:
        language = pycountry.languages.get(alpha_2=code.split('-')[0])
    except LookupError:
        # Treat a raised lookup failure the same as "not found"
        language = None
    return language.name if language is not None else code


# Convert two-letter language codes to language names
clean_df['Language'] = clean_df['Language'].apply(_language_name)
clean_df.head()

Unnamed: 0,Author,Comment,Timestamp,Likes,TotalReplies,Language
0,Rick Astley,1 billion views for never gonna give you up am...,2021-07-28T21:00:32Z,1165839,497,English
1,Amelia Honey,this has been the most amazing rick roll i hav...,2022-05-13T22:16:45Z,720,39,English
2,XIÁN PENA PUZA,naah temardo en verdad 21047218 personas ya lo...,2023-01-01T14:10:35Z,62,9,Spanish
3,Rafael Perez,naah temardo en verdad 6025619 personas ya lo ...,2022-12-31T15:25:35Z,395,11,Spanish
4,Natan Kozłowski,even whens its 2023 thousands of people still ...,2022-12-30T18:57:47Z,219,19,English
