# YouTube Sentiment Analysis

### Importing necessary libraries 

In [1]:
import pandas as pd # Pandas for analyzing, cleaning, exploring and manipulating the data
import numpy as np # Numpy to work with arrays
import matplotlib.pyplot as plt # Data visualization library 
import seaborn as sns # advance data visualization
import pandoc

import warnings 
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Load the raw comments file. `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is its replacement: malformed rows are skipped and reported, not fatal.
data = pd.read_csv(r'UScomments.csv', on_bad_lines='warn')

Skipping line 41589: expected 4 fields, saw 11
Skipping line 51628: expected 4 fields, saw 7
Skipping line 114465: expected 4 fields, saw 5

Skipping line 142496: expected 4 fields, saw 8
Skipping line 189732: expected 4 fields, saw 6
Skipping line 245218: expected 4 fields, saw 7

Skipping line 388430: expected 4 fields, saw 5



- `r`/`R` creates a raw string. The `r` before the string denotes a raw string literal in Python: backslashes within the string are treated as literal backslashes, not as escape characters. Here it ensures that the file path is interpreted correctly, though in this specific case it isn't strictly necessary since there are no backslashes in the string.

- The bad-lines option of `read_csv` (`error_bad_lines=False` in older pandas, `on_bad_lines` from pandas 1.3 onwards) is used to handle errors. If the file contains rows that do not conform to the expected structure (e.g., a row has too many or too few columns), those rows are skipped instead of causing the function to throw an error.

In [3]:
data.head()

Unnamed: 0,video_id,comment_text,likes,replies
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0
4,XpVt6Z1Gjjo,trending 😉,3,0


- `data.head()` shows the first 5 rows of the DataFrame and `name_of_df.tail()` shows the last 5 rows. You can change the number of rows shown by passing a number to the function, e.g. `name_of_df.head(10)` shows the first 10 rows.

In [4]:
data.isnull().sum()

video_id         0
comment_text    25
likes            0
replies          0
dtype: int64

- `name_of_df.isnull().sum()` counts the null values in each column of the DataFrame.

In [5]:
# Drop the rows that contain a missing value (the null comment_text entries counted above),
# modifying `data` itself rather than returning a new DataFrame.
data.dropna(inplace=True)

- name_of_df.dropna(inplace = True) this function is used to drop the null values. 'inplace = True' is used for permanent change  

In [6]:
data.isnull().sum()

video_id        0
comment_text    0
likes           0
replies         0
dtype: int64

In [7]:
data.shape

(691375, 4)

- name_of_df.shape -> is used to check number of rows and columns in a DataFrame

## Data Processing

In [8]:
# !pip install textblob

In [9]:
from textblob import TextBlob

In [10]:
data.head(6)

Unnamed: 0,video_id,comment_text,likes,replies
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0
4,XpVt6Z1Gjjo,trending 😉,3,0
5,XpVt6Z1Gjjo,#1 on trending AYYEEEEE,3,0


In [11]:
TextBlob("Logan Paul it's yo big day ‼️‼️‼️").sentiment.polarity

0.0

In [None]:
# Score every comment with TextBlob's sentiment polarity
# (-1 = negative sentiment, 1 = positive sentiment).
polarity = []

for comment in data['comment_text']:
    try:
        polarity.append(TextBlob(comment).sentiment.polarity)
    except Exception:
        # Best effort: a comment TextBlob cannot score is recorded as neutral (0).
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt propagate,
        # so interrupting the kernel actually stops this long-running loop.
        polarity.append(0)
        

In [None]:
len(polarity)

In [None]:
data['polarity'] = polarity

In [None]:
data.head()

In [None]:
print(data['polarity'].unique())

- The code above assigns a polarity, i.e. a sentiment score, to each comment: a polarity of -1 stands for negative sentiment and 1 for positive sentiment. To give a polarity to each sentence we used the TextBlob library and its built-in functions.

## Word Cloud

In [None]:
# !pip install wordcloud 

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
len(set(STOPWORDS))

- Removing stop words. Stop words are common words that are often filtered out before processing textual data in various natural language processing (NLP) tasks. These words are considered to be of little value in terms of the overall meaning and context of the text. Common stop words include articles, prepositions, conjunctions, and pronouns such as "a," "an," "the," "and," "or," "but," "is," "in," "on," "at," etc.

In [None]:
type(data['comment_text'])

#### Wordcloud for positive words.

In [None]:
positive_comments = data[data['polarity'] == 1] 

In [None]:
total_comments_positive = ' '.join(positive_comments['comment_text'])

In [None]:
# total_comments_positive

In [None]:
wordcloud = WordCloud(stopwords = set(STOPWORDS)).generate(total_comments_positive)

In [None]:
plt.imshow(wordcloud)
plt.axis('off')

### Wordcloud for negative comments

In [None]:
negative_comments = data[data['polarity']== -1]

In [None]:
total_comments_negative = ' '.join(negative_comments['comment_text'])

In [None]:
wordcloud2 =  WordCloud(stopwords = set(STOPWORDS)).generate(total_comments_negative)

In [None]:
plt.imshow(wordcloud2)
plt.axis('off')

## Emoji Analysis

In [None]:
# !pip install emoji==2.2.0

In [None]:
import emoji

In [None]:
emoji.__version__

In [None]:
data['comment_text'].head()

In [None]:
comment = ' trending 😉'

In [None]:
[char for char in comment if char in emoji.EMOJI_DATA]

In [None]:
# Flatten every comment into its characters and keep only those that are known emojis.
emoji_list = [
    symbol
    for text in data['comment_text'].dropna()
    for symbol in text
    if symbol in emoji.EMOJI_DATA
]


In [None]:
emoji_list[0:10]

In [None]:
from collections import Counter

In [None]:
Counter(emoji_list).most_common(10)

In [None]:
# Occurrence counts of the 10 most common emojis, most frequent first.
# Iterating the (emoji, count) pairs directly builds the Counter only once and does not
# raise IndexError when fewer than 10 distinct emojis are present.
frequency = [count for _symbol, count in Counter(emoji_list).most_common(10)]

In [None]:
# The 10 most common emojis themselves, most frequent first.
# Iterating the (emoji, count) pairs directly builds the Counter only once and does not
# raise IndexError when fewer than 10 distinct emojis are present.
emojis = [symbol for symbol, _count in Counter(emoji_list).most_common(10)]

In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot

In [None]:
trace =go.Bar(x=emojis, y=frequency)

In [None]:
iplot([trace])

## Youtube Comments

In [None]:
import os

- The 'os' module in Python provides a way of using operating system dependent functionality like reading or writing to the file system, handling directories, executing system commands, and more. It is part of the standard library, so it comes with Python and does not need to be installed separately.

In [None]:
files = os.listdir(r'C:\Users\Atharva\Desktop\aaaaaaa\Study\Data Analysis Course\additional_data')

- The os.listdir function in Python is used to list all files and directories in a specified directory.

In [None]:
files

In [None]:
# Keep only the CSV files. A suffix test is used because a substring test ('.csv' in file)
# would also match names such as 'data.csv.bak'.
files_csv = [file for file in files if file.endswith('.csv')]

In [None]:
files_csv

In [None]:
# Read every CSV in the data folder and combine them into one DataFrame.
path = r'C:\Users\Atharva\Desktop\aaaaaaa\Study\Data Analysis Course\additional_data'

# Load all frames first and concatenate once: calling pd.concat inside the loop copies
# every previously loaded row again on each iteration.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was removed in pandas 2.0.
frames = [
    pd.read_csv(os.path.join(path, file), encoding='iso-8859-1', on_bad_lines='warn')
    for file in files_csv
]

# pd.concat raises on an empty list, so fall back to an empty DataFrame when no CSV was found.
full_df = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

- `ignore_index = True` discards the existing row indices of the DataFrames and reindexes the result. With `ignore_index=True`, the resulting DataFrame has a new integer index that ranges from 0 to n-1, where n is the total number of rows in the concatenated DataFrame.

In [None]:
full_df.shape

In [None]:
full_df.info()

- The df.info() method in pandas is used to get a concise summary of a DataFrame. This method provides important details about the DataFrame, including the index dtype and column dtypes, non-null values, and memory usage. It is particularly useful for quickly understanding the structure and quality of your data.

In [None]:
full_df.describe()

- The df.describe() function in pandas is used to generate descriptive statistics of a DataFrame. It provides a summary of the central tendency, dispersion, and shape of a dataset's distribution, excluding NaN values. This function is particularly useful for quickly getting an overview of numeric data in a DataFrame.

In [None]:
full_df.duplicated().sum()

- `df.duplicated()` flags the duplicated rows in the DataFrame; adding `.sum()` counts them.

In [None]:
full_df[full_df.duplicated()].shape

In [None]:
full_df = full_df.drop_duplicates()

- `df.drop_duplicates()` is used to remove the duplicated rows from the DataFrame.

In [None]:
full_df.shape

In [None]:
path = r'C:\Users\Atharva\Desktop\aaaaaaa\Study\Data Analysis Course\additional_data'
# Raw f-string: in a non-raw string '\y' is an invalid escape sequence, which Python
# already warns about and will eventually reject. The resulting path is unchanged.
full_df.to_csv(fr'{path}\youtube_sample.csv', index = False)

df.to_csv exports the created dataframe to our desired path in csv format

In [None]:
# Raw f-string: in a non-raw string '\y' is an invalid escape sequence, which Python
# already warns about and will eventually reject. The resulting path is unchanged.
full_df.to_json(fr'{path}\youtube_sample.json')

df.to_json exports the created dataframe to our desired path in json format

In [None]:
from sqlalchemy import create_engine

In [None]:
# SQLAlchemy engine for a SQLite database file stored next to the data files.
# Raw f-string: in a non-raw string '\Y' is an invalid escape sequence, which Python
# already warns about and will eventually reject. The resulting URL is unchanged.
engine = create_engine(fr'sqlite:///{path}\YTdc.sqlite')

In [None]:
# full_df[0:1000].to_sql('Users', con = engine, if_exists = 'append')

In [None]:
# Reload the de-duplicated sample from the location it was exported to, built from `path`,
# so the read does not depend on the notebook's current working directory.
full_df = pd.read_csv(fr'{path}\youtube_sample.csv')

In [None]:
full_df.head()

### Below we will extract the category title using data manipulation

In [None]:
full_df['category_id'].unique() 

As we can see above, the videos have category ids but no category names, and it is hard to understand the category of a video from its id alone.

In [None]:
json_df = pd.read_json(fr'{path}\US_category_id.json') 

In the above cell we loaded one JSON file to extract the category names from its dictionaries; you can use any of the other category files present in the data folder.

In [None]:
json_df

In [None]:
json_df['items'][1]

- We can see that the category name is in the `items` column of our DataFrame and is stored in a dictionary. To access the dictionary we have to manipulate it and extract the category name we want.

In [None]:
# Lookup table: integer category id -> category title, built from each entry of the 'items' column
cat_dict = {
    int(entry["id"]): entry['snippet']['title']
    for entry in json_df['items'].values
}

cat_dict

In [None]:
full_df['category_name'] = full_df['category_id'].map(cat_dict) # Creating a new column and mapping the title to its desired id

In [None]:
full_df[['category_id','category_name']].head()

Now it's easier for us to understand the type/category of each video, and we successfully extracted the category name from the table using data manipulation.

### Data visualization

In [None]:
full_df['likes'].describe()

In [None]:
plt.figure(figsize=(12,8))
sns.boxplot(x='category_name', y='likes', data = full_df)
plt.xticks(rotation='vertical')

In [None]:
# Express likes, dislikes and comments as a percentage of each video's views.
views = full_df['views']
full_df['like_rate'] = full_df['likes'].div(views).mul(100)
full_df['dislike_rate'] = full_df['dislikes'].div(views).mul(100)
full_df['comment_count_rate'] = full_df['comment_count'].div(views).mul(100)

In [None]:
full_df.columns

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='category_name', y='like_rate', data = full_df)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
sns.regplot(x='views',y='likes', data= full_df)

In [None]:
full_df.columns

In [None]:
full_df[['views', 'likes', 'dislikes']].corr()

In [None]:
sns.heatmap(full_df[['views', 'likes', 'dislikes']].corr(), annot = True)

In [None]:
full_df.head(6)

In [None]:
full_df['channel_title'].value_counts()

In [None]:
# Number of rows per channel, largest first, limited to the 20 biggest channels.
top_20_channels = (
    full_df
    .groupby(['channel_title'])
    .size()
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)

In [None]:
top_20_channels

In [None]:
# Give the unnamed count column (labelled 0 after reset_index) a descriptive name.
top_20_channels = top_20_channels.rename(columns={0: 'total_videos'})

In [None]:
top_20_channels

In [None]:
import plotly.express as px

In [None]:
px.bar(data_frame=top_20_channels[0:20], x = 'channel_title', y='total_videos')

### To check if adding punctuation in the title helps to increase views or likes

In [None]:
full_df['title'][0]

In [None]:
import string 

In [None]:
string.punctuation

In [None]:
len([char for char in full_df['title'][0] if char in string.punctuation])

In [None]:
def puncuation_count(text):
    """Return how many characters of `text` appear in `string.punctuation`."""
    return sum(char in string.punctuation for char in text)

In [None]:
full_df['punctuation_count'] = full_df['title'].apply(puncuation_count)

In [None]:
# full_df.drop(columns = 'punctuation_count', inplace = True)

In [None]:
full_df['punctuation_count']

In [None]:
# Distribution of views per title punctuation count, using only the first 1000 rows.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=full_df.head(1000), x='punctuation_count', y='views', ax=ax)
plt.show()

In [None]:
# Distribution of likes per title punctuation count, using only the first 1000 rows.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=full_df.head(1000), x='punctuation_count', y='likes', ax=ax)
plt.show()