# Sentiment analysis on Youtube

In [7]:
!pip install vaderSentiment langdetect emoji nltk vaderSentiment

StatementMeta(spark0624, 9, 8, Finished, Available)



In [8]:
from pyspark.sql.functions import udf,monotonically_increasing_id
from pyspark.sql.types import StringType, FloatType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.sql import functions as f  
import requests
import pandas as pd 
import datetime
import emoji
import nltk
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from notebookutils import mssparkutils

StatementMeta(spark0624, 9, 9, Finished, Available)

In [9]:
nltk.download('punkt')
nltk.download('stopwords')

StatementMeta(spark0624, 9, 10, Finished, Available)

[nltk_data] Downloading package punkt to /home/trusted-service-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/trusted-service-
[nltk_data]     user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# read file 
datalake_nm = 'datalake'+mssparkutils.env.getWorkspaceName()[7:] # get datalake name 
file_path ='abfss://files@{0}.dfs.core.windows.net/synapse/workspaces/data/youtube/youtube_cms'.format(datalake_nm)
Youtube_df = spark.read.parquet(file_path)

StatementMeta(spark0624, 9, 11, Finished, Available)

In [11]:
# Filter on words Count
Youtube_df_v1 = (
    Youtube_df.withColumn('wordCount', f.size(f.split(f.col('comment'), ' ')))
    .filter(f.col('wordCount')>5)
    .withColumn("id", monotonically_increasing_id())
    .select('Date','id','Comment','symbol','VideoID')
    ).toPandas()

StatementMeta(spark0624, 9, 12, Finished, Available)

In [12]:
# remove non english word
def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False
    
# change text into lowercase
def to_lowercase(text):
    return text.lower()

# remove punctuation
def rm_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def tokenise(text):
    return word_tokenize(text)

# remove stop words 
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

# rejoin tokens
def rejoin_tokens(tokens):
    return ' '.join(tokens)

# main function for preprocess_text 
def preprocess_text(text):
    text = to_lowercase(text)
    text = rm_punctuation(text)
    tokens = tokenise(text)
    tokens = remove_stopwords(tokens)
    return rejoin_tokens(tokens)

def is_string(text):
    return isinstance(text, str)

StatementMeta(spark0624, 9, 13, Finished, Available)

## 1. Data preprocessing

In [13]:
# Remove content is not english
Youtube_df_v1['is_english'] = Youtube_df_v1['Comment'].apply(is_english)
Youtube_df_v2 = Youtube_df_v1[Youtube_df_v1['is_english']]

# data preapration
Youtube_df_v2['preprocessed_text'] = Youtube_df_v2['Comment'].apply(preprocess_text)

# remove content is not string
Youtube_df_v2['is_string']=Youtube_df_v2['preprocessed_text'].apply(is_string)
Youtube_df_v3 = Youtube_df_v2[Youtube_df_v2['is_string']]

StatementMeta(spark0624, 9, 14, Finished, Available)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Youtube_df_v2['preprocessed_text'] = Youtube_df_v2['Comment'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Youtube_df_v2['is_string']=Youtube_df_v2['preprocessed_text'].apply(is_string)


# 2. Sentiment analysis with vader

In [14]:
def analyze_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(text)
    return scores['compound']

def categorize_sentiment(compound_score):
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

StatementMeta(spark0624, 9, 15, Finished, Available)

In [15]:
Youtube_df_v3['sentiment'] = Youtube_df_v3['preprocessed_text'].apply(analyze_sentiment)
Youtube_df_v3['sentiment_category'] = Youtube_df_v3['sentiment'].apply(categorize_sentiment)
Youtube_df_v4 = Youtube_df_v3[['id','sentiment','sentiment_category']]
final_df = pd.merge(Youtube_df_v1,Youtube_df_v4, left_on='id', right_on='id', how='left')
result = final_df[['Date','symbol','Comment','sentiment','sentiment_category']]

StatementMeta(spark0624, 9, 16, Finished, Available)

In [16]:
spdf = spark.createDataFrame(result)
datalake_nm = 'datalake'+mssparkutils.env.getWorkspaceName()[7:] # get datalake name 
file_path ='abfss://files@{0}.dfs.core.windows.net/synapse/workspaces/data/youtube/youtube_sentiments'.format(datalake_nm)
spdf.write.parquet(file_path,mode='overwrite')

StatementMeta(spark0624, 9, 17, Finished, Available)