# Sentiment analysis on Reddit comments

In [8]:
!pip install vaderSentiment langdetect emoji nltk vaderSentiment

In [9]:
from pyspark.sql.functions import udf,monotonically_increasing_id
from pyspark.sql.types import StringType, FloatType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.sql import functions as f  
import requests
import pandas as pd 
import datetime
import emoji
import nltk
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from notebookutils import mssparkutils

In [10]:
# Fetch the NLTK data used by the preprocessing helpers below:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize
#     (recent NLTK releases load 'punkt_tab' instead of 'punkt'; nltk is
#     installed unpinned above, so both are downloaded)
#   - 'stopwords': the English stop-word list
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [11]:
# Load the raw Reddit comments (parquet) from the workspace data lake.
# The storage account name is 'datalake' followed by the workspace name with
# its first seven characters dropped.
datalake_nm = 'datalake' + mssparkutils.env.getWorkspaceName()[7:]
file_path = 'abfss://files@{0}.dfs.core.windows.net/synapse/workspaces/data/reddit_cms'.format(datalake_nm)
reddit_df = spark.read.format('parquet').load(file_path)

## 1. Data preprocessing

In [12]:
# detect whether a comment is written in English
def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: The comment to classify. Anything that is not a non-blank
            string (None, NaN, numbers, '' or whitespace) is treated as
            "not English" instead of being passed to the detector.

    Returns:
        bool: True if the detected language code is 'en', otherwise False
        (including when langdetect cannot detect any language).
    """
    # Guard first: detect() expects a string, and a null/NaN cell would raise
    # an error that the except clause below does not catch.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Raised by langdetect when detection fails for the given text.
        return False
    
# convert text to lowercase
def to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# strip punctuation characters
def rm_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # Translation table with three-argument maketrans: no replacements, only
    # deletions of the punctuation characters.
    deletion_table = str.maketrans('', '', string.punctuation)
    stripped = text.translate(deletion_table)
    return stripped

def tokenise(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# remove stop words
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(tokens, stop_words=None):
    """Drop stop words from a list of tokens.

    Args:
        tokens: Iterable of token strings.
        stop_words: Optional collection of words to remove. When omitted, the
            NLTK English stop-word list is used. That list is loaded once and
            reused, instead of being re-read and rebuilt on every call (this
            function is applied once per comment).

    Returns:
        list: The tokens that are not in the stop-word collection, in their
        original order.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

# join tokens back into one string
def rejoin_tokens(tokens):
    """Concatenate ``tokens`` into a single string separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

# entry point for text preprocessing
def preprocess_text(text):
    """Clean a comment and return it as a space-joined string of tokens.

    Steps, in order: lowercase, strip punctuation, tokenise, drop stop words,
    rejoin the remaining tokens with spaces.
    """
    cleaned = rm_punctuation(to_lowercase(text))
    kept_tokens = remove_stopwords(tokenise(cleaned))
    return rejoin_tokens(kept_tokens)

def is_string(text):
    """Return True if ``text`` is a ``str`` instance, otherwise False."""
    result = isinstance(text, str)
    return result

In [13]:
# Filter on word count: keep comments with more than five space-separated
# words, tag each row with a unique id and collect the result to pandas.
reddit_df_v1 = (
    reddit_df.withColumn('wordCount', f.size(f.split(f.col('comment'), ' ')))
    .filter(f.col('wordCount')>5)
    .withColumn("id", monotonically_increasing_id())
    .select('created_utc','id','comment','symbol')
    ).toPandas()

# Remove content that is not English. The flag column stays on reddit_df_v1.
reddit_df_v1['is_english'] = reddit_df_v1['comment'].apply(is_english)
# .copy() makes the filtered frame independent of reddit_df_v1, so the column
# assignments below do not write to a slice (avoids SettingWithCopyWarning).
reddit_df_v2 = reddit_df_v1[reddit_df_v1['is_english']].copy()

# Data preparation: lowercase, strip punctuation, drop stop words.
reddit_df_v2['preprocessed_text'] = reddit_df_v2['comment'].apply(preprocess_text)

# Remove rows whose preprocessed text is not a string.
reddit_df_v2['is_string'] = reddit_df_v2['preprocessed_text'].apply(is_string)
# Copied for the same reason as above: later cells add columns to this frame.
reddit_df_v3 = reddit_df_v2[reddit_df_v2['is_string']].copy()

## 2. Sentiment analysis with VADER

In [14]:
_VADER_ANALYZER = None  # created on first use by _get_vader_analyzer()


def _get_vader_analyzer():
    """Return one shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def analyze_sentiment(text, analyzer=None):
    """Return the VADER compound sentiment score for ``text``.

    Args:
        text: The text to score.
        analyzer: Optional object exposing ``polarity_scores(text)`` that
            returns a mapping with a 'compound' key. When omitted, a single
            shared SentimentIntensityAnalyzer is used, so the analyzer is not
            rebuilt for every row this function is applied to.

    Returns:
        The value stored under 'compound' in the analyzer's score mapping.
    """
    if analyzer is None:
        analyzer = _get_vader_analyzer()
    scores = analyzer.polarity_scores(text)
    return scores['compound']

def categorize_sentiment(compound_score):
    """Map a compound score to 'positive', 'negative' or 'neutral'.

    Scores of 0.05 or more are 'positive', scores of -0.05 or less are
    'negative', and everything else is 'neutral'.
    """
    if compound_score >= 0.05:
        return 'positive'
    if compound_score <= -0.05:
        return 'negative'
    return 'neutral'

In [15]:
# Get sentiments: score the preprocessed text, then bucket the score.
# .assign() returns a new frame, so nothing is written onto the filtered slice
# produced in the previous cell (avoids SettingWithCopyWarning).
# NOTE(review): the scored text has been lower-cased and stripped of
# punctuation and stop words, cues that VADER's documentation describes it as
# using. Consider scoring the raw 'comment' column instead; confirm intent.
reddit_df_v3 = reddit_df_v3.assign(
    sentiment=lambda df: df['preprocessed_text'].apply(analyze_sentiment),
    sentiment_category=lambda df: df['sentiment'].apply(categorize_sentiment),
)

reddit_df_v4 = reddit_df_v3[['id','created_utc','sentiment','sentiment_category']]

# Left join so comments that were filtered out (e.g. non-English) are kept
# with missing sentiment values. Both frames carry 'created_utc', so pandas
# suffixes them; 'created_utc_x' is the column coming from reddit_df_v1.
final_df = pd.merge(reddit_df_v1, reddit_df_v4, on='id', how='left')

# Unmatched rows get float NaN in the otherwise string-valued category column.
# Mixed str/float values can make spark.createDataFrame fail schema inference
# when Arrow conversion is not in use, so store None for missing categories.
final_df['sentiment_category'] = (
    final_df['sentiment_category']
    .astype(object)
    .where(final_df['sentiment_category'].notna(), None)
)

result = final_df[['created_utc_x','symbol','comment','sentiment','sentiment_category']]

In [16]:
# Persist the scored comments to the data lake as parquet, replacing any
# output already present at the target path.
spdf = spark.createDataFrame(result)
# Storage account name: 'datalake' + workspace name minus its first seven characters.
datalake_nm = 'datalake' + mssparkutils.env.getWorkspaceName()[7:]
file_path = 'abfss://files@{0}.dfs.core.windows.net/synapse/workspaces/data/reddit_sentiments'.format(datalake_nm)
spdf.write.mode('overwrite').parquet(file_path)