In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse) a session on the standalone master with dynamic allocation
# and shuffle tracking enabled and the external shuffle service disabled.
# NOTE(review): the startup log recommends setting spark.executor.cores
# explicitly when dynamic allocation is on (SPARK-30299) — pick a value that
# matches this cluster's workers.
session_builder = (
    SparkSession.builder
    .master("spark://192.168.2.47:7077")
    .appName("Group40_Project")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.memory", "5G")
    .config("spark.cores.max", 12)
)
spark_session = session_builder.getOrCreate()

# RDD API
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/19 22:41:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/19 22:41:38 WARN StandaloneSchedulerBackend: Dynamic allocation enabled without spark.executor.cores explicitly set, you may get more executors allocated than expected. It's recommended to set spark.executor.cores explicitly. Please check SPARK-30299 for more details.


## Loading the data as a DataFrame

In [4]:
# Read the Reddit sample from HDFS; DROPMALFORMED mode discards records that
# cannot be parsed instead of keeping them as corrupt rows.
reddit_reader = spark_session.read.option("mode", "DROPMALFORMED")
df = reddit_reader.json("hdfs://192.168.2.47:9000/data-project/reddit_50k.json")

                                                                                

In [5]:
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- id: string (nullable = true)
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)
 |-- title: string (nullable = true)



In [6]:
df.show(10)

+---------------+--------------------+--------------------+-----------+-------+--------------------+---------------+------------+--------------------+-----------+--------------------+
|         author|                body|             content|content_len|     id|      normalizedBody|      subreddit|subreddit_id|             summary|summary_len|               title|
+---------------+--------------------+--------------------+-----------+-------+--------------------+---------------+------------+--------------------+-----------+--------------------+
|       SickSean|There was a inter...|There was a inter...|        201|c6jejpr|There was a inter...|        answers|    t5_2qkeh|Japan got lazy, a...|          9|                NULL|
|       debee1jp|A few months ago:...|A few months ago:...|         55|c6r75ss|A few months ago:...|leagueoflegends|    t5_2rfxx|       Brand is fine|          3|How to bring Bran...|
|EveningCrickets|I voted for Obama...|I voted for Obama...|        425|c6st5gy|I

In [7]:
# Keep only the two columns the analysis uses. An explicit projection states
# the resulting schema directly (and fails fast if a column is missing) instead
# of enumerating every column to discard.
df_prep = df.select('normalizedBody', 'subreddit')

In [8]:
df_prep.printSchema()

root
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)



In [9]:
df_prep.show(10)

+--------------------+---------------+
|      normalizedBody|      subreddit|
+--------------------+---------------+
|There was a inter...|        answers|
|A few months ago:...|leagueoflegends|
|I voted for Obama...|      AskReddit|
|> Overall music s...|     technology|
|"Contributing to ...|            mwo|
|I don't know what...|      AskReddit|
|I remember some t...|           pics|
|Are you being sar...|  relationships|
|      **Morsi's a...|      worldnews|
|I think there are...|      AskReddit|
+--------------------+---------------+
only showing top 10 rows



## Preprocessing texts

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by preprocess_text: the stop-word list and the
# WordNet data behind WordNetLemmatizer.
# NOTE(review): this downloads on the machine running the notebook only; the
# text-cleaning UDF runs on Spark executors — confirm the same data is
# available on every worker.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Lazily-built NLTK resources shared by every preprocess_text call, so the
# stop-word list is not re-read and the lemmatizer is not re-created per row.
_STOP_WORDS = None
_LEMMATIZER = None


def _load_text_resources():
    """Return ``(stop_words, lemmatizer)``, building each on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        # A frozenset gives constant-time membership tests (the list was O(n)).
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """Normalise a piece of text for downstream analysis.

    Steps: lowercase, replace every run of non-word characters with a single
    space, split on whitespace, drop English stop words, lemmatize each
    remaining word, and join the result with single spaces.

    Args:
        text: The raw string, or ``None`` (the source column is nullable).

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None``.
    """
    # Null column values reach a Spark UDF as None; pass them through rather
    # than failing the whole job on ``None.lower()``.
    if text is None:
        return None

    stop_words, lemmatizer = _load_text_resources()

    # Lowercase, then collapse special characters / extra spaces before splitting.
    words = re.sub(r'\W+', ' ', text.lower()).split()

    return ' '.join(lemmatizer.lemmatize(w) for w in words if w not in stop_words)

In [12]:
t = preprocess_text("This is a sample text to check which words are removed or not.")
t

'sample text check word removed'

In [32]:
# Rank subreddits by number of posts and keep the 25 largest. The subreddit
# name is a secondary sort key so that ties on count are ordered (and cut off
# by limit) the same way on every run.
df_top = (
    df_prep.select('subreddit')
    .groupBy('subreddit').count()
    .sort(F.desc('count'), F.asc('subreddit'))
    .limit(25)
)

In [56]:
# Collect the ranked rows once (calling collect() inside a loop re-runs the
# Spark job on every iteration) and take however many rows came back, so fewer
# than 25 distinct subreddits no longer raises IndexError.
top25_subreddits = [row['subreddit'] for row in df_top.collect()]

print(top25_subreddits)

['AskReddit', 'relationships', 'leagueoflegends', 'relationship_advice', 'trees', 'tifu', 'gaming', 'funny', 'atheism', 'AdviceAnimals', 'politics', 'pics', 'sex', 'todayilearned', 'WTF', 'Fitness', 'explainlikeimfive', 'worldnews', 'TwoXChromosomes', 'DotA2', 'videos', 'DestinyTheGame', 'IAmA', 'reddit.com', 'offmychest']


In [13]:
# Hard-coded list of ten subreddit names.
top10_subreddits = [
    'AskReddit',
    'relationships',
    'leagueoflegends',
    'tifu',
    'relationship_advice',
    'trees',
    'gaming',
    'atheism',
    'AdviceAnimals',
    'funny',
]

In [57]:
# Keep only posts from the most active subreddits. Despite the variable name,
# the membership list applied here is top25_subreddits.
is_top_subreddit = F.col('subreddit').isin(top25_subreddits)
df_top10_sr = df_prep.filter(is_top_subreddit)

In [58]:
df_top10_sr.show(10)

+--------------------+---------------+
|      normalizedBody|      subreddit|
+--------------------+---------------+
|A few months ago:...|leagueoflegends|
|I voted for Obama...|      AskReddit|
|I don't know what...|      AskReddit|
|I remember some t...|           pics|
|Are you being sar...|  relationships|
|      **Morsi's a...|      worldnews|
|I think there are...|      AskReddit|
|The original Mari...|      AskReddit|
|Ate some acid wit...|      AskReddit|
|I had a teacher i...|            WTF|
+--------------------+---------------+
only showing top 10 rows



In [59]:
from pyspark.sql.functions import udf

In [60]:
# Wrap the text cleaner as a Spark UDF that returns a string column.
preprocess_text_udf = udf(preprocess_text, returnType="string")

In [61]:
# Add the cleaned version of each post next to the original body.
cleaned_body = preprocess_text_udf(F.col('normalizedBody'))
df_top10_sr = df_top10_sr.withColumn("clean_text", cleaned_body)

## Rule-based sentiment analysis with VADER

In [62]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [63]:
# Module-level analyzer instance; sentiment_score reads it as a global.
sia = SentimentIntensityAnalyzer()

In [64]:
def sentiment_score(text):
    """Return the VADER compound score for ``text``.

    Args:
        text: The string to score, or ``None``.

    Returns:
        The ``'compound'`` entry of ``sia.polarity_scores(text)``, or ``None``
        when ``text`` is ``None`` so that null rows stay null instead of
        raising inside the UDF.
    """
    if text is None:
        return None
    return sia.polarity_scores(text)['compound']

def sentiment_label(score):
    """Map a compound sentiment score to a categorical label.

    Args:
        score: A numeric score, or ``None``.

    Returns:
        ``'positive'`` when ``score > 0.05``, ``'negative'`` when
        ``score < -0.05``, ``'neutral'`` otherwise, and ``None`` when
        ``score`` is ``None``.
    """
    # A null score (e.g. an aggregate over no non-null values) cannot be
    # compared with a float; propagate it instead of raising TypeError.
    if score is None:
        return None
    if score > 0.05:
        return 'positive'
    if score < -0.05:
        return 'negative'
    return 'neutral'

In [65]:
# Spark UDF wrappers: the score is returned as a float column, the label as a string column.
sentiment_score_udf = udf(sentiment_score, returnType="float")
sentiment_label_udf = udf(sentiment_label, returnType="string")

In [66]:
# Score every post, then bucket the score into a label.
# NOTE(review): the score is computed on clean_text (lowercased, with
# punctuation and stop words removed) rather than on normalizedBody — confirm
# this is the intended input for VADER.
df_vader = (
    df_top10_sr
    .withColumn("sentiment_score", sentiment_score_udf(F.col('clean_text')))
    .withColumn("sentiment_label", sentiment_label_udf(F.col('sentiment_score')))
)

In [67]:
df_vader.show(2)

[Stage 260:>                                                        (0 + 1) / 1]

+--------------------+---------------+--------------------+---------------+---------------+
|      normalizedBody|      subreddit|          clean_text|sentiment_score|sentiment_label|
+--------------------+---------------+--------------------+---------------+---------------+
|A few months ago:...|leagueoflegends|month ago brand g...|         0.8779|       positive|
|I voted for Obama...|      AskReddit|voted obama 2008 ...|          0.956|       positive|
+--------------------+---------------+--------------------+---------------+---------------+
only showing top 2 rows



                                                                                

In [68]:
# Mean compound score per subreddit, rounded to four decimals, capped at 25 rows.
mean_score = F.round(F.avg("sentiment_score"), 4).alias("avg_sentiment_score")
df_avg = df_vader.groupBy("subreddit").agg(mean_score).limit(25)

In [69]:
# Label each subreddit from its average score with the same UDF used for per-post labels.
avg_label = sentiment_label_udf(F.col('avg_sentiment_score'))
df_avg = df_avg.withColumn("sentiment_label", avg_label)

In [72]:
df_avg.show(25)



+-------------------+-------------------+---------------+
|          subreddit|avg_sentiment_score|sentiment_label|
+-------------------+-------------------+---------------+
|         offmychest|             0.2412|       positive|
|          AskReddit|             0.1281|       positive|
|             videos|             0.0718|       positive|
|              DotA2|             0.3839|       positive|
|      todayilearned|             0.1156|       positive|
|      AdviceAnimals|              0.074|       positive|
|     DestinyTheGame|              0.377|       positive|
|      relationships|             0.4971|       positive|
|               pics|             0.0994|       positive|
|            Fitness|             0.3721|       positive|
|         reddit.com|              0.242|       positive|
|          worldnews|            -0.1374|       negative|
|    TwoXChromosomes|             0.3153|       positive|
|           politics|             0.0821|       positive|
|             

25/03/19 23:47:12 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
25/03/19 23:47:12 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:981)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce