In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
spark_session = SparkSession.builder \
    .master("spark://192.168.2.47:7077") \
    .appName("Group40_Project") \
    .config("spark.dynamicAllocation.enabled", True) \
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True) \
    .config("spark.shuffle.service.enabled", False) \
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s") \
    .config("spark.executor.memory", "5G") \
    .config("spark.cores.max", 12) \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/19 00:38:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/19 00:38:11 WARN StandaloneSchedulerBackend: Dynamic allocation enabled without spark.executor.cores explicitly set, you may get more executors allocated than expected. It's recommended to set spark.executor.cores explicitly. Please check SPARK-30299 for more details.


## Loading the data as a DataFrame

In [3]:
# Loading the dataset without the corrupted values
df = spark_session.read.option("mode", "DROPMALFORMED").json("hdfs://192.168.2.47:9000/data-project/corpus-webis-tldr-17.json")

                                                                                

In [4]:
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- id: string (nullable = true)
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)
 |-- title: string (nullable = true)



In [5]:
df.show(10)

+------------------+--------------------+--------------------+-----------+-------+--------------------+--------------------+------------+--------------------+-----------+--------+
|            author|                body|             content|content_len|     id|      normalizedBody|           subreddit|subreddit_id|             summary|summary_len|   title|
+------------------+--------------------+--------------------+-----------+-------+--------------------+--------------------+------------+--------------------+-----------+--------+
|  raysofdarkmatter|I think it should...|I think it should...|        178|c69al3r|I think it should...|                math|    t5_2qh0n|Shifting seasonal...|          8|    NULL|
|           Stork13|Art is about the ...|Art is about the ...|        148|c6a9nxd|Art is about the ...|               funny|    t5_2qh33|Personal opinions...|          4|    NULL|
|     Cloud_dreamer|Ask me what I thi...|Ask me what I thi...|         76|c6acx4l|Ask me what I thi.

In [6]:
df_prep = df.drop('body', 'content', 'id', 'subreddit_id', 'title', 'author', 'content_len', 'summary', 'summary_len')
#df_prep = df_prep.withColumn("content_len", F.col("content_len").cast("int"))
#df_prep = df_prep.withColumn("summary_len", F.col("summary_len").cast("int"))

In [7]:
df_prep.printSchema()

root
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)



In [8]:
df_prep.show(10)

+--------------------+--------------------+
|      normalizedBody|           subreddit|
+--------------------+--------------------+
|I think it should...|                math|
|Art is about the ...|               funny|
|Ask me what I thi...|         Borderlands|
|In Mechwarrior On...|            gamingpc|
|You are talking a...|              Diablo|
|All but one of my...|   RedditLaqueristas|
|I could give a sh...|               apple|
|So you're saying ...|               apple|
|I love this idea ...|RedditFilmsProduc...|
|Theres an entire ...|       AbandonedPorn|
+--------------------+--------------------+
only showing top 10 rows



## Preprocessing texts

In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove special characters and extra spaces
    text = re.sub(r'\W+', ' ', text)
    
    # Lowercase and split text into words
    words = text.split()

    # Remove stopwords
    stop_words = stopwords.words('english')
    words = [w for w in words if w not in stop_words]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]

    out_text = ' '.join(words)

    return out_text

In [11]:
t = preprocess_text("This is a sample text to check which words are removed or not.")
t

'sample text check word removed'

In [12]:
df_prep.select('subreddit') \
       .groupBy('subreddit').count() \
       .sort('count', ascending=False) \
       .show(10)

                                                                                

+-------------------+------+
|          subreddit| count|
+-------------------+------+
|          AskReddit|589947|
|      relationships|352049|
|    leagueoflegends|109307|
|               tifu| 52219|
|relationship_advice| 50416|
|              trees| 47286|
|             gaming| 43851|
|            atheism| 43268|
|      AdviceAnimals| 40783|
|              funny| 40171|
+-------------------+------+
only showing top 10 rows



In [13]:
top10_subreddits = ['AskReddit', 'relationships', 'leagueoflegends', 'tifu', 'relationship_advice', 'trees', 'gaming', 'atheism', 'AdviceAnimals', 'funny']

In [14]:
df_top10_sr = df_prep.filter(df_prep['subreddit'].isin(top10_subreddits))

In [15]:
df_top10_sr.show(10)

+--------------------+---------------+
|      normalizedBody|      subreddit|
+--------------------+---------------+
|Art is about the ...|          funny|
|FALSE. Evidence: ...|        atheism|
|Yeah, but most fo...|      AskReddit|
|Didn't they lose ...|leagueoflegends|
|You probably won'...|      AskReddit|
|This picture does...|          trees|
|I want to say thi...|      AskReddit|
|I take a beta blo...|      AskReddit|
|About two months ...|      AskReddit|
|While I was under...|      AskReddit|
+--------------------+---------------+
only showing top 10 rows



In [16]:
from pyspark.sql.functions import udf

In [17]:
preprocess_text_udf = udf(preprocess_text, "string")

In [18]:
df_top10_sr = df_top10_sr.withColumn("clean_text", preprocess_text_udf(df_top10_sr['normalizedBody']))

In [19]:
df_top10_sr.show(10)

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------+---------------+--------------------+
|      normalizedBody|      subreddit|          clean_text|
+--------------------+---------------+--------------------+
|Art is about the ...|          funny|art hardest thing...|
|FALSE. Evidence: ...|        atheism|false evidence wo...|
|Yeah, but most fo...|      AskReddit|yeah folk think a...|
|Didn't they lose ...|leagueoflegends|lose 6 game row c...|
|You probably won'...|      AskReddit|probably come as ...|
|This picture does...|          trees|picture follow we...|
|I want to say thi...|      AskReddit|want say two week...|
|I take a beta blo...|      AskReddit|take beta blocker...|
|About two months ...|      AskReddit|two month ago viv...|
|While I was under...|      AskReddit|spinal surgery ye...|
+--------------------+---------------+--------------------+
only showing top 10 rows



                                                                                

## Rule based sentiment analysis with VADER

In [20]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [21]:
sia = SentimentIntensityAnalyzer()

In [22]:
def sentiment_score(text):
    return sia.polarity_scores(text)['compound']

def sentiment_label(score):
    if score > 0.05:
        return 'positive'
    elif score < -0.05:
        return 'negative'
    else:
        return 'neutral'

In [23]:
sentiment_score_udf = udf(sentiment_score, "float")
sentiment_label_udf = udf(sentiment_label, "string")

In [24]:
df_vader = df_top10_sr.withColumn("sentiment_score", sentiment_score_udf(df_top10_sr['clean_text']))
df_vader = df_vader.withColumn("sentiment_label", sentiment_label_udf(df_vader['sentiment_score']))

In [25]:
df_vader.show(2)

[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+---------+--------------------+---------------+---------------+
|      normalizedBody|subreddit|          clean_text|sentiment_score|sentiment_label|
+--------------------+---------+--------------------+---------------+---------------+
|Art is about the ...|    funny|art hardest thing...|          0.128|       positive|
|FALSE. Evidence: ...|  atheism|false evidence wo...|        -0.9607|       negative|
+--------------------+---------+--------------------+---------------+---------------+
only showing top 2 rows



                                                                                

In [26]:
df_vader.createTempView('vader')

In [27]:
spark_session.sql(
    """
    SELECT subreddit, ROUND(AVG(sentiment_score), 4)
    FROM vader
    GROUP BY subreddit
    LIMIT 10
    """
).show()



+-------------------+------------------------------+
|          subreddit|round(avg(sentiment_score), 4)|
+-------------------+------------------------------+
|          AskReddit|                        0.1232|
|      AdviceAnimals|                         0.098|
|      relationships|                         0.496|
|              trees|                        0.4134|
|relationship_advice|                        0.5632|
|    leagueoflegends|                        0.3344|
|            atheism|                        0.2702|
|             gaming|                        0.3343|
|               tifu|                        0.0718|
|              funny|                        0.1142|
+-------------------+------------------------------+



                                                                                

In [None]:
df_prep.createTempView("reddit")

In [10]:
spark_session.sql(
    """
    SELECT subreddit, ROUND(AVG(content_len), 2)
    FROM reddit
    GROUP BY subreddit
    """
).show(10)



+--------------------+--------------------------+
|           subreddit|round(avg(content_len), 2)|
+--------------------+--------------------------+
|               anime|                    235.91|
|          MensRights|                    289.63|
|              travel|                    263.82|
|londonfootballmeetup|                     284.5|
|               HPMOR|                    285.16|
|     youtubecomments|                    157.13|
|        SaltLakeCity|                    190.72|
| UnresolvedMysteries|                    391.01|
|          MLBTheShow|                    215.68|
|           metro2033|                    101.63|
+--------------------+--------------------------+
only showing top 10 rows



                                                                                