In [1]:
import re
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,col
from pyspark.sql.types import *
#Connect to the cluster
# New API
spark = SparkSession\
        .builder\
        .master("spark://192.168.2.207:7077") \
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .appName("pa_test")\
        .getOrCreate()

# Old API (RDD)
sc = spark.sparkContext

sc.setLogLevel("INFO")

# Research if there is a correlation between comment length - positive/negative words - score

## Load sentiment files

In [2]:
nlines = sc.textFile("hdfs://192.168.2.207:9000/user/ubuntu/negative-words.txt")
plines = sc.textFile("hdfs://192.168.2.207:9000/user/ubuntu/positive-words.txt")

In [3]:
nlines.take(10)

['2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted']

## Prepare negative and positive word data

In [4]:
def compile_regexp(word_list):
    re_string = "[\s\W]("
    for word in word_list:
        re_string += (re.escape(word) + "|")
    re_string = re_string[0:-1] + ")[\s\W]"
    return re.compile(re_string, re.IGNORECASE)

negative = compile_regexp(nlines.collect())
sc.broadcast(negative)
positive = compile_regexp(plines.collect())
sc.broadcast(positive)


#df = spark.read.json("hdfs://192.168.2.207:9000/user/ubuntu/RC_2010-*")
#data_clean = df.select("subreddit", "body", "score", "controversiality")

<pyspark.broadcast.Broadcast at 0x7f413051b128>

In [5]:
print(type(negative))

<class '_sre.SRE_Pattern'>


## Load Reddit comments

In [49]:
df_test = spark.read.json("hdfs://192.168.2.207:9000/user/ubuntu/sample_data.json")
df_correlate_score_length = df_test.select("subreddit", "body", "score")

In [50]:
print(type(df_test))

<class 'pyspark.sql.dataframe.DataFrame'>


In [51]:
df_correlate_score_length.show()

+------------------+--------------------+-----+
|         subreddit|                body|score|
+------------------+--------------------+-----+
|          sandiego|            A quarry|    3|
|              RWBY|[Salutations! I'm...|    3|
|          baseball|I got into baseba...|    2|
|         2007scape|        FUCKING TORY|   18|
| mildlyinteresting|I see a water dra...|    1|
|            Cubers|Wait. The Michiga...|    1|
|         teenagers|              ye fam|    2|
|       4chan4trump|143417804| &gt; U...|    1|
|               CFB|That is some chic...|    2|
|        rugbyunion|Does he even know...|    1|
|               CFB|            Tequila.|    2|
|         EchoArena|your heart beats ...|    1|
|               HFY|&gt; Subscribe: /...|    1|
|        The_Donald|you're really ign...|    2|
|        CrazyIdeas|lets see how deep...|    1|
|             NBA2k|You are arguing t...|    2|
|           opiates|I'm thinking abou...|    2|
|ImagesOfNewZealand|[Original post](h...

In [52]:
df_correlate_score_length.printSchema()

root
 |-- subreddit: string (nullable = true)
 |-- body: string (nullable = true)
 |-- score: long (nullable = true)



## Filter to exclude empty/missing comment or score

In [53]:
#import org.apache.spark.sql.SparkSession
df_correlate_score_length = df_correlate_score_length.withColumn("score", df_correlate_score_length['score'].cast('int'))
df_clean = df_correlate_score_length.filter( (df_correlate_score_length.body != "[deleted]"))

## Add 3 new Cols, # words, #of negative words, # of positive words

In [54]:
def count_negative(comment):
    return len(negative.findall(comment))

def count_positive(comment):
    return len(positive.findall(comment))

def count_words(comment):
    return len(comment.split())

udf_count_negative = udf(count_negative, IntegerType())
udf_count_positive = udf(count_positive, IntegerType())
udf_count_words = udf(count_words, IntegerType())


word_count = df_clean.withColumn('words', udf_count_words('body'))
negative_add = word_count.withColumn('negativeWords', udf_count_negative('body'))
neg_pos_df = negative_add.withColumn('positiveWords', udf_count_positive('body'))

In [None]:
def match_negative(comment, wc):
    return len(negative.findall(comment))/wc

def match_positive(comment, wc):
    return len(positive.findall(comment))/wc

def count_words(comment):
    return len(comment.split())

udf_match_negative = udf(match_negative, DoubleType())
udf_match_positive = udf(match_positive, DoubleType())
udf_count_words = udf(count_words, IntegerType())


In [55]:
neg_pos_df.show()

+------------------+--------------------+-----+-----+-------------+-------------+
|         subreddit|                body|score|words|negativeWords|positiveWords|
+------------------+--------------------+-----+-----+-------------+-------------+
|          sandiego|            A quarry|    3|    2|            0|            0|
|              RWBY|[Salutations! I'm...|    3|    7|            0|            0|
|          baseball|I got into baseba...|    2|   27|            1|            0|
|         2007scape|        FUCKING TORY|   18|    2|            0|            0|
| mildlyinteresting|I see a water dra...|    1|    5|            0|            0|
|            Cubers|Wait. The Michiga...|    1|   20|            0|            1|
|         teenagers|              ye fam|    2|    2|            0|            0|
|       4chan4trump|143417804| &gt; U...|    1|   29|            0|            0|
|               CFB|That is some chic...|    2|    9|            1|            0|
|        rugbyun

In [None]:
#Group By subreddit

#find "happiest" subreddit of the day 

In [13]:
print(type(neg_pos_df))

<class 'pyspark.sql.dataframe.DataFrame'>


## Research for correlation between score and #positive or #negative words

* Assume normal distribution in length of comments
* This way we can create intervals to split the data and search for patterns if any
* For example if a comment falls near the average length of comments is it more likely to contain more negative or positive words?

## Another approach

* Assume normal dist in score among all posts
* Create intervals 
* 

In [35]:
#neg_pos_df.printSchema()
#neg_pos_df.select('score', 'words', 'negativeWords','positiveWords').show()
df_cl1 = neg_pos_df.select("score", "words", "negativeWords", "positiveWords", "subreddit")

In [36]:
df_cl1.show()

+-----+-----+-------------+-------------+
|score|words|negativeWords|positiveWords|
+-----+-----+-------------+-------------+
|    2|   12|            0|            1|
|    1|   11|            1|            0|
|    1|   25|            0|            1|
|    2|   45|            1|            2|
|    2|    5|            1|            0|
|    0|   96|            6|            4|
|   28|   16|            1|            0|
|    0|    4|            0|            0|
|    0|   21|            1|            2|
|    1|  345|           11|           13|
|    2|   87|            6|            2|
|    3|   42|            1|            0|
|    1|   88|            0|            4|
|    2|   16|            0|            1|
|   21|   20|            0|            1|
|    1|  149|            1|            6|
|    8|    1|            0|            0|
|    4|   23|            1|            1|
|    1|   25|            0|            0|
|    1|    9|            0|            1|
+-----+-----+-------------+-------

In [46]:
max_len = df_cl1.agg({"words": "max"}).first()[0]

In [47]:
max_score

1823