In [1]:
import json
from operator import add

import matplotlib.pyplot as plt
import nltk
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [None]:
#conf = SparkConf()
#conf.setMaster('spark://192.168.2.119:7077')
#conf.setAppName('Strong_Scalibility')
#spark_context = SparkContext(conf=conf)

In [3]:
# Build (or reuse) the Spark session against the standalone cluster master.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.119:7077")
    .appName("TestApp")
    # Dynamic allocation is switched on together with the shuffle service.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .getOrCreate()
)

# Disabled option, kept for reference: .config("spark.executor.cores", 2)
spark_context = spark_session.sparkContext
spark_context.setLogLevel("INFO")

In [None]:
# Load the comment dump from HDFS as (byte offset, line text) pairs.
# hdfs://localhost:9000
# NOTE(review): the Spark master is a remote host while this URI uses
# `localhost` -- confirm that executors can reach the namenode at this address.
rdd = spark_context.newAPIHadoopFile(
    path='hdfs://localhost:9000/user/ubuntu/RC_2010-06',
    inputFormatClass='org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    keyClass='org.apache.hadoop.io.LongWritable',
    valueClass='org.apache.hadoop.io.Text',
)

In [None]:
# Peek at the first (key, value) record to check that the file loaded as expected.
rdd.take(1)

In [None]:
# Number of records (input lines) in the RDD.
print(rdd.count())

In [None]:
# Grammatical groups mapped to the Penn Treebank POS tags that belong to them.
# NOTE(review): 'Verb' is singular unlike the other keys; the key text is used
# as-is in the results and plot labels, so it is left unchanged here.
gram_groups = {
    'Adjectives': ['JJ', 'JJR', 'JJS'],
    'Nouns': ['NN', 'NNS', 'NNP', 'NNPS'],
    'Pronouns': ['PRP', 'PRP$'],
    'Adverbs': ['RB', 'RBR', 'RBS'],
    'Verb': ['VB', 'VBG', 'VBD', 'VBN', 'VBP', 'VBZ'],
}

In [None]:
def get_split_body(rdd):
    """Turn each JSON record into the whitespace-separated tokens of its body.

    Args:
        rdd: RDD of pairs whose second element is a JSON document (as text)
            containing a ``'body'`` string field.

    Returns:
        RDD with one list of tokens per input record.
    """
    def tokenize(record):
        # Only the value of the pair is used; the first element is ignored.
        comment = json.loads(record[1])
        return comment['body'].strip().split()

    return rdd.map(tokenize)

def check_gram_grp(tag_tuple):
    """Return the grammatical group a tagged word belongs to.

    Args:
        tag_tuple: ``(word, tag)`` pair; only the tag is inspected.

    Returns:
        The first key of ``gram_groups`` whose tag list contains the tag,
        or ``None`` when no group lists it.
    """
    _, pos_tag = tag_tuple
    matching_groups = (
        group for group, tags in gram_groups.items() if pos_tag in tags
    )
    return next(matching_groups, None)

def categorize_words(split_rdd):
    """Tag every token and emit one ``(group, 1)`` pair per categorized word.

    Args:
        split_rdd: RDD whose elements are token lists (one list per record),
            e.g. the result of ``get_split_body``.

    Returns:
        RDD of ``(group_name, 1)`` pairs. Tokens for which
        ``check_gram_grp`` returns ``None`` are dropped.
    """
    # pos_tag receives a whole token list and returns (word, tag) pairs;
    # flatMap flattens the per-record lists into one stream of pairs.
    tagged_words = split_rdd.flatMap(lambda tokens: nltk.pos_tag(tokens))
    grouped = tagged_words.map(lambda tagged: (check_gram_grp(tagged), 1))
    # Uncategorized words carry a None key; test by identity, not equality.
    return grouped.filter(lambda pair: pair[0] is not None)

In [None]:
# Tokenize every record body, then map each word to its grammatical group.
split = get_split_body(rdd)
categorized = categorize_words(split)
# Add up the per-word 1s for each group and collect the (group, count) pairs.
group_counts = categorized.reduceByKey(add).collect()
print(group_counts)

In [None]:
# Total number of categorized words across all groups.
total = sum(count for _, count in group_counts)
# Each group's count expressed as a fraction of that total.
group_counts_norm = [(group, count / total) for group, count in group_counts]
print(total)

In [None]:
# Plot of the grammatical classes: raw word count per group.
# Split the (group, count) pairs into parallel lists for plotting; both lists
# are reused by the normalized plot further down.
grammatical_group = [group for group, _ in group_counts]
numbers = [count for _, count in group_counts]

plt.plot(grammatical_group, numbers)
plt.xlabel('Grammatical Group')
plt.ylabel('Number of words')
plt.title('Word count per grammatical group')
plt.show()

In [None]:
# Same plot with every count divided by the total number of categorized words.
gram_procentage = [count / total for count in numbers]

plt.plot(grammatical_group, gram_procentage)
# Label the axes so this figure can be told apart from the raw-count plot.
plt.xlabel('Grammatical Group')
plt.ylabel('Share of categorized words')
plt.title('Normalized word count per grammatical group')
plt.show()

# An alternative plot variant in case we want a bar chart instead /Oscar

In [None]:
# Bar-chart variant: share of categorized words per grammatical group.
bar_labels = [group for group, _ in group_counts]
# The y-axis is labelled as a normalized frequency, so plot count / total
# rather than the raw counts.
bar_heights = [count / total for _, count in group_counts]

fig = plt.figure()
plt.bar(bar_labels, bar_heights, color='Orange')
plt.grid(axis='y')
plt.xlabel('Grammatical Group')
plt.ylabel('Normalized frequency')
plt.title(f'Total amount of categorized words: {total}',fontsize=10)
plt.suptitle("Frequency of words in grammatical groups")
plt.show()