In [None]:
# import findspark
# findspark.init('C:\opt\spark\spark-2.2.1-bin-hadoop2.7')

import numpy as np
import pandas as pd

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql import functions

from bs4 import BeautifulSoup

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import matplotlib.pyplot as plt
%matplotlib inline

import datetime

In [None]:
# Reuse the running SparkContext if one exists. The previous form,
# `SparkContext().getOrCreate()`, invoked the constructor first, which raises
# "Cannot run multiple SparkContexts at once" whenever a context is already
# active (e.g. when this cell is executed a second time).
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.appName('CommentClean').getOrCreate()

## Constructiveness

In [None]:
# Read the constructiveness CSV; the first row supplies the column names.
# No schema is given, so every column is loaded as a string.
inputloc = "comment_w_constructive_tfidfbw.csv"
df = spark.read.option("header", "true").csv(inputloc)

In [None]:
# Keep the article key plus the 'constructiv' column, re-exposed as an integer
# column named "Constructiveness"; the original string column is dropped.
data = (
    df.select('article_id', 'constructiv')
      .withColumn("Constructiveness", functions.col("constructiv").cast(IntegerType()))
      .drop("constructiv")
)
data.show()

In [None]:
# Two per-article aggregates built from the same grouping:
#   data_count -> number of rows per article (cast to int)
#   data_sum   -> sum of the Constructiveness column per article
# NOTE(review): count() includes rows whose value is null (e.g. a failed cast),
# while sum() skips them — confirm that is the intended treatment.
by_article = data.groupBy('article_id')
data_count = by_article.count().withColumn("count", functions.col("count").cast(IntegerType()))
data_sum = by_article.sum('Constructiveness').withColumnRenamed("sum(Constructiveness)", "Constructiveness")

In [None]:
# Combine the row count and the Constructiveness sum into one row per article.
data = data_count.join(data_sum, on=['article_id'], how='inner')

In [None]:
# 'Non-Constructive' is whatever is left of the row count after subtracting the
# Constructiveness sum; the helper 'count' column is dropped afterwards.
remaining_con = data['count'] - data['Constructiveness']
data_con = (
    data.withColumn('Non-Constructive', remaining_con)
        .sort('article_id')
        .drop('count')
)

In [None]:
# Preview data_con (show() prints the first 20 rows by default).
data_con.show()

## Sentiment

In [None]:
# Read the sentiment CSV; the first row supplies the column names.
# No schema is given, so every column is loaded as a string.
inputloc = "comment_w_sentiment_bagofword.csv"
df = spark.read.option("header", "true").csv(inputloc)

In [None]:
# Keep the article key plus the sentiment column cast to an integer.
# NOTE(review): under Spark's default case-insensitive column resolution,
# withColumn("Sentiment", ...) replaces the existing 'sentiment' column rather
# than adding a second one — confirm spark.sql.caseSensitive is left at default.
data = (
    df.select('article_id', 'sentiment')
      .withColumn("Sentiment", functions.col("sentiment").cast(IntegerType()))
)
data.show()

In [None]:
# Two per-article aggregates built from the same grouping:
#   data_count -> number of rows per article (cast to int)
#   data_sum   -> sum of the Sentiment column per article
by_article = data.groupBy('article_id')
data_count = by_article.count().withColumn("count", functions.col("count").cast(IntegerType()))
data_sum = by_article.sum('Sentiment').withColumnRenamed("sum(Sentiment)", "Sentiment")

In [None]:
# Combine the row count and the Sentiment sum into one row per article.
data = data_count.join(data_sum, on=['article_id'], how='inner')

In [None]:
# Preview the current `data` frame (show() prints the first 20 rows by default).
data.show()

In [None]:
# 'Neg-Sentiment' is whatever is left of the row count after subtracting the
# Sentiment sum; the helper 'count' column is dropped afterwards.
remaining_sentiment = data['count'] - data['Sentiment']
data_sentiment = (
    data.withColumn('Neg-Sentiment', remaining_sentiment)
        .sort('article_id')
        .drop('count')
)

In [None]:
# Preview data_sentiment (show() prints the first 20 rows by default).
data_sentiment.show()

## Toxic

In [None]:
# Read the toxicity CSV; the first row supplies the column names.
# No schema is given, so every column is loaded as a string.
inputloc = "comment_w_toxic_tfidfbw.csv"
df = spark.read.option("header", "true").csv(inputloc)

In [None]:
# Keep the article key plus the toxic column cast to an integer.
# NOTE(review): under Spark's default case-insensitive column resolution,
# withColumn("Toxic", ...) replaces the existing 'toxic' column rather than
# adding a second one — confirm spark.sql.caseSensitive is left at default.
data = (
    df.select('article_id', 'toxic')
      .withColumn("Toxic", functions.col("toxic").cast(IntegerType()))
)
data.show()

In [None]:
# Two per-article aggregates built from the same grouping:
#   data_count -> number of rows per article (cast to int)
#   data_sum   -> sum of the Toxic column per article
by_article = data.groupBy('article_id')
data_count = by_article.count().withColumn("count", functions.col("count").cast(IntegerType()))
data_sum = by_article.sum('Toxic').withColumnRenamed("sum(Toxic)", "Toxic")

In [None]:
# Combine the row count and the Toxic sum into one row per article, then preview.
data = data_count.join(data_sum, on=['article_id'], how='inner')
data.show()

In [None]:
# 'NonToxic' is whatever is left of the row count after subtracting the Toxic
# sum; the helper 'count' column is dropped afterwards.
remaining_toxic = data['count'] - data['Toxic']
data_toxic = (
    data.withColumn('NonToxic', remaining_toxic)
        .sort('article_id')
        .drop('count')
)
data_toxic.show()

## Merge three DFs

In [None]:
# Inner join: only articles present in both the constructiveness and the
# sentiment summaries survive.
joined_df = data_con.join(data_sentiment, on=['article_id'], how='inner')

In [None]:
# Preview joined_df (show() prints the first 20 rows by default).
joined_df.show()

In [None]:
# Build the three-way inner join from the source summaries instead of from
# `joined_df` itself. The previous self-referential form
# (`joined_df = joined_df.join(data_toxic, ...)`) was not idempotent: running
# the cell a second time joined data_toxic again and produced duplicate
# 'Toxic' / 'NonToxic' columns, breaking later selects on those names.
# On a fresh top-to-bottom run the result is the same as before.
joined_df = (
    data_con
    .join(data_sentiment, ['article_id'], 'inner')
    .join(data_toxic, ['article_id'], 'inner')
)

In [None]:
# Preview joined_df (show() prints the first 20 rows by default).
joined_df.show()

In [None]:
# Spark writes 'data.csv' as a directory of part files. The default save mode
# raises if that path already exists, which made every re-run of the notebook
# fail here; overwrite the previous output instead.
# NOTE(review): no header row is written (unchanged from before) — add
# header=True if downstream readers expect column names.
joined_df.write.csv('data.csv', mode='overwrite')

## Generate JSON file

In [None]:
# Pull all seven columns to the driver in ONE Spark job. The previous version
# ran a separate collect() per column, which re-executed the joins seven times
# and — because Spark does not guarantee the same row order across separate
# executions of a join — could return the per-column lists in different orders,
# silently misaligning them when they are zipped together afterwards.
rows = joined_df.select(
    "article_id",
    "Constructiveness",
    "Non-Constructive",
    "Sentiment",
    "Neg-Sentiment",
    "Toxic",
    "NonToxic"
).collect()

# Split the collected rows back into the per-column lists used downstream;
# every list shares the row order of `rows`, so index i refers to one article.
article_id = [row["article_id"] for row in rows]
constructive = [row["Constructiveness"] for row in rows]
non_constructive = [row["Non-Constructive"] for row in rows]
sentiment = [row["Sentiment"] for row in rows]
neg_sentiment = [row["Neg-Sentiment"] for row in rows]
toxic = [row["Toxic"] for row in rows]
non_toxic = [row["NonToxic"] for row in rows]

In [None]:
# Build a dict keyed by article id. Each value is a one-element list holding a
# nested name/children tree with three branches (Constructiveness, Sentiment,
# Toxic), each carrying two leaves whose labels embed the per-article numbers.
#
# The loop variables are deliberately distinct from the input lists: the
# previous version rebound `article_id` (the list being zipped) to a single id
# inside the loop, so re-running the cell iterated over the characters of the
# last id instead of the list of ids.
data = {}
for (current_id, n_constructive, n_non_constructive,
     n_positive, n_negative, n_toxic, n_non_toxic) in zip(
        article_id, constructive, non_constructive,
        sentiment, neg_sentiment, toxic, non_toxic):
    data[current_id] = [{
        "name": "",
        "children": [{
            "name": "Constructiveness",
            "children": [
                {"name": "Constructive " + str(n_constructive)},
                {"name": "Non Constructive " + str(n_non_constructive)},
            ]
        }, {
            "name": "Sentiment",
            "children": [
                {"name": "Positive Sentiment " + str(n_positive)},
                {"name": "Negative Sentiment " + str(n_negative)},
            ]
        }, {
            "name": "Toxic",
            "children": [
                {"name": "Toxic " + str(n_toxic)},
                {"name": "Non Toxic " + str(n_non_toxic)},
            ]
        }]
    }]

In [None]:
import json

# Serialise the `data` dictionary to data_tree.json as a single JSON document.
with open('data_tree.json', 'w') as tree_file:
    tree_file.write(json.dumps(data))