### Key Terms
Calculate the TF-IDF for a given subreddit.
Produce a Tag Cloud of the terms (note: this doesn’t have to be integrated into your code; simply including the image is enough).

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

TF-IDF = TF * IDF

In [1]:
import pandas as pd
from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType, IntegerType
from  pyspark.sql.functions import input_file_name
from pyspark.sql import SQLContext, Row

df = sqlContext.read.json("hdfs://orion11:32001/sampled_reddit/*")
columns = [
    "distinguished",
    "downs",
    "created_utc",
    "controversiality",
    "edited",
    "gilded",
    "author_flair_css_class",
    "id",
    "author",
    "retrieved_on",
    "score_hidden",
    "subreddit_id",
    "score",
    "name",
    "author_flair_text",
    "link_id",
    "archived",
    "ups",
    "parent_id",
    "subreddit",
    "body"]

df = df.withColumn("filename", input_file_name())
df = df.withColumn("created_utc", df["created_utc"].cast(LongType()))
df.show(n=2)

+--------+----------+--------------+----------------------+-----------------+--------------------+---------+----------------+-------+-----------+-------------+-----+------+------+-------+--------+-----------+----------+---------+--------------+-------+------------+-----+-----+------------+--------+---------+------------+---+------------+--------------------+
|archived|    author|author_cakeday|author_flair_css_class|author_flair_text|                body|body_html|controversiality|created|created_utc|distinguished|downs|edited|gilded|     id| link_id|mod_reports|      name|parent_id|removal_reason|replies|retrieved_on|saved|score|score_hidden|stickied|subreddit|subreddit_id|ups|user_reports|            filename|
+--------+----------+--------------+----------------------+-----------------+--------------------+---------+----------------+-------+-----------+-------------+-----+------+------+-------+--------+-----------+----------+---------+--------------+-------+------------+-----+-----+-

In [2]:
df.createOrReplaceTempView("TEMP_DF")
pd = spark.sql("""select temp_df.filename, temp_df.body from TEMP_DF where temp_df.subreddit = 'gaming'""")

#count number of documents
number_of_docs = df.select("filename").distinct().count()
number_of_docs

2288

In [12]:
pd = spark.sql("""select temp_df.filename, first(temp_df.body) text from TEMP_DF GROUP BY temp_df.filename""").toDF("doc","text")
pd.show(n=3)

+--------------------+--------------------+
|                 doc|                text|
+--------------------+--------------------+
|hdfs://orion11:32...|           [deleted]|
|hdfs://orion11:32...|Personally, I'm g...|
|hdfs://orion11:32...|           [deleted]|
+--------------------+--------------------+
only showing top 3 rows



In [21]:
import re
def word_count(text):
    word_count = {}
    text = text.lower()
    data = re.split(r'\W+', text)
    for word in data:
        if word not in word_count:
            word_count[word] = 1
        else:
            word_count[word] += 1
    return word_count

pand = pd.toPandas()
pand['counted'] = pand['text'].apply(word_count)
pand = pand.drop(columns=['text'])
pand.iloc[:4]

Unnamed: 0,doc,counted
0,hdfs://orion11:32001/sampled_reddit/part-00088...,"{'': 2, 'deleted': 1}"
1,hdfs://orion11:32001/sampled_reddit/part-00329...,"{'personally': 2, 'i': 3, 'm': 1, 'glad': 1, '..."
2,hdfs://orion11:32001/sampled_reddit/part-00392...,"{'': 2, 'deleted': 1}"
3,hdfs://orion11:32001/sampled_reddit/part-00669...,"{'i': 1, 'm': 1, 'suddenly': 1, 'wondering': 1..."


In [22]:
import pandas as pd
rows = []

for row in pand.itertuples():
    series = pd.Series(row.counted)
    df = series.to_frame()
    df['doc'] = row.doc 
    df.columns = ['words', 'doc']
    df = pd.DataFrame(df)
    rows.append(df)
    
result = pd.concat(rows)

print(result['words'].head())

              2
deleted       1
personally    2
i             3
m             1
Name: words, dtype: int64


In [23]:
words = pd.DataFrame(columns=['Words', 'Num', 'Doc'])

for row in result.itertuples():
    words = words.append({'Words': row.Index, 'Num': row.words, 'Doc': row.doc}, ignore_index=True)
    
words.iloc[:4]

Unnamed: 0,Words,Num,Doc
0,,2,hdfs://orion11:32001/sampled_reddit/part-00088...
1,deleted,1,hdfs://orion11:32001/sampled_reddit/part-00088...
2,personally,2,hdfs://orion11:32001/sampled_reddit/part-00329...
3,i,3,hdfs://orion11:32001/sampled_reddit/part-00329...


In [24]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
schema = StructType([StructField("word", StringType(), True), StructField("num", IntegerType(), True), StructField("file", StringType(), True)])
dfNew = spark.createDataFrame(words, schema=schema)
dfNew.show(n=10)


+----------+---+--------------------+
|      word|num|                file|
+----------+---+--------------------+
|          |  2|hdfs://orion11:32...|
|   deleted|  1|hdfs://orion11:32...|
|personally|  2|hdfs://orion11:32...|
|         i|  3|hdfs://orion11:32...|
|         m|  1|hdfs://orion11:32...|
|      glad|  1|hdfs://orion11:32...|
|        to|  4|hdfs://orion11:32...|
|      hear|  1|hdfs://orion11:32...|
|       you|  3|hdfs://orion11:32...|
|        re|  2|hdfs://orion11:32...|
+----------+---+--------------------+
only showing top 10 rows



In [25]:
dfNew.createOrReplaceTempView("TEMP")
pd = spark.sql("""SELECT COUNT(temp.num), temp.word, COUNT(temp.file)
FROM TEMP
GROUP BY temp.word""")
pd.show(n=10)

+----------+---------+-----------+
|count(num)|     word|count(file)|
+----------+---------+-----------+
|       141|     some|        141|
|        51|      few|         51|
|        86|    still|         86|
|        71|    those|         71|
|         9|   online|          9|
|        17|     hope|         17|
|         3|   travel|          3|
|         1|indicator|          1|
|         1|  protoss|          1|
|         4|   harder|          4|
+----------+---------+-----------+
only showing top 10 rows



In [26]:
count_words = pd.select("word").count()
count_words

11019

In [29]:
panda = pd.toPandas()
panda.columns = ['appearance', 'word', 'files']

saveToPic = panda.drop(columns=['files'])
schema = StructType([StructField("num", IntegerType(), True), StructField("word", StringType(), True)])
toPic = spark.createDataFrame(saveToPic, schema=schema)
toPic.write.format('csv').save('hdfs://orion11:32001/key_terms_topic')

In [30]:
import pandas as pd
import math 

col_names = ['result', 'appearance', 'word', 'files', 'TF', 'IDF']
answerLast = pd.DataFrame(columns = col_names)

for row in panda.itertuples():
    TF = int(row.appearance)/count_words
    IDF = math.log(number_of_docs/int(row.files))
    result = TF*IDF
    result = round(result, 5)
    answerLast = answerLast.append({'result' : result, 'appearance': row.appearance, 'word': row.word, 'files': row.files, 'TF' : TF, 'IDF' : IDF }, ignore_index=True)

answerLast = answerLast.drop(columns=['appearance', 'files', 'TF', 'IDF'])
answerLast.iloc[:4]

Unnamed: 0,result,word
0,0.02561,still
1,0.03566,some
2,0.0176,few
3,0.02238,those


In [33]:
answerLast.to_csv('key_terms_results.csv')