In [1]:
# Starting SparkSession
import findspark
findspark.init()
from pyspark import SparkContext
sc = SparkContext()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("project").getOrCreate()
sc = spark.sparkContext

In [None]:
# Installing some packages
#!pip install wordcloud
#!pip install seaborn 
#!pip install pyspark_dist_explore

In [151]:
# Possible Choices 
#### 机器学习方面
# Ranking 和其他variable的关系 （ml）
# score和其他variables的关系  （ml）
# 上面的其中一个可以做分类 比如给定一篇文章 他成为top ranking 或者高分文章的可能性有多大
#### 整体趋势方面
# 统计词频 哪些词在哪一年出现的多 可画图 高频词出现变化图etc
# 活跃用户数等


In [6]:
# Importing libraries we need
import pyspark.sql.functions as f
import collections
import re
import numpy as np
import pyspark.sql as SQL
import pandas as pd
import matplotlib.pyplot as plt
import wordcloud
import seaborn as sns
from pyspark_dist_explore import hist
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col
%matplotlib inline

In [3]:
#1. Reading dataset merged dataset
data_df = spark.read\
  .format('csv')\
  .option('header', 'true')\
  .option('inferSchema', 'true')\
  .load('s3://xinyuwang/finalproject/test.csv')

In [260]:
print('The total number of our dataset is {}'.format(data_df.count()))

The total number of our dataset is 56381


In [229]:
df1 = data_df.select(data_df['by'],
                     data_df['score'].cast(IntegerType()),
                     #data_df["time"],
                     data_df['timestamp'].cast('timestamp'),
                     data_df['title'],
                     data_df['type'],
                     data_df['url'],
                     data_df['text'],
                     data_df['parent'],
                     #data_df['deleted'],
                     #data_df['dead'],
                     data_df['descendants'],
                     data_df['id'].cast(IntegerType()),
                     data_df['ranking'].cast(IntegerType())   
                             )

In [221]:
df1.printSchema()

root
 |-- by: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- url: string (nullable = true)
 |-- text: string (nullable = true)
 |-- parent: integer (nullable = true)
 |-- descendants: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- ranking: integer (nullable = true)



In [230]:
# Checking some basic information about our dataset
data = df1.createOrReplaceTempView('data')

In [257]:
# First, let's see about our users

In [231]:
user = spark.sql('SELECT DISTINCT by FROM data')
post_number = df1.count()
user_number = user.count()

In [256]:
print('There are {} posts during this period by {} users. \
\nThe average number of posts per user posted is {}'.format(post_number, user_number, user_number/post_number))

There are 56381 posts during this period by 17822 users. 
The average number of posts per user posted is 0.31609939518632163


In [254]:
# Now, let's see how many posts those active users post 
user2 = spark.sql('SELECT by AS user, COUNT(by) AS count from data GROUP BY by ORDER BY count DESC LIMIT 20')

In [255]:
user2.show()

+-------------+-----+
|         user|count|
+-------------+-----+
|</code></pre>|  463|
|      tptacek|  116|
|     jacquesm|  102|
|      rbanffy|   90|
| dragonwriter|   76|
|         dang|   71|
|        pjmlp|   67|
|  dredmorbius|   60|
|      coldtea|   58|
|     brudgers|   54|
|    anigbrowl|   54|
|      mikeash|   52|
|        DanBC|   51|
|     TeMPOraL|   47|
|         jerf|   47|
|       Retric|   46|
|      rayiner|   44|
|  maxerickson|   43|
|  toomuchtodo|   41|
|  icebraining|   41|
+-------------+-----+



In [261]:
# Now let's see what about the time period

In [247]:
time = df1[df1['timestamp'].isNotNull()]
time = time.createOrReplaceTempView('time')

In [248]:
first_time = spark.sql('select timestamp from time order by timestamp limit 1')
first_time.show()

+-------------------+
|          timestamp|
+-------------------+
|2006-10-10 15:53:53|
+-------------------+



In [249]:
last_time = spark.sql('select timestamp from time order by timestamp desc limit 1')
last_time.show()

+-------------------+
|          timestamp|
+-------------------+
|2019-07-08 08:14:52|
+-------------------+



In [232]:
# Filter Story data 
story = spark.sql('SELECT * FROM data WHERE type == "story"')

In [238]:
story[story['parent'].isNotNull()].show()

+---------------+-----+-------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----+-------+
|             by|score|          timestamp|               title| type|                 url|                text|              parent|         descendants|  id|ranking|
+---------------+-----+-------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----+-------+
|     shizhilvyi|    1|2012-08-06 13:24:53|IBM Will Launch N...|story|http://www.hqew.n...|"According to the...| which will be mo...|                True|  -1|4345052|
|        leeskye|    4|2012-03-28 05:29:18|WPP CEO Martin So...|story|http://adage.com/...|"Who do you agree...|"" said Microsoft...| social benefits ...|null|   null|
|       olegious|   42|2012-06-17 22:13:56|Ask HN: Just orde...|story|                null|"Getting my new M...| VM (will run Ubu...|                null|  64|4

In [234]:
# Filter comment data
comments = spark.sql('SELECT * FROM data WHERE type == "comment"')

In [236]:
comments.show()

+--------------+-----+-------------------+-----+-------+----+--------------------+--------------------+----------------+--------+-------+
|            by|score|          timestamp|title|   type| url|                text|              parent|     descendants|      id|ranking|
+--------------+-----+-------------------+-----+-------+----+--------------------+--------------------+----------------+--------+-------+
|         malyk| null|2017-11-29 20:35:46| null|comment|null|That&#x27;s likel...|          15810293.0|            null|15810594|   null|
|      wbracken| null|2011-10-11 21:56:08| null|comment|null|Been wondering th...|           3100461.0|            null| 3100489|   null|
|         spang| null|2014-10-01 18:51:56| null|comment|null|"Inbox | <a href=...| CA (Mission) or ...| up all the time|    null|   null|
|           nfm| null|2015-11-14 22:25:35| null|comment|null|Yes, yes it is.<p...|          10567471.0|            null|10567544|   null|
|            Mz| null|2010-10-17 2

In [157]:
# Let's check some information about the title column
title_df = df1[df1['title'].isNotNull()]
title_df = title_df.withColumn('title_word_num', f.size(f.split(f.col('title'), ' ')))
title_df.show()

+--------------------+-----+-------------------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+-------+--------------+
|                  by|score|          timestamp|               title|                type|                 url|                text|    parent|descendants|      id|ranking|title_word_num|
+--------------------+-----+-------------------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+-------+--------------+
|              yopiti|    1|2013-04-15 13:22:49|Why disabling 3rd...|               story|http://www.spamco...|                null|      null|        0.0| 5551313|   null|            12|
|        mikethecoach|    1|2017-12-07 19:11:41|How trainers and ...|               story|https://www.podbe...|                null|      null|       null|15873005|   null|            10|
|                frik|    2|2016-01-23 11:00:22|AppDynamics 

In [159]:
title = title_df.createOrReplaceTempView('title')

In [202]:
title_word1 = spark.sql('SELECT AVG(title_word_num) AS avg_num FROM title')
title_word1.show()

+-----------------+
|          avg_num|
+-----------------+
|8.061269656377402|
+-----------------+



In [203]:
title_word2 = spark.sql('SELECT score, AVG(title_word_num) AS avg_num FROM title GROUP BY score ORDER BY score DESC')
title_word2.show()

+-----+-------+
|score|avg_num|
+-----+-------+
| 1129|   13.0|
|  985|    3.0|
|  727|   10.0|
|  640|   13.0|
|  602|    8.5|
|  596|    4.0|
|  488|    1.0|
|  421|    4.0|
|  391|   11.0|
|  373|    6.0|
|  341|   10.5|
|  333|   11.0|
|  332|   10.0|
|  322|    8.0|
|  316|    9.0|
|  311|    8.0|
|  305|    5.0|
|  304|    7.0|
|  294|    4.0|
|  292|    7.0|
+-----+-------+
only showing top 20 rows



In [None]:
# From the table above, we cannot see some clear relationship between those two variables.
# For further analysis, we may convert the score column to a categorical variable since it has wide range.


In [None]:
#今日工作：


In [None]:
#没解决的问题1 ：
#fillna 虽然不fillna也可以filter 但是可能fill了比较好 关联网页在这里
#http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.fillna

#我失败的代码
#df1.fillna({'score': 0})

#2.画图失败 
# 觉得可以加 score 的histgram comments&story按年份的统计图


In [85]:
# Let's making some analysis using this title column
title_list = list(df1.select('title').toPandas()['title'])

  labels, = index.labels


In [90]:
#Since there are none values in this list, we can create a new list to make sure that all elements are not empty
titles = []
for title in title_list:
    if title is None:
        continue
    titles.append(title)
titles[:10]

['Why disabling 3rd party cookies in Firefox 22 is a big mistake',
 'How trainers and coach can help align sales and marketing',
 'AppDynamics CEO: Don’t call my $2B company a unicorn',
 'Regulation free zones to allow testing of drone technology',
 'Technologizing Agriculture',
 'Why Logical Clocks Are Easy',
 'Dynamic and Static Programming Languages and Teaching',
 'Trump flanked by mushroom clouds, Nazi symbols on Phoenix billboard',
 'How Media Fuels Our Fear of Terrorism',
 ' Mono options']