In [1]:
# start SparkSession
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("project").getOrCreate()

In [2]:
#1. read in the dataset
data_df = spark.read\
  .format('csv')\
  .option('header', 'true')\
  .option('inferSchema', 'true')\
  .load('s3://bigdataproject233/stories.csv')

In [3]:
 data_df.show()

+-------+------------+-----+------------+--------------------+--------------------+--------------------+--------------------+-------+----+-----------+------------+
|     id|          by|score|        time|             time_ts|               title|                 url|                text|deleted|dead|descendants|      author|
+-------+------------+-----+------------+--------------------+--------------------+--------------------+--------------------+-------+----+-----------+------------+
|6940813|   sarath237|  0.0|1387536270.0|2013-12-20 10:44:...| Sheryl Brindo Ho...|http://www.youtub...| Sheryl Brindo Ho...|   null|True|       null|   sarath237|
|6991401|123123321321|  0.0|1388508751.0|2013-12-31 16:52:...|Are you people al...|                null|They&#x27;re pret...|   null|True|       null|123123321321|
|1531556|         ssn|  0.0|1279617234.0|2010-07-20 09:13:...|New UI for Google...|http://googlesyst...|Again following o...|   null|null|        0.0|         ssn|
|5012398|       

In [4]:
data_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- by: string (nullable = true)
 |-- score: string (nullable = true)
 |-- time: string (nullable = true)
 |-- time_ts: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- text: string (nullable = true)
 |-- deleted: string (nullable = true)
 |-- dead: string (nullable = true)
 |-- descendants: string (nullable = true)
 |-- author: string (nullable = true)



In [6]:
data_df.count()

2069464

In [5]:
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType

df_total = data_df.select(data_df['id'].cast(IntegerType()),
                     data_df['score'].cast(IntegerType()),
                     #data_df["time"],
                     data_df['time_ts'].cast('timestamp'),
                     data_df['title'],
                     #data_df['type'],
                     data_df['url'],
                     data_df['text'],
                     #data_df['parent'].cast(IntegerType()),
                     #data_df['deleted'],
                     #data_df['dead'],
                     data_df['descendants'].cast(IntegerType()),
                     #data_df['id'].cast(IntegerType()),
                     data_df['author']
                             )


In [7]:
df_total.printSchema()

root
 |-- id: integer (nullable = true)
 |-- score: integer (nullable = true)
 |-- time_ts: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- text: string (nullable = true)
 |-- descendants: integer (nullable = true)
 |-- author: string (nullable = true)



In [11]:
df_total.show(10)

+-------+-----+-------------------+--------------------+--------------------+--------------------+-----------+------------+
|     id|score|            time_ts|               title|                 url|                text|descendants|      author|
+-------+-----+-------------------+--------------------+--------------------+--------------------+-----------+------------+
|6940813|    0|2013-12-20 10:44:30| Sheryl Brindo Ho...|http://www.youtub...| Sheryl Brindo Ho...|       null|   sarath237|
|6991401|    0|2013-12-31 16:52:31|Are you people al...|                null|They&#x27;re pret...|       null|123123321321|
|1531556|    0|2010-07-20 09:13:54|New UI for Google...|http://googlesyst...|Again following o...|          0|         ssn|
|5012398|    0|2013-01-05 12:11:17|Historic website ...|http://webscrapin...|Python script to ...|          0|        hoju|
|7214182|    0|2014-05-31 18:42:20|         Placeholder|                null|       Mind the gap.|          0|       kogir|
|1187303

In [13]:

df_total.createOrReplaceTempView('story')

In [10]:
# Filter Story data 
#story = spark.sql('SELECT * FROM data WHERE type == "story"')
#story.createOrReplaceTempView('story')

In [15]:
df_total.select("id").distinct().count()

1959840

In [16]:
df_total.select("title").distinct().count()

1759584

In [18]:
df_total.select("time_ts").distinct().count()

1925929

In [40]:
title_df = spark.sql('select * from story where title is not NULL and author is not NULL and score is not NULL')

In [41]:
title_df.count()

1807837

In [42]:
title_df.createOrReplaceTempView('title')

In [43]:
# check if there is score as NULL in the dataset where title is not NULL
#spark.sql('select score from title where score is not NULL').count()

1807837

In [42]:
#待完成
#url_df = spark.sql("select score, by as author, title, regexp_extract(url, '/(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)/', 1) as url from title")

In [43]:
#url_df.select('url').show(2)

+---+
|url|
+---+
|   |
|   |
+---+
only showing top 2 rows



In [44]:
#score information
spark.sql("select min(score),max(score),avg(score) from title").collect()

[Row(min(score)=0, max(score)=75248, avg(score)=10.447452950680841)]

In [45]:
# author/user information 
# decending by total_score 
sqlStatement = """SELECT SUM(score) as total_score,author,
SUM(descendants) as total_decendants,
count(id) as total_stories
FROM title
GROUP BY author
ORDER BY total_score DESC
LIMIT 20
"""
user_df = spark.sql(sqlStatement)
user_df.createOrReplaceTempView("user")

In [46]:
user_df.show(10)

+-----------+-----------+----------------+-------------+
|total_score|     author|total_decendants|total_stories|
+-----------+-----------+----------------+-------------+
|      76851|ColinWright|           30700|         3726|
|      75248|          0|         1732237|            0|
|      64156| shawndumas|           25926|         6585|
|      59958|    llambda|           20324|         2595|
|      56798|      fogus|           21136|         2412|
|      53452|      danso|           22792|         2610|
|      52701|       cwan|           24067|         7059|
|      51836|        luu|           19713|         2265|
|      49003|  ssclafani|           24026|         1324|
|      41601|      evo_9|           16215|         5635|
+-----------+-----------+----------------+-------------+
only showing top 10 rows



In [47]:
# author/user information 
# decending by total_stories 
sqlStatement = """SELECT SUM(score) as total_score, author,
SUM(descendants) as total_decendants,
count(id) as total_stories
FROM title
GROUP BY author
ORDER BY total_stories DESC
LIMIT 20
"""
user_df2 = spark.sql(sqlStatement)

In [48]:
user_df2.show(10)

+-----------+-----------+----------------+-------------+
|total_score|     author|total_decendants|total_stories|
+-----------+-----------+----------------+-------------+
|      52701|       cwan|           24067|         7059|
|      64156| shawndumas|           25926|         6585|
|      41601|      evo_9|           16215|         5635|
|      29477|      nickb|           11804|         4303|
|      26431|   iProject|           11759|         4262|
|      28454|   bootload|           11351|         4158|
|      29598|     edw519|           13726|         3823|
|      76851|ColinWright|           30700|         3726|
|      29806|     nreece|           12554|         3713|
|      36511| tokenadult|           20190|         3634|
+-----------+-----------+----------------+-------------+
only showing top 10 rows



In [50]:
# author/user information 
# decending by avg_score 
sqlStatement = """SELECT SUM(score) as total_score, author,
SUM(descendants) as total_decendants, SUM(score)/count(id) as avg_score,
count(id) as total_stories
FROM title
GROUP BY author
ORDER BY avg_score DESC
LIMIT 20
"""
user_df3 = spark.sql(sqlStatement)

In [51]:
user_df3.show(10)

+-----------+---------------+----------------+---------+-------------+
|total_score|         author|total_decendants|avg_score|total_stories|
+-----------+---------------+----------------+---------+-------------+
|       1543|    realfuncode|             526|   1543.0|            1|
|       2905|     frederfred|             412|   1452.5|            2|
|       1344| themanthatfell|             407|   1344.0|            1|
|       1282|          rcina|             249|   1282.0|            1|
|       1257|         kvargs|             558|   1257.0|            1|
|       1248|        mmebane|             267|   1248.0|            1|
|       1227|FlemishBeeCycle|             444|   1227.0|            1|
|       1172|     hannahmitt|             136|   1172.0|            1|
|       1125|  afraidofadria|             985|   1125.0|            1|
|       4354|   patricktomas|             385|   1088.5|            4|
+-----------+---------------+----------------+---------+-------------+
only s

In [52]:
# if there is bias on YC
sqlStatement = """
SELECT score, title,id
FROM `title`
WHERE title like "%Y Combinator%" or title like "%YCombinator%" or title like "%ycombinator%" or title like "%y combinator%"
ORDER BY score  DESC
"""
YC_df = spark.sql(sqlStatement)

In [53]:
YC_df.distinct().show()

+-----+--------------------+--------+
|score|               title|      id|
+-----+--------------------+--------+
| 1065|Y Combinator is f...| 5059806|
|  841|Y Combinator has ...| 8033322|
|  705|Meet Watsi, Y Com...| 5117385|
|  687|New: Apply to Y C...| 3700712|
|  634|I Am Sam Altman, ...| 9238839|
|  589|How I Got Kicked ...| 2208155|
|  550|Benefits matter, ...| 5409273|
|  549|What Happens At Y...| 1733236|
|  542|How Y Combinator ...| 3711008|
|  506|Y Combinator Numbers| 2608440|
|  432|New Y Combinator ...| 7972138|
|  425|Yuri Milner, SV A...| 2154706|
|  380|Y Combinator And ...| 8178450|
|  379|How I Crashed and...| 8867335|
|  368|Early Photos of Y...| 2942958|
|  366|Investment Firm Y...| 3492711|
|  344|Y Combinator anno...| 1898432|
|  342|Offer HN now at n...| 1840060|
|  337|A note I sent to ...| 4273460|
|  336|I am Sam Altman, ...|10360911|
+-----+--------------------+--------+
only showing top 20 rows



In [55]:
# stories of major companies over the year
# apple
sqlStatement = """
SELECT count(*) as total_stories, SUM(score) as total_score,
year(time_ts) as year
FROM title
WHERE title like "%apple%" or title like "%APPLE" or title like "%Apple"
GROUP BY year
ORDER BY year
"""
Comp_df1 = spark.sql(sqlStatement)

In [56]:
Comp_df1.distinct().show()

+-------------+-----------+----+
|total_stories|total_score|year|
+-------------+-----------+----+
|            1|          7|2006|
|           28|        109|2007|
|           43|        324|2008|
|           86|        780|2009|
|          194|       1964|2010|
|          348|       5316|2011|
|          382|       4883|2012|
|          280|       2345|2013|
|          203|       2302|2014|
|          168|       3672|2015|
+-------------+-----------+----+



In [57]:
# stories of major companies over the year
# g
sqlStatement = """
SELECT count(*) as total_stories, SUM(score) as total_score,
year(time_ts) as year
FROM title
WHERE title like "%google%" or title like "%GOOGLE" or title like "%Google"
GROUP BY year
ORDER BY year
"""
Comp_df2 = spark.sql(sqlStatement)

In [58]:
Comp_df2.distinct().show()

+-------------+-----------+----+
|total_stories|total_score|year|
+-------------+-----------+----+
|            1|          1|2006|
|          102|        514|2007|
|          190|       1192|2008|
|          428|       3376|2009|
|          553|       6938|2010|
|          506|       9489|2011|
|          461|       7592|2012|
|          478|      10844|2013|
|          432|       7835|2014|
|          369|       9247|2015|
+-------------+-----------+----+



In [59]:
# stories of major companies over the year
# apple
sqlStatement = """
SELECT count(*) as total_stories, SUM(score) as total_score,
year(time_ts) as year
FROM title
WHERE title like "%uber%" or title like "%UBER" or title like "%Uber"
GROUP BY year
ORDER BY year
"""
Comp_df3 = spark.sql(sqlStatement)

In [None]:
Comp_df3.distinct().show()