In [172]:
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType
from pyspark.sql.types import StringType
import re
from pyspark.sql.functions import split
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover
import pyspark.sql.functions as f
from pyspark.ml.feature import Word2Vec

In [1]:
# start SparkSession
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("project").getOrCreate()

# 1. STORIES

In [2]:
# read in the dataset stories
data_df = spark.read\
  .format('csv')\
  .option('header', 'true')\
  .option('inferSchema', 'true')\
  .load('s3://xinyuwang/finalproject/story.csv')

In [3]:
 data_df.show()

+-------+------------+-----+------------+--------------------+--------------------+--------------------+--------------------+-------+----+-----------+------------+
|     id|          by|score|        time|             time_ts|               title|                 url|                text|deleted|dead|descendants|      author|
+-------+------------+-----+------------+--------------------+--------------------+--------------------+--------------------+-------+----+-----------+------------+
|6940813|   sarath237|  0.0|1387536270.0|2013-12-20 10:44:...| Sheryl Brindo Ho...|http://www.youtub...| Sheryl Brindo Ho...|   null|True|       null|   sarath237|
|6991401|123123321321|  0.0|1388508751.0|2013-12-31 16:52:...|Are you people al...|                null|They&#x27;re pret...|   null|True|       null|123123321321|
|1531556|         ssn|  0.0|1279617234.0|2010-07-20 09:13:...|New UI for Google...|http://googlesyst...|Again following o...|   null|null|        0.0|         ssn|
|5012398|       

In [4]:
data_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- by: string (nullable = true)
 |-- score: string (nullable = true)
 |-- time: string (nullable = true)
 |-- time_ts: string (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- text: string (nullable = true)
 |-- deleted: string (nullable = true)
 |-- dead: string (nullable = true)
 |-- descendants: string (nullable = true)
 |-- author: string (nullable = true)



In [5]:
data_df.count()

2069464

In [6]:


story_df = data_df.select(data_df['id'].cast(IntegerType()),
                     data_df['score'].cast(IntegerType()),
                     #data_df["time"],
                     data_df['time_ts'].cast('timestamp'),
                     data_df['title'],
                     #data_df['type'],
                     data_df['url'],
                     data_df['text'],
                     #data_df['parent'].cast(IntegerType()),
                     #data_df['deleted'],
                     #data_df['dead'],
                     data_df['descendants'].cast(IntegerType()),
                     #data_df['id'].cast(IntegerType()),
                     data_df['author']
                             )


In [7]:
story_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- score: integer (nullable = true)
 |-- time_ts: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- text: string (nullable = true)
 |-- descendants: integer (nullable = true)
 |-- author: string (nullable = true)



In [8]:
story_df.show(10)

+-------+-----+-------------------+--------------------+--------------------+--------------------+-----------+------------+
|     id|score|            time_ts|               title|                 url|                text|descendants|      author|
+-------+-----+-------------------+--------------------+--------------------+--------------------+-----------+------------+
|6940813|    0|2013-12-20 10:44:30| Sheryl Brindo Ho...|http://www.youtub...| Sheryl Brindo Ho...|       null|   sarath237|
|6991401|    0|2013-12-31 16:52:31|Are you people al...|                null|They&#x27;re pret...|       null|123123321321|
|1531556|    0|2010-07-20 09:13:54|New UI for Google...|http://googlesyst...|Again following o...|          0|         ssn|
|5012398|    0|2013-01-05 12:11:17|Historic website ...|http://webscrapin...|Python script to ...|          0|        hoju|
|7214182|    0|2014-05-31 18:42:20|         Placeholder|                null|       Mind the gap.|          0|       kogir|
|1187303

In [9]:
# create temp table
story_df.createOrReplaceTempView('ini_story')

In [10]:
story_df.select("id").distinct().count()

1959840

In [11]:
story_df.select("title").distinct().count()

1759584

In [12]:
story_df.select("time_ts").distinct().count()

1925929

In [119]:
story_df.columns

['id', 'score', 'time_ts', 'title', 'url', 'text', 'descendants', 'author']

In [120]:
clean = spark.sql('select * from ini_story where time_ts is not NULL and descendants is not NULL and title is not NULL and author is not NULL and score is not NULL')


In [121]:
clean.count()

1701277

In [13]:
clean_story_df = spark.sql('select * from ini_story where time_ts is not NULL and title is not NULL and author is not NULL and score is not NULL')
clean_story_df.createOrReplaceTempView('story')

In [14]:
url_df = spark.sql('select * from ini_story where url is not NULL and time_ts is not NULL and title is not NULL and author is not NULL and score is not NULL')

In [15]:
url_df.createOrReplaceTempView('url')

In [17]:
import re

In [18]:
pattern = re.compile(r'([a-z]+).(com|net)')

In [None]:
re.findall(pattern, 'http://www.efficientsoftware.net/')

In [19]:
sqlStatement = """
SELECT id, regexp_extract(url, '([a-z]+).(com|net|co|cn|org|ru|de|br|uk|pl|ir|it|in|fr|au|jp|info)', 1) as web
FROM url 
"""

In [20]:
url_df1 = spark.sql(sqlStatement)

In [21]:
url_df1.show()

+-------+-----------------+
|     id|              web|
+-------+-----------------+
|6940813|          youtube|
|1531556|         blogspot|
|5012398|      webscraping|
|2606988|         blogspot|
|4542754|     windowsphone|
|4824675|       abiquolabs|
|1120285|       datingsite|
|5221577|        nobugware|
|4827038|   allelectronics|
|4481018|         blogspot|
|7061983|efficientsoftware|
|1215015|       technobuzz|
|1505655|         zohmuomo|
|4673231| gumrukmusaviritr|
|1503352|       roadtickle|
|1540745|      kronikmedia|
|1370228|       fastessays|
|5917267|     sextoysbrand|
|5969633|    matthewaperry|
| 124722|             blog|
+-------+-----------------+
only showing top 20 rows



In [22]:
url_df1.createOrReplaceTempView('url2')

In [23]:
sqlStatement = """
SELECT web
FROM url2
WHERE web != ''
"""
url_df2 = spark.sql(sqlStatement)

In [24]:
url_df2.count()

1657898

In [25]:
sqlStatement = """
SELECT COUNT(id) AS num, web
FROM url2
WHERE web != '' and web != 'www'
GROUP BY web
ORDER BY num DESC
"""
url_df3 = spark.sql(sqlStatement)

In [26]:
url_df3.show()

+-----+---------------+
|  num|            web|
+-----+---------------+
|36296|         github|
|30887|     techcrunch|
|30856|        youtube|
|28325|       blogspot|
|27659|        nytimes|
|18549|         medium|
|17192|         google|
|14755|            bbc|
|13796|    arstechnica|
|13624|          wired|
|12499|      wordpress|
| 8582|      wikipedia|
| 8156|businessinsider|
| 7434|         forbes|
| 7276|             on|
| 7183|           blog|
| 7110|       mashable|
| 7068|    venturebeat|
| 6812|     thenextweb|
| 6749|       theverge|
+-----+---------------+
only showing top 20 rows



In [27]:
# story distribution in terms of time
# What is the trend of story numbers from 2006-2016 in Hacker News?

sqlStatement = """
SELECT COUNT(id) AS story_num, 
year(time_ts) as year
FROM story
GROUP BY year
ORDER BY year
"""
story_time_df1 = spark.sql(sqlStatement)

In [28]:
story_time_df1.show()

+---------+----+
|story_num|year|
+---------+----+
|       50|2006|
|    21948|2007|
|    68877|2008|
|   109468|2009|
|   175246|2010|
|   285433|2011|
|   305361|2012|
|   303813|2013|
|   287287|2014|
|   250348|2015|
+---------+----+



In [29]:
# story distribution in terms of time
# What is the most popular/active time during a day in pubulishing a story?

sqlStatement = """
SELECT COUNT(id) AS story_num, 
hour(time_ts) as hour
FROM story
GROUP BY hour
ORDER BY story_num DESC
"""
story_time_df2 = spark.sql(sqlStatement)

In [30]:
story_time_df2.show(24)

+---------+----+
|story_num|hour|
+---------+----+
|   118330|  16|
|   116519|  17|
|   114846|  15|
|   111187|  18|
|   107289|  14|
|   101710|  19|
|    95143|  20|
|    92281|  13|
|    87559|  21|
|    75879|  12|
|    75403|  22|
|    64806|  23|
|    62944|  11|
|    58054|   0|
|    55560|  10|
|    54969|   1|
|    54117|   9|
|    53419|   2|
|    52016|   3|
|    51697|   6|
|    51264|   8|
|    51254|   4|
|    50800|   7|
|    50785|   5|
+---------+----+



In [31]:
# score information
# what is the average score of those story?
spark.sql("select min(score),max(score),avg(score) from story").collect()

[Row(min(score)=0, max(score)=4339, avg(score)=10.405862605520095)]

In [32]:
# author/user information 
# decending by total_score 
sqlStatement = """SELECT SUM(score) as total_score,author,
SUM(descendants) as total_decendants,
count(id) as total_stories
FROM story
GROUP BY author
ORDER BY total_score DESC
LIMIT 20
"""
user_df = spark.sql(sqlStatement)
#user_df.createOrReplaceTempView("user")

In [33]:
user_df.show(10)

+-----------+-----------+----------------+-------------+
|total_score|     author|total_decendants|total_stories|
+-----------+-----------+----------------+-------------+
|      76851|ColinWright|           30700|         3726|
|      64156| shawndumas|           25926|         6585|
|      59958|    llambda|           20324|         2595|
|      56798|      fogus|           21136|         2412|
|      53452|      danso|           22792|         2610|
|      52701|       cwan|           24067|         7059|
|      51836|        luu|           19713|         2265|
|      49003|  ssclafani|           24026|         1324|
|      41601|      evo_9|           16215|         5635|
|      41256| Libertatea|           18950|         2238|
+-----------+-----------+----------------+-------------+
only showing top 10 rows



In [34]:
# author/user information 
# decending by total_stories 
sqlStatement = """SELECT SUM(score) as total_score, author,
SUM(descendants) as total_decendants,
count(id) as total_stories
FROM story
GROUP BY author
ORDER BY total_stories DESC
LIMIT 20
"""
user_df2 = spark.sql(sqlStatement)

In [35]:
user_df2.show(10)

+-----------+-----------+----------------+-------------+
|total_score|     author|total_decendants|total_stories|
+-----------+-----------+----------------+-------------+
|      52701|       cwan|           24067|         7059|
|      64156| shawndumas|           25926|         6585|
|      41601|      evo_9|           16215|         5635|
|      29477|      nickb|           11804|         4303|
|      26431|   iProject|           11759|         4262|
|      28454|   bootload|           11351|         4158|
|      29598|     edw519|           13726|         3823|
|      76851|ColinWright|           30700|         3726|
|      29806|     nreece|           12554|         3713|
|      36511| tokenadult|           20190|         3634|
+-----------+-----------+----------------+-------------+
only showing top 10 rows



In [36]:
# author/user information 
# decending by avg_score 
sqlStatement = """SELECT SUM(score) as total_score, author,
SUM(descendants) as total_decendants, SUM(score)/count(id) as avg_score,
count(id) as total_stories
FROM story
GROUP BY author
ORDER BY avg_score DESC
LIMIT 20
"""
user_df3 = spark.sql(sqlStatement)

In [37]:
user_df3.show(10)

+-----------+---------------+----------------+---------+-------------+
|total_score|         author|total_decendants|avg_score|total_stories|
+-----------+---------------+----------------+---------+-------------+
|       1543|    realfuncode|             526|   1543.0|            1|
|       2905|     frederfred|             412|   1452.5|            2|
|       1344| themanthatfell|             407|   1344.0|            1|
|       1282|          rcina|             249|   1282.0|            1|
|       1257|         kvargs|             558|   1257.0|            1|
|       1248|        mmebane|             267|   1248.0|            1|
|       1227|FlemishBeeCycle|             444|   1227.0|            1|
|       1172|     hannahmitt|             136|   1172.0|            1|
|       1125|  afraidofadria|             985|   1125.0|            1|
|       4354|   patricktomas|             385|   1088.5|            4|
+-----------+---------------+----------------+---------+-------------+
only s

In [38]:
# if there is bias on YC
sqlStatement = """
SELECT score, title,id
FROM `story`
WHERE title like "%Y Combinator%" or title like "%YCombinator%" or title like "%ycombinator%" or title like "%y combinator%"
ORDER BY score  DESC
"""
YC_df = spark.sql(sqlStatement)

In [39]:
YC_df.distinct().show()

+-----+--------------------+--------+
|score|               title|      id|
+-----+--------------------+--------+
| 1065|Y Combinator is f...| 5059806|
|  841|Y Combinator has ...| 8033322|
|  705|Meet Watsi, Y Com...| 5117385|
|  687|New: Apply to Y C...| 3700712|
|  634|I Am Sam Altman, ...| 9238839|
|  589|How I Got Kicked ...| 2208155|
|  550|Benefits matter, ...| 5409273|
|  549|What Happens At Y...| 1733236|
|  542|How Y Combinator ...| 3711008|
|  506|Y Combinator Numbers| 2608440|
|  432|New Y Combinator ...| 7972138|
|  425|Yuri Milner, SV A...| 2154706|
|  380|Y Combinator And ...| 8178450|
|  379|How I Crashed and...| 8867335|
|  368|Early Photos of Y...| 2942958|
|  366|Investment Firm Y...| 3492711|
|  344|Y Combinator anno...| 1898432|
|  342|Offer HN now at n...| 1840060|
|  337|A note I sent to ...| 4273460|
|  336|I am Sam Altman, ...|10360911|
+-----+--------------------+--------+
only showing top 20 rows



In [40]:
# stories of major companies over the year
# apple
sqlStatement = """
SELECT count(*) as total_stories, SUM(score) as total_score,
year(time_ts) as year
FROM story
WHERE title like "%apple%" or title like "%APPLE" or title like "%Apple"
GROUP BY year
ORDER BY year
"""
Comp_df1 = spark.sql(sqlStatement)

In [41]:
Comp_df1.distinct().show()

+-------------+-----------+----+
|total_stories|total_score|year|
+-------------+-----------+----+
|            1|          7|2006|
|           28|        109|2007|
|           43|        324|2008|
|           86|        780|2009|
|          194|       1964|2010|
|          348|       5316|2011|
|          382|       4883|2012|
|          280|       2345|2013|
|          203|       2302|2014|
|          168|       3672|2015|
+-------------+-----------+----+



In [42]:
# stories of major companies over the year
# google
sqlStatement = """
SELECT count(*) as total_stories, SUM(score) as total_score,
year(time_ts) as year
FROM story
WHERE title like "%google%" or title like "%GOOGLE" or title like "%Google"
GROUP BY year
ORDER BY year
"""
Comp_df2 = spark.sql(sqlStatement)

In [43]:
Comp_df2.distinct().show()

+-------------+-----------+----+
|total_stories|total_score|year|
+-------------+-----------+----+
|            1|          1|2006|
|          102|        514|2007|
|          190|       1192|2008|
|          428|       3376|2009|
|          553|       6938|2010|
|          506|       9489|2011|
|          461|       7592|2012|
|          478|      10844|2013|
|          432|       7835|2014|
|          369|       9247|2015|
+-------------+-----------+----+



In [44]:
# stories of major companies over the year
# uber
sqlStatement = """
SELECT count(*) as total_stories, SUM(score) as total_score,
year(time_ts) as year
FROM story
WHERE title like "%uber%" or title like "%UBER" or title like "%Uber"
GROUP BY year
ORDER BY year
"""
Comp_df3 = spark.sql(sqlStatement)

In [45]:
Comp_df3.distinct().show()

+-------------+-----------+----+
|total_stories|total_score|year|
+-------------+-----------+----+
|            3|          3|2007|
|           22|        202|2008|
|           23|        957|2009|
|           59|       1110|2010|
|           69|        966|2011|
|           58|        874|2012|
|           67|        523|2013|
|          244|       2743|2014|
|          302|       3301|2015|
+-------------+-----------+----+



# 2. COMMENTS

In [46]:
# read in the dataset comments
data_df2 = spark.read\
  .format('csv')\
  .option('header', 'true')\
  .option('inferSchema', 'true')\
  .load('s3://chingsez/Final/comments.csv')

In [47]:
data_df2.show()

+--------------------+-------+------+----------+--------------------+--------------------+--------------------+--------------------+-------+-------+
|                  id|     by|author|      time|             time_ts|                text|              parent|             deleted|   dead|ranking|
+--------------------+-------+------+----------+--------------------+--------------------+--------------------+--------------------+-------+-------+
|             2701393|     5l|    5l|1309184881|2011-06-27 14:28:...|And the glazier w...|             2701243|                null|   null|      0|
|             5811403|     99|    99|1370234048|2013-06-03 04:34:...|Does canada have ...|             5804452|                null|   null|      0|
|               21623|     AF|    AF|1178992400|2007-05-12 17:53:...|"Speaking of Rail...|               21611|                null|   null|      0|
|            10159727|     EA|    EA|1441206574|2015-09-02 15:09:...|Humans and large ...|            1015

In [48]:
data_df2.printSchema()

root
 |-- id: string (nullable = true)
 |-- by: string (nullable = true)
 |-- author: string (nullable = true)
 |-- time: string (nullable = true)
 |-- time_ts: string (nullable = true)
 |-- text: string (nullable = true)
 |-- parent: string (nullable = true)
 |-- deleted: string (nullable = true)
 |-- dead: string (nullable = true)
 |-- ranking: string (nullable = true)



In [49]:
data_df2.count()

9796725

In [50]:
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType

comment_df = data_df2.select(
                     data_df2['id'].cast(IntegerType()),                    
                     data_df2['time_ts'].cast('timestamp'),                  
                     data_df2['text'],
                     data_df2['parent'].cast(IntegerType()),
                     #data_df2['deleted'],
                     #data_df2['dead'],
                     data_df2['author'],
                     data_df2['ranking'].cast(IntegerType())

                             )

In [51]:
comment_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- time_ts: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- parent: integer (nullable = true)
 |-- author: string (nullable = true)
 |-- ranking: integer (nullable = true)



In [52]:
comment_df.show(10)

+--------+-------------------+--------------------+--------+------+-------+
|      id|            time_ts|                text|  parent|author|ranking|
+--------+-------------------+--------------------+--------+------+-------+
| 2701393|2011-06-27 14:28:01|And the glazier w...| 2701243|    5l|      0|
| 5811403|2013-06-03 04:34:08|Does canada have ...| 5804452|    99|      0|
|   21623|2007-05-12 17:53:20|"Speaking of Rail...|   21611|    AF|      0|
|10159727|2015-09-02 15:09:34|Humans and large ...|10159396|    EA|      0|
| 2988424|2011-09-12 18:53:00|I must say I reac...| 2988179|    Iv|      0|
| 3867418|2012-04-20 11:39:44|&#62; There's a w...| 3867404|    Iv|      0|
| 3925617|2012-05-03 20:26:05|I'm also in this ...| 3924840|    Iv|      0|
| 3107534|2011-10-13 15:34:04|how do you run un...|    null|    Iv|   null|
|    null|               null|                null|    null|  null|   null|
| 8409259|2014-10-04 11:20:47|Polio is not exte...| 8409226|    Iv|      0|
+--------+--

In [53]:
comment_df.createOrReplaceTempView('ini_comment')

In [54]:
comment_df.select("id").distinct().count()

8399564

In [55]:
clean_comment_df = spark.sql('select * from ini_comment where id is not NULL and time_ts is not NULL and text is not NULL and author is not NULL and ranking is not NULL')
clean_comment_df.createOrReplaceTempView('comment')

In [56]:
clean_comment_df.count()

6986995

In [57]:
clean_comment_df.show(10)

+--------+-------------------+--------------------+--------+------+-------+
|      id|            time_ts|                text|  parent|author|ranking|
+--------+-------------------+--------------------+--------+------+-------+
| 2701393|2011-06-27 14:28:01|And the glazier w...| 2701243|    5l|      0|
| 5811403|2013-06-03 04:34:08|Does canada have ...| 5804452|    99|      0|
|   21623|2007-05-12 17:53:20|"Speaking of Rail...|   21611|    AF|      0|
|10159727|2015-09-02 15:09:34|Humans and large ...|10159396|    EA|      0|
| 2988424|2011-09-12 18:53:00|I must say I reac...| 2988179|    Iv|      0|
| 3867418|2012-04-20 11:39:44|&#62; There's a w...| 3867404|    Iv|      0|
| 3925617|2012-05-03 20:26:05|I'm also in this ...| 3924840|    Iv|      0|
| 8409259|2014-10-04 11:20:47|Polio is not exte...| 8409226|    Iv|      0|
|   50570|2007-09-05 17:04:05|It was a risky jo...|   50556|    Jd|      0|
| 2600618|2011-05-30 22:34:14|"Looks good, ther...| 2600609|    Jd|      0|
+--------+--

In [58]:
# comment author/user information 
# who are the most contributive authors in Hacker News ?
## futhur to be done——can be joined with the most contirbutive authors in story

sqlStatement = """SELECT author,
        COUNT(id) AS total_comments
        From comment
        GROUP BY author
        ORDER BY total_comments DESC
        LIMIT 20
"""
comment_user_df = spark.sql(sqlStatement)

In [59]:
# show top 10
comment_user_df.show(10)

+------------+--------------+
|      author|total_comments|
+------------+--------------+
|     tptacek|         28605|
|    jacquesm|         19845|
|       DanBC|         10992|
|    jrockway|         10679|
|   anigbrowl|         10490|
|dragonwriter|         10203|
|         eru|          9979|
|     rbanffy|          9634|
|       sp332|          9547|
|     rayiner|          9403|
+------------+--------------+
only showing top 10 rows



In [60]:
# comment distribution in terms of time
# What is the trend of comment numbers from 2006-2016 in Hacker News?

sqlStatement = """
SELECT COUNT(id) AS comment_num, 
year(time_ts) as year
FROM comment
GROUP BY year
ORDER BY year
"""
comment_time_df2 = spark.sql(sqlStatement)

In [61]:
comment_time_df2.show()

+-----------+----+
|comment_num|year|
+-----------+----+
|         10|2006|
|      50982|2007|
|     195678|2008|
|     382490|2009|
|     658846|2010|
|     816999|2011|
|     975306|2012|
|    1415381|2013|
|    1348324|2014|
|    1142979|2015|
+-----------+----+



In [62]:
# comment distribution in terms of time
# What is the most popular/active time during a day in pubulishing a comment?

sqlStatement = """
SELECT COUNT(id) AS comment_num, 
hour(time_ts) as hour
FROM comment
GROUP BY hour
ORDER BY comment_num DESC
"""
comment_time_df = spark.sql(sqlStatement)

In [63]:
comment_time_df.show(24)

+-----------+----+
|comment_num|hour|
+-----------+----+
|     451259|  17|
|     450094|  18|
|     437369|  16|
|     433640|  19|
|     421504|  20|
|     413518|  15|
|     398429|  21|
|     365798|  14|
|     353708|  22|
|     313262|  23|
|     295136|  13|
|     276703|   0|
|     251256|   1|
|     236036|   2|
|     224956|   3|
|     224367|  12|
|     210740|   4|
|     192913|   5|
|     180795|   6|
|     180141|  11|
|     172147|   7|
|     170884|   8|
|     168104|   9|
|     164236|  10|
+-----------+----+



In [64]:
# top(the hottest) comments with the most follow-up comment 
sqlStatement = """
SELECT COUNT(id) AS followup_num, 
parent
FROM comment
WHERE parent is not NULL
GROUP BY parent
ORDER BY followup_num DESC
LIMIT 20
"""
comment_parent_df1 = spark.sql(sqlStatement)

In [65]:
# show top 10
comment_parent_df1.show(10)

+------------+--------+
|followup_num|  parent|
+------------+--------+
|         975|     363|
|         266| 7469115|
|         266| 9996333|
|         264| 9238839|
|         262| 9812245|
|         243| 7445761|
|         241|10152809|
|         239|  752262|
|         234| 9471287|
|         228| 9303396|
+------------+--------+
only showing top 10 rows



In [66]:
# ranking information
# what is the average ranking of those comments?

spark.sql("select min(ranking),max(ranking),avg(ranking) from comment").collect()

[Row(min(ranking)=-1051, max(ranking)=1131019295, avg(ranking)=58481.1891668736)]

In [67]:
# author/user's ranking information decending by avg ranking 
# who has the highest avg ranking?

sqlStatement = """SELECT avg(ranking) as avg_ranking,author,
count(id) as total_comment
FROM comment
GROUP BY author
ORDER BY avg(ranking) DESC
LIMIT 20
"""
comment_user_df1 = spark.sql(sqlStatement)

In [68]:
# show top 10
comment_user_df1.show(10)

+--------------------+------------+-------------+
|         avg_ranking|      author|total_comment|
+--------------------+------------+-------------+
|         2.0630034E7|AretNCarlsen|           56|
|1.5151532969696969E7|     dedalus|           66|
|         1.0360911E7|      pratim|            1|
|         1.0359641E7|    vmuhonen|            1|
|           1.03454E7|   scopesoft|            1|
|         1.0288613E7|  FlexMonkey|            1|
|         1.0277638E7|   Jimbobian|            1|
|         1.0275866E7| waynebeaton|            1|
|         1.0250132E7| anonttttttt|            1|
|         1.0230206E7|      bduhan|            1|
+--------------------+------------+-------------+
only showing top 10 rows



In [70]:
clean_comment_df.createOrReplaceTempView('tmp')

In [71]:
test = spark.sql('select * from tmp limit 200')

In [72]:
test.show()

+--------+-------------------+--------------------+--------+------+-------+
|      id|            time_ts|                text|  parent|author|ranking|
+--------+-------------------+--------------------+--------+------+-------+
| 2701393|2011-06-27 14:28:01|And the glazier w...| 2701243|    5l|      0|
| 5811403|2013-06-03 04:34:08|Does canada have ...| 5804452|    99|      0|
|   21623|2007-05-12 17:53:20|"Speaking of Rail...|   21611|    AF|      0|
|10159727|2015-09-02 15:09:34|Humans and large ...|10159396|    EA|      0|
| 2988424|2011-09-12 18:53:00|I must say I reac...| 2988179|    Iv|      0|
| 3867418|2012-04-20 11:39:44|&#62; There's a w...| 3867404|    Iv|      0|
| 3925617|2012-05-03 20:26:05|I'm also in this ...| 3924840|    Iv|      0|
| 8409259|2014-10-04 11:20:47|Polio is not exte...| 8409226|    Iv|      0|
|   50570|2007-09-05 17:04:05|It was a risky jo...|   50556|    Jd|      0|
| 2600618|2011-05-30 22:34:14|"Looks good, ther...| 2600609|    Jd|      0|
| 2600423|20

In [73]:
test2 = test.withColumn("split", split("text", "\s+"))

In [74]:
test2.show()

+-------+-------------------+--------------------+-------+-------------+-------+--------------------+
|     id|            time_ts|                text| parent|       author|ranking|               split|
+-------+-------------------+--------------------+-------+-------------+-------+--------------------+
|7279826|2014-02-21 22:15:51|Agreed, it&#x27;s...|7276527| brucefancher|      0|[Agreed,, it&#x27...|
|8543165|2014-11-01 14:36:06|                Yep.|8539350| brucefancher|      0|              [Yep.]|
|7550193|2014-04-07 22:50:23|I think Im just g...|7549551| greatsuccess|      0|[I, think, Im, ju...|
|9325036|2015-04-05 19:26:43|Europe is mostly ...|9324926| happyscrappy|      0|[Europe, is, most...|
|3071755|2011-10-04 16:49:11|"I have reason, t...|   null| intellection|3070715|["I, have, reason...|
|7720156|2014-05-09 10:43:43|hoping for an answer|7719614| jesusisbacks|      0|[hoping, for, an,...|
|9134080|2015-03-02 20:21:54|Step Four: Bro Down!|9133267| markpundmann|      0|[S

In [76]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized = tokenizer.transform(test)
countTokens = udf(lambda words: len(words), IntegerType())
tokenized.select("text", "words")\
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [77]:
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\s+")
regexTokenized = regexTokenizer.transform(test)
regexTokenized.select("text", "words").show()

+--------------------+--------------------+
|                text|               words|
+--------------------+--------------------+
|I've been coding ...|[i've, been, codi...|
|I think of Snowde...|[i, think, of, sn...|
|Actually has led ...|[actually, has, l...|
|Interesting siden...|[interesting, sid...|
|Silly geeks.  A w...|[silly, geeks., a...|
|This.  I also lik...|[this., i, also, ...|
|These are just #h...|[these, are, just...|
|Time to kiss Arri...|[time, to, kiss, ...|
|my classmate&#x27...|[my, classmate&#x...|
|"I&#x27;m making ...|["i&#x27;m, makin...|
|I'm a fan of appl...|[i'm, a, fan, of,...|
|It feels good, bu...|[it, feels, good,...|
|               Yawn.|             [yawn.]|
|Yet another right...|[yet, another, ri...|
|Good question. Na...|[good, question.,...|
|"nice. you just p...|["nice., you, jus...|
|Impressed with th...|[impressed, with,...|
|This thread sure ...|[this, thread, su...|
|So women are good...|[so, women, are, ...|
|I am looking for ...|[i, am, lo

In [78]:
tokenized.show(5)

+--------+-------------------+--------------------+--------+------------+-------+--------------------+
|      id|            time_ts|                text|  parent|      author|ranking|               words|
+--------+-------------------+--------------------+--------+------------+-------+--------------------+
|  803232|2009-09-03 19:09:48|I've been coding ...|  802700| heckacopter|      5|[i've, been, codi...|
| 8147414|2014-08-07 12:53:18|I think of Snowde...| 8146987| justnotsure|      5|[i, think, of, sn...|
|10331827|2015-10-05 13:59:13|Actually has led ...|10331623| littletimmy|      5|[actually, has, l...|
| 3058176|2011-09-30 19:04:37|Interesting siden...| 3054310| underdesign|      5|[interesting, sid...|
| 2582615|2011-05-25 02:27:49|Silly geeks.  A w...| 2582002|originalgeek|      5|[silly, geeks., ,...|
+--------+-------------------+--------------------+--------+------------+-------+--------------------+
only showing top 5 rows



In [None]:
# Feature Engineering 

In [None]:
# 时间 day of week,  year month   -finish
# id  hot users   - finish
# title length    - finish
# title hot topics  - finish
# url hot website   - finish
# text length  - finish
# title   -> word2vec  

In [105]:
import datetime
from pyspark.sql.functions import year, month, dayofmonth, dayofweek

In [122]:
test = clean.limit(500)

In [123]:
test.show()

+--------+-----+-------------------+--------------------+--------------------+----+-----------+------+
|      id|score|            time_ts|               title|                 url|text|descendants|author|
+--------+-----+-------------------+--------------------+--------------------+----+-----------+------+
|10229406|    1|2015-09-16 19:59:56|The First Whale: ...|http://www.amnh.o...|null|          0|    Mz|
| 6488704|    1|2013-10-03 13:26:50|Engineers invent ...|http://phys.org/n...|null|          0|    X4|
| 6594546|    1|2013-10-22 20:03:30|IUI - Mobile web ...|http://www.iui-js...|null|          0|    X4|
|  790887|    1|2009-08-28 06:44:26|Dell Screws Up, P...|http://www.busine...|null|          0|    aj|
|  673699|    1|2009-06-25 14:41:06|New Duke Nukem ga...|http://arstechnic...|null|          0|    aj|
|  699911|    1|2009-07-12 07:13:24|Wireless Cybercri...|http://www.foxnew...|null|          0|    aj|
|  676666|    1|2009-06-27 06:26:21|Does Multitasking...|http://www.sitep

In [124]:
test2 = test.withColumn('year', year('time_ts'))
test2 = test2.withColumn('month', month('time_ts'))
test2 = test2.withColumn('dayofweek', dayofweek('time_ts'))

In [125]:
test2.show()

+--------+-----+-------------------+--------------------+--------------------+----+-----------+------+----+-----+---------+
|      id|score|            time_ts|               title|                 url|text|descendants|author|year|month|dayofweek|
+--------+-----+-------------------+--------------------+--------------------+----+-----------+------+----+-----+---------+
|10229406|    1|2015-09-16 19:59:56|The First Whale: ...|http://www.amnh.o...|null|          0|    Mz|2015|    9|        4|
| 6488704|    1|2013-10-03 13:26:50|Engineers invent ...|http://phys.org/n...|null|          0|    X4|2013|   10|        5|
| 6594546|    1|2013-10-22 20:03:30|IUI - Mobile web ...|http://www.iui-js...|null|          0|    X4|2013|   10|        3|
|  790887|    1|2009-08-28 06:44:26|Dell Screws Up, P...|http://www.busine...|null|          0|    aj|2009|    8|        6|
|  673699|    1|2009-06-25 14:41:06|New Duke Nukem ga...|http://arstechnic...|null|          0|    aj|2009|    6|        5|
|  69991

In [134]:
clean.createOrReplaceTempView('test')

In [196]:
sqlStatement = """
SELECT author, COUNT(author) AS num
FROM test
GROUP BY author
ORDER BY num DESC
LIMIT 3000
"""

In [197]:
hot_user = spark.sql(sqlStatement)
hot_user.show()

+-------------+----+
|       author| num|
+-------------+----+
|         cwan|7059|
|   shawndumas|6556|
|        evo_9|5570|
|        nickb|4303|
|     iProject|4262|
|     bootload|4141|
|       edw519|3821|
|  ColinWright|3704|
|       nreece|3697|
|   tokenadult|3625|
|      Garbage|3508|
|         robg|3100|
|DanielRibeiro|2819|
|       Anon84|2760|
|        danso|2598|
|      llambda|2595|
|      jonbaer|2417|
|        fogus|2411|
|   transburgh|2360|
|         wslh|2299|
+-------------+----+
only showing top 20 rows



In [198]:
user = np.array(hot_user.select('author').collect())
'cwan' in user

True

In [199]:
@udf(StringType())
def is_hotuser(username):
    if username in user:
        return '1'
    return '0' 

In [200]:
# for the is_hotuser column
test2 = test2.withColumn('is_hotuser', is_hotuser('author'))

In [201]:
# for the title length column
test2 = test2.withColumn('title_length', f.size(f.split(f.col('title'), ' ')))

In [202]:
test2.createOrReplaceTempView('check')

In [208]:
# checking for the is_hotuser column
sqlStatement = """
SELECT *
FROM check
WHERE is_hotuser == '1'
"""
check = spark.sql(sqlStatement)
check.show()

+-------+-----+-------------------+--------------------+--------------------+--------------------+-----------+-------+----+-----+---------+----------+------------+
|     id|score|            time_ts|               title|                 url|                text|descendants| author|year|month|dayofweek|is_hotuser|title_length|
+-------+-----+-------------------+--------------------+--------------------+--------------------+-----------+-------+----+-----+---------+----------+------------+
|9195013|    1|2015-03-13 00:38:14|Ask HN: How do yo...|                null|There is so much ...|          0| vijayr|2015|    3|        6|         1|           9|
|9282921|    1|2015-03-28 22:08:31|Ask HN: Is there ...|                null|This question is ...|          0| vijayr|2015|    3|        7|         1|          10|
|7248437|    1|2014-02-16 17:52:11|Convergence: What...|http://pando.com/...|It&#x27;s weird. ...|          0| vonnik|2014|    2|        1|         1|           8|
|3353561|    1|2

In [209]:
sqlStatement = """
SELECT id, regexp_extract(url, '([a-z]+).(com|net|co|cn|org|ru|de|br|uk|pl|ir|it|in|fr|au|jp|info)', 1) as web
FROM check 
"""

In [211]:
url_df1 = spark.sql(sqlStatement)
url_df1.createOrReplaceTempView('url')

In [252]:
sqlStatement = """
SELECT COUNT(id) AS num, web
FROM url
WHERE web != '' and web != 'www'
GROUP BY web
ORDER BY num DESC
LIMIT 50
"""
url_df2 = spark.sql(sqlStatement)

In [253]:
hot_domain = np.array(url_df2.select('web').collect())
@udf(StringType())
def from_hot_web(url):
    pattern = re.compile('([a-z]+).(com|net|co|cn|org|ru|de|br|uk|pl|ir|it|in|fr|au|jp|info)')
    result = re.findall(pattern, url)
    if result and result[0] in hot_domain:
        return '1'
    return '0' 

In [254]:
#check for whether its work 这个是加装饰器之前单独测试function的 现在测试会出bug了
#print(from_hot_domain('www.youtube.com'))
#print(from_hot_domain('www.bilibili.com'))

In [255]:
# for the from_top_web column
test2 = test2.withColumn('from_top_web', from_hot_web('url'))

In [257]:
# for the text length column
test2 = test2.withColumn('text_length', f.size(f.split(f.col('text'), ' ')))

In [259]:
@udf(IntegerType())
def transfer(num):
    if num == -1:
        return 0 
    return num

In [260]:
#Dealing with empty text
test2 = test2.withColumn('text_length', transfer('text_length'))

In [262]:
regexTokenizer = RegexTokenizer(inputCol="title", outputCol="words", pattern="\s+")
regexTokenized = regexTokenizer.transform(test2)
regexTokenized.select("title", "words").show()

+--------------------+--------------------+
|               title|               words|
+--------------------+--------------------+
|New UI for Google...|[new, ui, for, go...|
|Historic website ...|[historic, websit...|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|         Placeholder|       [placeholder]|
|Kidney Transplant...|[kidney, transpla...|
|Bouncing Marble f...|[bouncing, marble...|
|Cooking your own ...|[cooking, your, o...|
|Why You Should Re...|[why, you,

In [266]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filtered = remover.transform(regexTokenized)

In [270]:
word_count = filtered.withColumn('word', f.explode(f.col('filtered')))\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)

In [273]:
word_list = word_count.limit(200)
hot_words = np.array(word_list.select('word').collect())

In [311]:
def filter_words(array):
    words_list = []
    pattern = re.compile('[a-z]+')
    for word in array:
        result = re.findall(pattern, word[0])
        if result != []:
            words_list.append(result[0])
    return words_list

In [313]:
hot_word = filter_words(hot_words)

In [327]:
@udf(IntegerType())
def count_hot_words(title):
    words = title.split(' ')
    count = 0
    for word in words:
        if word in hot_word:
            count += 1 
    return count 

In [321]:
# Test this function
# print(count_hot_words('I like kaggle', hot_word))
# print(count_hot_words('facebook google hungry', hot_word))

1
2


In [328]:
test2 = test2.withColumn('title_hot_words', count_hot_words('title'))

In [329]:
test2.show()

+-------+-----+-------------------+--------------------+--------------------+----+-----------+-------+----+-----+---------+----------+------------+------------+-----------+---------------+
|     id|score|            time_ts|               title|                 url|text|descendants| author|year|month|dayofweek|is_hotuser|title_length|from_top_web|text_length|title_hot_words|
+-------+-----+-------------------+--------------------+--------------------+----+-----------+-------+----+-----+---------+----------+------------+------------+-----------+---------------+
|3451564|    1|2012-01-11 13:10:37|Second biggest Sp...|http://www.busine...|null|          0|barredo|2012|    1|        4|         1|           9|           0|          0|              0|
|7755578|    1|2014-05-16 14:12:06|Shepherd – Guide ...|http://github.hub...|null|          0|bartman|2014|    5|        6|         0|          11|           0|          0|              2|
|6647790|    1|2013-10-31 16:33:44|What The Healthca...

In [331]:
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])

In [333]:
documentDF.show()

+--------------------+
|                text|
+--------------------+
|[Hi, I, heard, ab...|
|[I, wish, Java, c...|
|[Logistic, regres...|
+--------------------+



In [341]:
regexTokenizer = RegexTokenizer(inputCol="title", outputCol="title_array", pattern="\s+")
regexTokenized = regexTokenizer.transform(test2)

In [342]:
regexTokenized.show()

+-------+-----+-------------------+--------------------+--------------------+----+-----------+-------+----+-----+---------+----------+------------+------------+-----------+---------------+--------------------+
|     id|score|            time_ts|               title|                 url|text|descendants| author|year|month|dayofweek|is_hotuser|title_length|from_top_web|text_length|title_hot_words|         title_array|
+-------+-----+-------------------+--------------------+--------------------+----+-----------+-------+----+-----+---------+----------+------------+------------+-----------+---------------+--------------------+
|3451564|    1|2012-01-11 13:10:37|Second biggest Sp...|http://www.busine...|null|          0|barredo|2012|    1|        4|         1|           9|           0|          0|              0|[second, biggest,...|
|7755578|    1|2014-05-16 14:12:06|Shepherd – Guide ...|http://github.hub...|null|          0|bartman|2014|    5|        6|         0|          11|           0|

In [343]:
word2Vec = Word2Vec(vectorSize=300, minCount=5, seed = 18, inputCol="title_array", outputCol="title_vec")
model = word2Vec.fit(regexTokenized)

result = model.transform(regexTokenized)

In [344]:
result.show()

+--------+-----+-------------------+--------------------+--------------------+----+-----------+------+----+-----+---------+----------+------------+------------+-----------+---------------+--------------------+--------------------+
|      id|score|            time_ts|               title|                 url|text|descendants|author|year|month|dayofweek|is_hotuser|title_length|from_top_web|text_length|title_hot_words|         title_array|           title_vec|
+--------+-----+-------------------+--------------------+--------------------+----+-----------+------+----+-----+---------+----------+------------+------------+-----------+---------------+--------------------+--------------------+
|10229406|    1|2015-09-16 19:59:56|The First Whale: ...|http://www.amnh.o...|null|          0|    Mz|2015|    9|        4|         1|           4|           0|          0|              0|[the, first, whal...|[8.74674529768526...|
| 6488704|    1|2013-10-03 13:26:50|Engineers invent ...|http://phys.org/n..