In [37]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import year

In [28]:
sc

In [29]:
sc.master

'local[*]'

# 1. Big Data Preprocessing

In [30]:
#Reading in csv file from Hadoop
data = spark.read.csv("hdfs://localhost:9000/user1/elonmusk_tweets.csv", header=True, inferSchema=True)

In [35]:
data.head()

Row(id=849636868052275200, created_at='2017-04-05 14:56:29', text="b'And so the robots spared humanity ... https://t.co/v7JUJQWfCv'")

In [36]:
data.show()

+------------------+-------------------+--------------------+
|                id|         created_at|                text|
+------------------+-------------------+--------------------+
|849636868052275200|2017-04-05 14:56:29|b'And so the robo...|
|848988730585096192|2017-04-03 20:01:01|"b""@ForIn2020 @w...|
|848943072423497728|2017-04-03 16:59:35|b'@waltmossberg @...|
|848935705057280001|2017-04-03 16:30:19|b'Stormy weather ...|
|848416049573658624|2017-04-02 06:05:23|"b""@DaveLeeBBC @...|
|848415731502923777|2017-04-02 06:04:07|"b""@Lexxxzis It'...|
|848415356263702528|2017-04-02 06:02:38|"b""@verge It won...|
|848398971139629057|2017-04-02 04:57:31|b'@SuperCoolCube ...|
|848244577521647616|2017-04-01 18:44:01|"b""Why did we wa...|
|848243350993895424|2017-04-01 18:39:09|b'Technology brea...|
|848239928043491328|2017-04-01 18:25:33|"b""RT @OpenAI: W...|
|848239664536223745|2017-04-01 18:24:30|b'RT @ProfBrianCo...|
|848036043240636417|2017-04-01 04:55:23|b'@adamsbj Def P1...|
|8479585

In [38]:
#Creating a Year column for easy access
data = data.withColumn("Year", year("created_at"))

In [42]:
print(type(data))

<class 'pyspark.sql.dataframe.DataFrame'>


In [40]:
#Checking it works
data.show(5)

+------------------+-------------------+--------------------+----+
|                id|         created_at|                text|Year|
+------------------+-------------------+--------------------+----+
|849636868052275200|2017-04-05 14:56:29|b'And so the robo...|2017|
|848988730585096192|2017-04-03 20:01:01|"b""@ForIn2020 @w...|2017|
|848943072423497728|2017-04-03 16:59:35|b'@waltmossberg @...|2017|
|848935705057280001|2017-04-03 16:30:19|b'Stormy weather ...|2017|
|848416049573658624|2017-04-02 06:05:23|"b""@DaveLeeBBC @...|2017|
+------------------+-------------------+--------------------+----+
only showing top 5 rows



In [44]:
#Creating a smaller dataframe to count the tweets per year
yearCounts = data.groupBy("Year").count().orderBy("Year")

In [45]:
yearCounts.show()

+----+-----+
|Year|count|
+----+-----+
|2010|    1|
|2011|   44|
|2012|  316|
|2013|  478|
|2014|  232|
|2015|  436|
|2016|  935|
|2017|  377|
+----+-----+



In [51]:
#Converting to Pandas to write to csv. Going to use this later
years = yearCounts.toPandas()

In [53]:
years.to_csv("YearCount.csv")

In [54]:
data.show()

+------------------+-------------------+--------------------+----+
|                id|         created_at|                text|Year|
+------------------+-------------------+--------------------+----+
|849636868052275200|2017-04-05 14:56:29|b'And so the robo...|2017|
|848988730585096192|2017-04-03 20:01:01|"b""@ForIn2020 @w...|2017|
|848943072423497728|2017-04-03 16:59:35|b'@waltmossberg @...|2017|
|848935705057280001|2017-04-03 16:30:19|b'Stormy weather ...|2017|
|848416049573658624|2017-04-02 06:05:23|"b""@DaveLeeBBC @...|2017|
|848415731502923777|2017-04-02 06:04:07|"b""@Lexxxzis It'...|2017|
|848415356263702528|2017-04-02 06:02:38|"b""@verge It won...|2017|
|848398971139629057|2017-04-02 04:57:31|b'@SuperCoolCube ...|2017|
|848244577521647616|2017-04-01 18:44:01|"b""Why did we wa...|2017|
|848243350993895424|2017-04-01 18:39:09|b'Technology brea...|2017|
|848239928043491328|2017-04-01 18:25:33|"b""RT @OpenAI: W...|2017|
|848239664536223745|2017-04-01 18:24:30|b'RT @ProfBrianCo...|2

In [55]:
#Seeing how many rows I have
numRows = data.count()

In [56]:
numRows

2819

In [57]:
#Dropping "id" column. Have no need for it
data = data.drop("id")

In [59]:
#Checking it worked
data.show(5)

+-------------------+--------------------+----+
|         created_at|                text|Year|
+-------------------+--------------------+----+
|2017-04-05 14:56:29|b'And so the robo...|2017|
|2017-04-03 20:01:01|"b""@ForIn2020 @w...|2017|
|2017-04-03 16:59:35|b'@waltmossberg @...|2017|
|2017-04-03 16:30:19|b'Stormy weather ...|2017|
|2017-04-02 06:05:23|"b""@DaveLeeBBC @...|2017|
+-------------------+--------------------+----+
only showing top 5 rows



In [62]:
#206 has the most data so that's the one I will use
data = data[data['Year'] == 2016]

In [63]:
data.show()

+-------------------+--------------------+----+
|         created_at|                text|Year|
+-------------------+--------------------+----+
|2016-12-31 21:30:05|b'HW2 Autopilot s...|2016|
|2016-12-31 02:47:28|b'@vicentes @Drag...|2016|
|2016-12-31 02:40:28|b'@DragTimes Yes,...|2016|
|2016-12-31 02:34:09|b'Resolving an Au...|2016|
|2016-12-30 19:44:02|b'Churchill (non)...|2016|
|2016-12-30 14:14:30|b'RT @IridiumComm...|2016|
|2016-12-30 00:21:48|b'@andrewket Almo...|2016|
|2016-12-27 21:01:43|b'RT @ElectrekCo:...|2016|
|2016-12-24 23:21:57|b'Deus ex machina...|2016|
|2016-12-24 20:07:00|b'@quipme Occasio...|2016|
|2016-12-24 17:21:58|"b""@djsearle @Fr...|2016|
|2016-12-24 17:18:06|"b""@FredericLamb...|2016|
|2016-12-24 17:15:39|"b""@yamenalhadda...|2016|
|2016-12-24 17:13:25|b'@FredericLamber...|2016|
|2016-12-24 16:54:46|"b""@ddaogaru If ...|2016|
|2016-12-24 16:41:28|b'@wpconner Good ...|2016|
|2016-12-24 16:15:52|    b'@mwangltg Yes'|2016|
|2016-12-24 16:10:41|"b""@SweensChris ..

In [65]:
data.count()

935

In [64]:
tweets = data.toPandas()

In [66]:
tweets.to_csv("CA2Tweets.csv")

Going to finish this assignment in my usual Windows setup as 935 tweets can be handled by Pandas. Just what I'm most comfortable with