# Pyspark syntax

In [1]:
sc

<pyspark.context.SparkContext at 0x102734d30>

In [2]:
sqlContext #spark data table context

<pyspark.sql.context.SQLContext at 0x10795c2b0>

In [3]:
a = range(10)

In [4]:
list(a)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [5]:
rdd = sc.parallelize(a) #lazy send to spark
rdd

PythonRDD[1] at RDD at PythonRDD.scala:48

In [6]:
rdd.first()

0

In [7]:
rdd.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [8]:
rdd =rdd.map(lambda x: x*10) #lazy! it return another rdd
rdd

PythonRDD[3] at RDD at PythonRDD.scala:48

In [9]:
rdd.reduce(lambda x,y:x+y) # not lazy (action)

450

In [10]:
rdd = rdd.filter(lambda x: x>30)
rdd

PythonRDD[5] at RDD at PythonRDD.scala:48

In [11]:
rdd.collect()

[40, 50, 60, 70, 80, 90]

## Exercises
1. Get an RDD with numbers 2 to 10
2. Get all elements that are bigger than 5
3. Get the product of the elements of the result of 2

In [12]:
rdd =  sc.parallelize(range(2,11))
print(rdd.collect())
rdd = rdd.filter(lambda x: x >5)
print(rdd.collect())
rdd = rdd.reduce(lambda x,y: x*y)
print(rdd)

[2, 3, 4, 5, 6, 7, 8, 9, 10]
[6, 7, 8, 9, 10]
30240


## Input

In [13]:
allcsv = sc.textFile("*.csv")

In [14]:
allcsv.first()

'Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context'

In [15]:
allcsv.map(lambda x: x.split(",")).first()

['Crime ID',
 'Month',
 'Reported by',
 'Falls within',
 'Longitude',
 'Latitude',
 'Location',
 'LSOA code',
 'LSOA name',
 'Crime type',
 'Last outcome category',
 'Context']

## Tuples and ReduceByKey
First element of a tuple is considered as key

In [16]:
data = [['Alexandra','31','F','Python'],['Carla','25','F','C'],['Max','18','M','Scala'],['Tom','34','M','C'],['Philip','28','M','Python'],['Lucy','25','F','Scala'],['Al','18','M','Scala'],['Grace','34','F','Python']]

In [17]:
data

[['Alexandra', '31', 'F', 'Python'],
 ['Carla', '25', 'F', 'C'],
 ['Max', '18', 'M', 'Scala'],
 ['Tom', '34', 'M', 'C'],
 ['Philip', '28', 'M', 'Python'],
 ['Lucy', '25', 'F', 'Scala'],
 ['Al', '18', 'M', 'Scala'],
 ['Grace', '34', 'F', 'Python']]

In [18]:
RDD = sc.parallelize(data) #send data to spark

In [19]:
help(RDD.reduceByKey) #exit with q

Help on method reduceByKey in module pyspark.rdd:

reduceByKey(func, numPartitions=None, partitionFunc=<function portable_hash at 0x107060a60>) method of pyspark.rdd.RDD instance
    Merge the values for each key using an associative and commutative reduce function.
    
    This will also perform the merging locally on each mapper before
    sending results to a reducer, similarly to a "combiner" in MapReduce.
    
    Output will be partitioned with C{numPartitions} partitions, or
    the default parallelism level if C{numPartitions} is not specified.
    Default partitioner is hash-partition.
    
    >>> from operator import add
    >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
    >>> sorted(rdd.reduceByKey(add).collect())
    [('a', 2), ('b', 1)]



In [20]:
sumByGender = RDD.map(lambda t: (t[2],1)).reduceByKey(lambda x,y: x+y)
sumByGender.collect()

[('F', 4), ('M', 4)]

In [21]:
languageAndAge = RDD.map(lambda t: (t[3],int(t[1])))
languageAndAge.collect()

[('Python', 31),
 ('C', 25),
 ('Scala', 18),
 ('C', 34),
 ('Python', 28),
 ('Scala', 25),
 ('Scala', 18),
 ('Python', 34)]

In [22]:
languageAndAge.reduceByKey(lambda x,y:x+y).collect()

[('C', 59), ('Scala', 61), ('Python', 93)]

## Getting the average

In [23]:
temp = RDD.map(lambda t: (t[3],(int(t[1]),1)))
temp.collect()

[('Python', (31, 1)),
 ('C', (25, 1)),
 ('Scala', (18, 1)),
 ('C', (34, 1)),
 ('Python', (28, 1)),
 ('Scala', (25, 1)),
 ('Scala', (18, 1)),
 ('Python', (34, 1))]

In [24]:
temp2 = temp.reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1]))
temp2.collect()

[('C', (59, 2)), ('Scala', (61, 3)), ('Python', (93, 3))]

In [25]:
temp2.map(lambda x:(x[0],x[1][0]/x[1][1])).collect()

[('C', 29.5), ('Scala', 20.333333333333332), ('Python', 31.0)]

## Exercise
1. Compute the average age by gender (the key is the first element in the tuple)
2. Compute the preferred language by gender (use a tuple as a key)

In [26]:
temp = RDD.map(lambda t: (t[2],(int(t[1]),1)))
temp.collect()

[('F', (31, 1)),
 ('F', (25, 1)),
 ('M', (18, 1)),
 ('M', (34, 1)),
 ('M', (28, 1)),
 ('F', (25, 1)),
 ('M', (18, 1)),
 ('F', (34, 1))]

In [27]:
temp2 = temp.reduceByKey(lambda x,y: (x[0]+y[0],x[1]+y[1]))
temp2.collect()

[('F', (115, 4)), ('M', (98, 4))]

In [28]:
temp2.map(lambda x: (x[0],x[1][0]/x[1][1])).collect()

[('F', 28.75), ('M', 24.5)]

Compute the preferred language by gender (use a tuple as a key)

In [29]:
temp = RDD.map(lambda t: ((t[2],t[3]),1))
temp.collect()

[(('F', 'Python'), 1),
 (('F', 'C'), 1),
 (('M', 'Scala'), 1),
 (('M', 'C'), 1),
 (('M', 'Python'), 1),
 (('F', 'Scala'), 1),
 (('M', 'Scala'), 1),
 (('F', 'Python'), 1)]

In [30]:
temp2 = temp.reduceByKey(lambda x,y: x+y)
temp2.collect()

[(('F', 'Python'), 2),
 (('M', 'Python'), 1),
 (('M', 'C'), 1),
 (('F', 'C'), 1),
 (('F', 'Scala'), 1),
 (('M', 'Scala'), 2)]

In [31]:
temp3 = temp2.map(lambda x: (x[0][0],(x[0][1],x[1])))
temp3.collect()

[('F', ('Python', 2)),
 ('M', ('Python', 1)),
 ('M', ('C', 1)),
 ('F', ('C', 1)),
 ('F', ('Scala', 1)),
 ('M', ('Scala', 2))]

In [32]:
temp3.reduceByKey(max).collect()

[('F', ('Scala', 1)), ('M', ('Scala', 2))]

Numer of clusters:

In [33]:
sc._jsc.sc().getExecutorMemoryStatus().size()

1

# Pyspark SQL and Dataframes

In [34]:
df = sqlContext.read.format('com.databricks.spark.csv').options(delimiter=',',header='true', inferschema='true',mode="FAILFAST").load('./crime.csv')

In [35]:
df.show()

+--------------------+-------+--------------------+--------------------+---------+---------+--------------------+---------+--------------------+--------------------+---------------------+-------+
|            Crime ID|  Month|         Reported by|        Falls within|Longitude| Latitude|            Location|LSOA code|           LSOA name|          Crime type|Last outcome category|Context|
+--------------------+-------+--------------------+--------------------+---------+---------+--------------------+---------+--------------------+--------------------+---------------------+-------+
|6ce50abd0bf1ca408...|2016-12|Avon and Somerset...|Avon and Somerset...|-2.511571|51.414895|On or near Orchar...|E01014399|Bath and North Ea...|Criminal damage a...|  Under investigation|   null|
|6e15f8dd5c88a65c2...|2016-12|Avon and Somerset...|Avon and Somerset...|-2.516919|51.423683|    On or near A4175|E01014399|Bath and North Ea...|Violence and sexu...|  Under investigation|   null|
|2594621f67f0a2192..

In [36]:
df.select('Month','Crime type').show()

+-------+--------------------+
|  Month|          Crime type|
+-------+--------------------+
|2016-12|Criminal damage a...|
|2016-12|Violence and sexu...|
|2016-12|Violence and sexu...|
|2016-12|Violence and sexu...|
|2016-12|         Other crime|
|2016-12|Anti-social behav...|
|2016-12|Anti-social behav...|
|2016-12|Anti-social behav...|
|2016-12|Anti-social behav...|
|2016-12|       Bicycle theft|
|2016-12|Criminal damage a...|
|2016-12|         Other theft|
|2016-12|         Other theft|
|2016-12|         Other theft|
|2016-12|         Shoplifting|
|2016-12|Violence and sexu...|
|2016-12|Violence and sexu...|
|2016-12|Violence and sexu...|
|2016-12|Anti-social behav...|
|2016-12|Anti-social behav...|
+-------+--------------------+
only showing top 20 rows



In [37]:
df.printSchema()

root
 |-- Crime ID: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Reported by: string (nullable = true)
 |-- Falls within: string (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- LSOA code: string (nullable = true)
 |-- LSOA name: string (nullable = true)
 |-- Crime type: string (nullable = true)
 |-- Last outcome category: string (nullable = true)
 |-- Context: string (nullable = true)



In [38]:
# convert A as string
from pyspark.sql.types import *
df.withColumn('Latitude',df['Latitude'].cast(StringType()))

DataFrame[Crime ID: string, Month: string, Reported by: string, Falls within: string, Longitude: double, Latitude: string, Location: string, LSOA code: string, LSOA name: string, Crime type: string, Last outcome category: string, Context: string]

In [39]:
df = df.withColumn('Month',df['Month'].cast(DateType()))
df.printSchema()

root
 |-- Crime ID: string (nullable = true)
 |-- Month: date (nullable = true)
 |-- Reported by: string (nullable = true)
 |-- Falls within: string (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- LSOA code: string (nullable = true)
 |-- LSOA name: string (nullable = true)
 |-- Crime type: string (nullable = true)
 |-- Last outcome category: string (nullable = true)
 |-- Context: string (nullable = true)



In [40]:
from pyspark.sql.functions import mean, min, max
df.select(min('Month'),max('Month')).show()

+----------+----------+
|min(Month)|max(Month)|
+----------+----------+
|2016-12-01|2016-12-01|
+----------+----------+



see https://databricks.com/blog/2015/06/02/statistical-and-mathematical-functions-with-dataframes-in-spark.html

In [41]:
df.stat.crosstab("LSOA name", "Crime Type").show() 

+--------------------+---------------------+-------------+--------+-------------------------+-----+-----------+-----------+---------------------+------------+-------+-----------+---------------------+-------------+----------------------------+
|LSOA name_Crime Type|Anti-social behaviour|Bicycle theft|Burglary|Criminal damage and arson|Drugs|Other crime|Other theft|Possession of weapons|Public order|Robbery|Shoplifting|Theft from the person|Vehicle crime|Violence and sexual offences|
+--------------------+---------------------+-------------+--------+-------------------------+-----+-----------+-----------+---------------------+------------+-------+-----------+---------------------+-------------+----------------------------+
|South Gloucesters...|                    1|            0|       0|                        0|    0|          0|          2|                    0|           0|      0|          0|                    0|            0|                           5|
|         Mendip 002A|  

We can go to http://bigdata1.sheffield.ac.uk:50070/explorer.html#/data/ukpolice to see the hfs filesystem

In [45]:
df.filter(df["Crime type"] == 'Burglary').select(df['Crime type'],df.Latitude,df.Longitude).show()

+----------+---------+---------+
|Crime type| Latitude|Longitude|
+----------+---------+---------+
|  Burglary| 51.41364|-2.498127|
|  Burglary|51.395315|-2.391594|
|  Burglary|51.392676|-2.350423|
|  Burglary|51.388973|-2.352608|
|  Burglary|51.391003|-2.356346|
|  Burglary|51.391003|-2.356346|
|  Burglary|51.386211|-2.359211|
|  Burglary|51.380421|-2.358907|
|  Burglary|51.386211|-2.359211|
|  Burglary|51.389609|-2.390367|
|  Burglary|51.383968| -2.36339|
|  Burglary|51.381522|-2.363543|
|  Burglary| 51.38013|-2.365745|
|  Burglary|51.383162|-2.368039|
|  Burglary|51.383414|-2.370958|
|  Burglary|51.390704|-2.319911|
|  Burglary|51.328447|-2.371088|
|  Burglary|51.323676|-2.369973|
|  Burglary| 51.37917|-2.392864|
|  Burglary|51.374852|-2.382799|
+----------+---------+---------+
only showing top 20 rows



In [46]:
df.filter(df["Crime type"] == 'Burglary').select(df['Crime type'],df.Latitude,df.Longitude).explain()

== Physical Plan ==
*Project [Crime type#9, Latitude#5, Longitude#4]
+- *Filter (isnotnull(Crime type#9) && (Crime type#9 = Burglary))
   +- *FileScan csv [Longitude#4,Latitude#5,Crime type#9] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/alessandro/Desktop/pyspark-tutorial-solved-master 2/crime.csv], PartitionFilters: [], PushedFilters: [IsNotNull(Crime type), EqualTo(Crime type,Burglary)], ReadSchema: struct<Longitude:double,Latitude:double,Crime type:string>


In [47]:
#or equivalently
sqlContext.registerDataFrameAsTable(df, "table1")
sqlContext.sql('select `Crime type` Latitude, Longitude from table1 where `Crime type` == "Burglary"').show()

+--------+---------+
|Latitude|Longitude|
+--------+---------+
|Burglary|-2.498127|
|Burglary|-2.391594|
|Burglary|-2.350423|
|Burglary|-2.352608|
|Burglary|-2.356346|
|Burglary|-2.356346|
|Burglary|-2.359211|
|Burglary|-2.358907|
|Burglary|-2.359211|
|Burglary|-2.390367|
|Burglary| -2.36339|
|Burglary|-2.363543|
|Burglary|-2.365745|
|Burglary|-2.368039|
|Burglary|-2.370958|
|Burglary|-2.319911|
|Burglary|-2.371088|
|Burglary|-2.369973|
|Burglary|-2.392864|
|Burglary|-2.382799|
+--------+---------+
only showing top 20 rows



In [48]:
sqlContext.sql('select `Crime type` Latitude, Longitude from table1 where `Crime type` == "Burglary"').explain()

== Physical Plan ==
*Project [Crime type#9 AS Latitude#265, Longitude#4]
+- *Filter (isnotnull(Crime type#9) && (Crime type#9 = Burglary))
   +- *FileScan csv [Longitude#4,Crime type#9] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/alessandro/Desktop/pyspark-tutorial-solved-master 2/crime.csv], PartitionFilters: [], PushedFilters: [IsNotNull(Crime type), EqualTo(Crime type,Burglary)], ReadSchema: struct<Longitude:double,Crime type:string>


## Caching

In [49]:
df.cache() # put the df in cache and results will be cached too (try to run a count twice after this)
df.count()

13750

In [50]:
df.count()

13750

In [51]:
# adding columns and keeping existing ones F.lit(0) return a column
from pyspark.sql import functions as F
df.withColumn('zero', F.lit(0))
df.select('Longitude','Latitude').withColumn('Longitude_times_two', df.Longitude * 2).show()

+---------+---------+-------------------+
|Longitude| Latitude|Longitude_times_two|
+---------+---------+-------------------+
|-2.511571|51.414895|          -5.023142|
|-2.516919|51.423683|          -5.033838|
|-2.511571|51.414895|          -5.023142|
|-2.495055|51.422132|           -4.99011|
|-2.509126|51.416137|          -5.018252|
|-2.498613|51.416002|          -4.997226|
|-2.497767|51.420232|          -4.995534|
| -2.49991|51.413623|           -4.99982|
| -2.49793|51.417966|           -4.99586|
|-2.494715|51.419948|           -4.98943|
|-2.498613|51.416002|          -4.997226|
|-2.501425|51.416692|           -5.00285|
|-2.497767|51.420232|          -4.995534|
|-2.497799|51.415233|          -4.995598|
| -2.49854|51.414618|           -4.99708|
|-2.504289| 51.41828|          -5.008578|
|-2.501425|51.416692|           -5.00285|
|-2.499922|51.417373|          -4.999844|
|-2.506762|51.409116|          -5.013524|
|-2.506762|51.409116|          -5.013524|
+---------+---------+-------------

In [52]:
from pyspark.sql.functions import col, first, last, sum, count, countDistinct, desc #*
# selecting columns, and creating new ones
df.select('Latitude', col('Latitude').alias('new_Lat'), (col('Longitude') < 0 ).alias('negative_long')).show()

+---------+---------+-------------+
| Latitude|  new_Lat|negative_long|
+---------+---------+-------------+
|51.414895|51.414895|         true|
|51.423683|51.423683|         true|
|51.414895|51.414895|         true|
|51.422132|51.422132|         true|
|51.416137|51.416137|         true|
|51.416002|51.416002|         true|
|51.420232|51.420232|         true|
|51.413623|51.413623|         true|
|51.417966|51.417966|         true|
|51.419948|51.419948|         true|
|51.416002|51.416002|         true|
|51.416692|51.416692|         true|
|51.420232|51.420232|         true|
|51.415233|51.415233|         true|
|51.414618|51.414618|         true|
| 51.41828| 51.41828|         true|
|51.416692|51.416692|         true|
|51.417373|51.417373|         true|
|51.409116|51.409116|         true|
|51.409116|51.409116|         true|
+---------+---------+-------------+
only showing top 20 rows



In [53]:
df.printSchema()

root
 |-- Crime ID: string (nullable = true)
 |-- Month: date (nullable = true)
 |-- Reported by: string (nullable = true)
 |-- Falls within: string (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- LSOA code: string (nullable = true)
 |-- LSOA name: string (nullable = true)
 |-- Crime type: string (nullable = true)
 |-- Last outcome category: string (nullable = true)
 |-- Context: string (nullable = true)



In [54]:
df.groupBy('Crime type').agg(first('Last outcome category').alias("status")).show()

+--------------------+--------------------+
|          Crime type|              status|
+--------------------+--------------------+
|       Bicycle theft| Under investigation|
|        Public order| Under investigation|
|               Drugs|Awaiting court ou...|
|         Other crime| Under investigation|
|             Robbery| Under investigation|
|Criminal damage a...| Under investigation|
|Theft from the pe...| Under investigation|
|         Shoplifting| Under investigation|
|            Burglary| Under investigation|
|         Other theft| Under investigation|
|Possession of wea...|Awaiting court ou...|
|Violence and sexu...| Under investigation|
|       Vehicle crime| Under investigation|
|Anti-social behav...|                null|
+--------------------+--------------------+



## Exercise
1. Show how many crimes we have for each crime type (hint: use groupby, agg and count)
2. Show how many *distinct*  'Last outcome category' we have for each Crime type
3. Show how many crimes we have for each LSOA code and crime type (hint: groupy by two keys)

## Exercise

1. show the LSOA names where the number of crimes is bigger than 100 (use groupby count and where)
2. sort them by count of crimes
3. see help(df.stat.freqItems) and show the crimes and lsoa name appearing  more than 30% (hint support is 0.3, use show(truncate=False) to see the result)

# Twitter Data

In [None]:
df = sqlContext.read.json('/data/INF6032Coursework/statuses.log.2014-12-30.gz') # you can use wildcards to load multiple files

In [82]:
df = df.limit(5000)

In [141]:
df2 = df2.select(["user","entities", "lang", "retweeted", "favorited","text"])

In [142]:
df2.printSchema()

root
 |-- user: struct (nullable = true)
 |    |-- contributors_enabled: boolean (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- default_profile: boolean (nullable = true)
 |    |-- default_profile_image: boolean (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- favourites_count: long (nullable = true)
 |    |-- follow_request_sent: string (nullable = true)
 |    |-- followers_count: long (nullable = true)
 |    |-- following: string (nullable = true)
 |    |-- friends_count: long (nullable = true)
 |    |-- geo_enabled: boolean (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- id_str: string (nullable = true)
 |    |-- is_translator: boolean (nullable = true)
 |    |-- lang: string (nullable = true)
 |    |-- listed_count: long (nullable = true)
 |    |-- location: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- notifications: string (nullable = true)
 |    |-- profile_background_color: str

In [143]:
# view the contents of a column
print(df2.select(['user.id','text']).show())

+----------+--------------------+
|        id|                text|
+----------+--------------------+
| 320075522|2.5C 85%RH
WChill...|
|1464742728|RT @johnny_cidade...|
|1729753068|RT @basakburcuyuz...|
| 384026199|@willylevy29 te e...|
| 211369327|@elNinoRodriguez ...|
| 134842190|RT @XimePonceLeon...|
|  16409225|RT @ComplexMag: R...|
| 510414395|@maumachado7 cheg...|
| 709905230|watching $XOXO ha...|
| 231423897|@ddlovato I LOVE ...|
|2891486949|"Osmanlının eski ...|
| 394122530|RT @AtillaTasNet:...|
| 111785560|               Voooa|
|2613730124|@Leeds_Demon @Rev...|
| 349172091|http://t.co/xkAbL...|
| 275424344|{وإذ صرفنا إليك ن...|
|2892644994|اللهمّ زد قلبي حب...|
|  45019996|Apple to keep cen...|
|2873037495|RT @bi3f2: أرخص ا...|
|2283309781|RT @Laudya_Cynth1...|
+----------+--------------------+
only showing top 20 rows

None


In [144]:
# list the most common languages
print(df.groupby('lang').count().sort(desc('count')).show())

+----+-----+
|lang|count|
+----+-----+
|null| 2750|
|  en|  864|
|  ja|  332|
|  es|  254|
|  ar|  201|
|  pt|  137|
| und|  108|
|  ru|   78|
|  fr|   62|
|  in|   61|
|  tr|   40|
|  it|   28|
|  ko|   14|
|  nl|   10|
|  et|   10|
|  de|    9|
|  tl|    6|
|  pl|    6|
|  zh|    3|
|  bg|    3|
+----+-----+
only showing top 20 rows

None


In [145]:
# get a dataframe of the user data
user_df = df2.select('user.*')

In [146]:
user_df.select(["id","created_at","location","screen_name"]).show()

+----------+--------------------+--------------------+---------------+
|        id|          created_at|            location|    screen_name|
+----------+--------------------+--------------------+---------------+
| 320075522|Sun Jun 19 07:49:...|South Lakes, Cumbria| CumbriaWeather|
|1464742728|Tue May 28 14:02:...|          margem sul| barbarafchanel|
|1729753068|Wed Sep 04 20:11:...|                    |        biyokin|
| 384026199|Sun Oct 02 23:10:...|          Araraquara|      patyleyte|
| 211369327|Wed Nov 03 01:58:...|        Buenos Aires|    condobleene|
| 134842190|Mon Apr 19 16:10:...|                    |GinaGodoyAndrad|
|  16409225|Mon Sep 22 20:19:...|           Same City|       aehorton|
| 510414395|Thu Mar 01 15:37:...|      Uruguay/Rivera|       jorge_o7|
| 709905230|Sun Jul 22 01:56:...|    Neckar York City|  NeckarPlunger|
| 231423897|Tue Dec 28 13:43:...|                    |   lovaticadeIa|
|2891486949|Tue Nov 25 01:01:...|                bakü|     TalanFahri|
| 3941

In [147]:
# count of tweets per user
tweets = user_df.groupby('screen_name').count()

In [148]:
# show how many times the most active users have tweeted
tweets.sort(desc('count')).show()

+---------------+-----+
|    screen_name|count|
+---------------+-----+
|     VxTwitPr05|    4|
|      lawleywow|    3|
|      KisTamaRT|    2|
|      abboafeef|    2|
|         AntNom|    2|
|        imxeex_|    2|
|    funnyfranta|    2|
|    clemensfalk|    2|
|     Unbrokkken|    2|
| akuinisiapalah|    2|
|dameLyudmilabot|    2|
|DeadpoolTheBest|    2|
|        drodhen|    2|
|    seashell_98|    2|
|   Domipictures|    2|
| CumbriaWeather|    1|
| barbarafchanel|    1|
|        biyokin|    1|
|    condobleene|    1|
|      patyleyte|    1|
+---------------+-----+
only showing top 20 rows



In [149]:
# show the mean number of tweets per person
tweeters.select(mean('count')).show()

+-----------------+
|       avg(count)|
+-----------------+
|2.239140170174653|
+-----------------+



In [150]:
# get a dataframe of the users and the retweeted / favorited status
# This allows us to select values that are contained in nested dataframes
refined_df = df.select(['user.favourites_count', 'user.followers_count',
                        'user.friends_count', 'user.statuses_count', 'lang',
                        'retweeted', 'favorited'])
refined_df.describe().show()

+-------+------------------+-----------------+-----------------+------------------+----+
|summary|  favourites_count|  followers_count|    friends_count|    statuses_count|lang|
+-------+------------------+-----------------+-----------------+------------------+----+
|  count|              2250|             2250|             2250|              2250|2250|
|   mean|3638.6657777777777|4495.716444444444|1364.255111111111|24759.810222222222|null|
| stddev| 15508.34313676943|35566.39674557091|5158.005672569562| 62757.04774486603|null|
|    min|                 0|                0|                0|                 1|  ar|
|    max|            391348|           901263|            84651|           1132818|  zh|
+-------+------------------+-----------------+-----------------+------------------+----+



In [156]:
#match hashtag or specific text
df2.filter(col('text').rlike("(?i)(test|#hi).*")).show()

+--------------------+--------------------+----+---------+---------+--------------------+
|                user|            entities|lang|retweeted|favorited|                text|
+--------------------+--------------------+----+---------+---------+--------------------+
|[false,Tue Dec 09...|[WrappedArray([Wr...| und|    false|    false|@VxTwitPr06 Hi @V...|
|[false,Tue Dec 09...|[WrappedArray([Wr...| und|    false|    false|@VxTwitPr06 Hi @V...|
|[false,Tue Dec 09...|[WrappedArray([Wr...| und|    false|    false|@VxTwitPr06 Hi @V...|
|[false,Fri Jan 31...|[WrappedArray([Wr...|  in|    false|    false|Berharap tetap di...|
|[false,Tue Dec 09...|[WrappedArray([Wr...| und|    false|    false|@VxTwitPr06 Hi @V...|
|[false,Wed May 22...|[WrappedArray(),n...|  en|    false|    false|RT @bendragonborn...|
|[false,Mon Dec 22...|[WrappedArray([Wr...|  en|    false|    false|@KenyanExport Can...|
|[false,Tue Dec 09...|[WrappedArray(),n...|  en|    false|    false|RT @BobBurg: The ...|
|[false,Mo

see http://www.regex101.com

## Exercise

1. Create a dataframe (from df) containing only the user information (nested) and the columns "retweeted" and "text"
2. remove the rows that are null in the column "text"
3. filter only the rows that has "retweeted" == False
4. show the text of the previous step
5. show the text from previous step that contains the word johnny

## File creation and batch submission
- create a text file with the following text:


```
"""Example code for running pyspark on the twitter data set."""

from __future__ import print_function
import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext, DataFrame
from pyspark.sql.functions import desc, mean

sc = SparkContext()
sqlContext = SQLContext(sc)

df = sqlContext.read.json('/data/INF6032Coursework/statuses.log.2014-12-30.gz')

# clean df and select the columns are are interested in
df = df.na.drop(subset=["user.id"]).select(["user","entities", "lang", "retweeted", "favorited"])

# take only the first 5000 rows
df = df.limit(5000)

print(df.show())
```

- add your code to the end
- name it with .py extension
- upload it to http://bigdata1.sheffield.ac.uk:8888/filebrowser/
- from the console, exit pyspark with exit() and download the file with
```
hdfs dfs -get /user/username/script.py
```
- run your script with  spark-submit script.py