In [8]:
from pyspark.sql import functions as F

In [9]:
dataset = spark.read.json("twitter.medium")

A function to parse date:

In [10]:
from datetime import datetime
from operator import add

def parse_date(date_field):
    date_parts = date_field.split()
    date_str = "%s/%s/%s:%s" % (date_parts[2],
                                date_parts[1],
                                date_parts[5],
                                date_parts[3])
    return datetime.strptime(date_str, '%d/%b/%Y:%H:%M:%S')

Register temp table for SQL:

In [11]:
dataset.registerTempTable("tweets")

# Tweets-oriented

Tweets per day, month, year

In [14]:

tweets_per_day = dataset.rdd.map(lambda x: (parse_date(x.created_at).strftime("%Y-%m-%d"), 1)).reduceByKey(add)
tweets_per_day.take(5)


[('2013-03-07', 452),
 ('2013-03-10', 741),
 ('2013-03-05', 419),
 ('2013-03-03', 819),
 ('2013-03-16', 585)]

Interactions per day, month, year

In [15]:
interactions_per_day = dataset.rdd.map(lambda x: (parse_date(x.user.created_at).strftime("%Y-%m-%d"), x.favorite_count + x.retweet_count)).reduceByKey(add)
interactions_per_day.take(5)

[('2011-01-14', 0),
 ('2008-10-21', 0),
 ('2008-11-28', 0),
 ('2008-08-15', 0),
 ('2010-06-16', 0)]

# Movies-oriented

Tweets per movie per day

In [None]:
movies_tweets_per_day = dataset.rdd.map(lambda x: ((x.entities.urls[0].display_url, parse_date(x.user.created_at).strftime("%Y-%m-%d")), 1)).reduceByKey(add)

Engagement per movie

In [None]:
movies_engagement = spark.sql("select entities.urls[0].display_url as movie, "+\
               "sum(favorite_count) + sum(retweet_count) as engagement "
               "from tweets "+\
               "group by entities.urls[0].display_url")

Popularity per movie per language:

In [None]:
movies_language_pop = dataset.select(F.explode("entities.urls").alias("col"), F.col("lang").alias("language"))\
    .select(F.col("col.display_url").alias("movie"), "language")\
    .groupBy("movie", "language")\
    .count()

# Users-oriented

Number of followers, favourites, statuses and listings per user, oldest and newest data:

In [None]:
user_stats = spark.sql("select x.screen_name, " \
          + "t1.user.followers_count as old_followers, t1.user.favourites_count as old_favourites, t1.user.statuses_count as old_statuses, t1.user.listed_count as old_listed, " \
          + "t2.user.followers_count as new_followers, t2.user.favourites_count as new_favourites, t2.user.statuses_count as new_statuses, t2.user.listed_count as new_listed " \
          + "from (select user.screen_name, max(created_at) as new, min(created_at) as old from tweets group by user.screen_name) x " 
          + "join tweets t1 on t1.user.screen_name = x.screen_name and t1.created_at = x.old " \
          + "join tweets t2 on t2.user.screen_name = x.screen_name and t2.created_at = x.new")
