In [1]:
dataset = spark.read.json("/root/twitter.medium")

In [24]:
dataset.registerTempTable("tweets")

In [28]:
dataset.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)
 |    |    |    |-- expanded_url: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- id_str: string (nullable = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true

In [19]:
#1. Tweets-oriented analyses
# - number of tweets per day, per month, per year
from datetime import datetime
from operator import add

def convert_time(field):
    date_parts = field.split()
    date = "%s/%s/%s:%s" % (date_parts[2],
                            date_parts[1],
                            date_parts[5],
                            date_parts[3])
    return datetime.strptime(date, '%d/%b/%Y:%H:%M:%S')

dataset.rdd.map(lambda x: ((convert_time(x.created_at)\
                            .strftime("%d"), 
                            convert_time(x.created_at)\
                            .strftime("%b"),
                            convert_time(x.created_at)\
                            .strftime("%Y")), 1))\
.reduceByKey(add)\
.takeOrdered(20, lambda x: -x[1])

[(('03', 'Mar', '2013'), 819),
 (('17', 'Mar', '2013'), 754),
 (('10', 'Mar', '2013'), 741),
 (('09', 'Mar', '2013'), 634),
 (('02', 'Mar', '2013'), 619),
 (('16', 'Mar', '2013'), 585),
 (('04', 'Mar', '2013'), 551),
 (('11', 'Mar', '2013'), 533),
 (('18', 'Mar', '2013'), 476),
 (('01', 'Mar', '2013'), 466),
 (('07', 'Mar', '2013'), 452),
 (('08', 'Mar', '2013'), 448),
 (('05', 'Mar', '2013'), 419),
 (('06', 'Mar', '2013'), 383),
 (('15', 'Mar', '2013'), 371),
 (('12', 'Mar', '2013'), 354),
 (('13', 'Mar', '2013'), 349),
 (('19', 'Mar', '2013'), 348),
 (('14', 'Mar', '2013'), 335),
 (('28', 'Feb', '2013'), 220)]

In [18]:
#1. Tweets-oriented analyses
# - number of tweets with interactions per day, per month, per year

from datetime import datetime
from operator import add


def convert_time(field):
    date_parts = field.split()
    date = "%s/%s/%s:%s" % (date_parts[2],
                            date_parts[1],
                            date_parts[5],
                            date_parts[3])
    return datetime.strptime(date, '%d/%b/%Y:%H:%M:%S')

def interactions(field1,field2):
    inter = field1 + field2
    return inter

dataset.select(dataset.created_at.alias("tweet"),
               interactions(dataset.retweet_count,dataset.favorite_count).alias("interactions"))\
.rdd.map(lambda x: (convert_time(x[0]).strftime("%d-%b-%Y"),x[1]))\
.reduceByKey(add).takeOrdered(20,lambda x: -x[1])

[('18-Mar-2013', 82),
 ('10-Mar-2013', 35),
 ('16-Mar-2013', 31),
 ('02-Mar-2013', 31),
 ('09-Mar-2013', 30),
 ('12-Mar-2013', 29),
 ('03-Mar-2013', 25),
 ('19-Mar-2013', 25),
 ('11-Mar-2013', 24),
 ('17-Mar-2013', 23),
 ('01-Mar-2013', 22),
 ('14-Mar-2013', 20),
 ('04-Mar-2013', 19),
 ('05-Mar-2013', 17),
 ('08-Mar-2013', 16),
 ('15-Mar-2013', 15),
 ('07-Mar-2013', 14),
 ('13-Mar-2013', 9),
 ('28-Feb-2013', 7),
 ('06-Mar-2013', 3)]

In [22]:
#2. Movies-oriented analyses
#number of tweets about every single movie, per day and per month

from datetime import datetime
from operator import add

def convert_time(field):
    date_parts = field.split()
    date = "%s/%s/%s:%s" % (date_parts[2],
                            date_parts[1],
                            date_parts[5],
                            date_parts[3])
    return datetime.strptime(date, '%d/%b/%Y:%H:%M:%S')

dataset.rdd.map(lambda x: ((x.entities.urls[0].expanded_url,
                            convert_time(x.created_at)\
                            .strftime("%d-%b")),1))\
.reduceByKey(add)\
.takeOrdered(20, lambda x: -x[1])

[((u'http://www.imdb.com/title/tt1623205', '10-Mar'), 61),
 ((u'http://www.imdb.com/title/tt1623205', '09-Mar'), 55),
 ((u'http://www.imdb.com/title/tt1024648', '03-Mar'), 43),
 ((u'http://www.imdb.com/title/tt1623205', '11-Mar'), 40),
 ((u'http://www.imdb.com/title/tt1024648', '02-Mar'), 32),
 ((u'http://www.imdb.com/title/tt1623205', '17-Mar'), 28),
 ((u'http://www.imdb.com/title/tt1024648', '01-Mar'), 26),
 ((u'http://www.imdb.com/title/tt1623205', '08-Mar'), 26),
 ((u'http://www.imdb.com/title/tt1623205', '13-Mar'), 25),
 ((u'http://www.imdb.com/title/tt1623205', '12-Mar'), 24),
 ((u'http://www.imdb.com/title/tt1623205', '16-Mar'), 24),
 ((u'http://www.imdb.com/title/tt1024648', '04-Mar'), 22),
 ((u'http://www.imdb.com/title/tt1907668', '03-Mar'), 21),
 ((u'http://www.imdb.com/title/tt1045658', '03-Mar'), 21),
 ((u'http://www.imdb.com/title/tt1024648', '09-Mar'), 21),
 ((u'http://www.imdb.com/title/tt1024648', '10-Mar'), 20),
 ((u'http://www.imdb.com/title/tt1045658', '10-Mar'), 19

In [3]:
#2. Movies-oriented analyses
# amount of intractions a movie recieves

from operator import add

def interactions(field1,field2):
    inter = field1 + field2
    return inter

tmp = dataset.select(dataset.entities.urls[0].display_url.alias("movie"),
               dataset.favorite_count.alias("favorite_count"),
               dataset.retweet_count.alias("retweet_count"))\
.rdd.map(lambda x: ((x[0]),interactions(x[1],x[2])))\
.reduceByKey(add)
tmp.takeOrdered(20, lambda x: -x[1])

[(u'imdb.com/title/tt0434139', 61),
 (u'imdb.com/title/tt1045658', 19),
 (u'imdb.com/title/tt1623205', 18),
 (u'imdb.com/title/tt1707386', 18),
 (u'imdb.com/title/tt0454876', 15),
 (u'imdb.com/title/tt1047011', 13),
 (u'imdb.com/title/tt1024648', 11),
 (u'imdb.com/title/tt1659337', 9),
 (u'imdb.com/title/tt0840361', 9),
 (u'imdb.com/title/tt1389096', 9),
 (u'imdb.com/title/tt1966604', 7),
 (u'imdb.com/title/tt0151804', 6),
 (u'imdb.com/title/tt1637725', 5),
 (u'imdb.com/title/tt1772341', 5),
 (u'imdb.com/title/tt1673434', 5),
 (u'imdb.com/title/tt1853728', 5),
 (u'imdb.com/title/tt0838283', 5),
 (u'imdb.com/title/tt0110912', 5),
 (u'imdb.com/title/tt2053463', 4),
 (u'imdb.com/title/tt1371111', 4)]

In [27]:
#2. Movies-oriented analyses
# Popularity of each movie among different language speakers:
spark.sql("select lang,entities.urls[0].expanded_url as URL, count(1) as cnt "+\
               "from tweets "+\
               "group by lang,entities.urls[0].expanded_url "+\
               "order by cnt desc").show(20, False)

+----+-----------------------------------+---+
|lang|URL                                |cnt|
+----+-----------------------------------+---+
|en  |http://www.imdb.com/title/tt1623205|356|
|en  |http://www.imdb.com/title/tt1024648|279|
|en  |http://www.imdb.com/title/tt1045658|178|
|en  |http://www.imdb.com/title/tt0454876|155|
|en  |http://www.imdb.com/title/tt1790885|127|
|en  |http://www.imdb.com/title/tt1772341|95 |
|en  |http://www.imdb.com/title/tt1907668|90 |
|en  |http://www.imdb.com/title/tt1074638|81 |
|en  |http://www.imdb.com/title/tt1659337|72 |
|sk  |http://www.imdb.com/title/tt1707386|70 |
|en  |http://www.imdb.com/title/tt1351685|70 |
|sk  |http://www.imdb.com/title/tt1853728|69 |
|en  |http://www.imdb.com/title/tt2053463|68 |
|en  |http://www.imdb.com/title/tt0903624|63 |
|en  |http://www.imdb.com/title/tt1853728|63 |
|en  |http://www.imdb.com/title/tt2023587|62 |
|en  |http://www.imdb.com/title/tt1606378|54 |
|en  |http://www.imdb.com/title/tt0443272|53 |
|en  |http://

In [55]:
#user oriented analyses
#number of followers, favourites, statuses and listings of all users.

spark.sql("select user.followers_count"+\
          ",user.favourites_count "+\
          ",user.statuses_count "+\
          ",user.screen_name "+\
          ",user.listed_count "+\
          "from tweets")\
.show()

from pyspark.sql import functions as F

dsMin = dataset.groupBy(dataset.user.screen_name.alias("user_name")).\
               agg(F.min(dataset.created_at).alias("created_at"))
    
dsMax = dataset.groupBy(dataset.user.screen_name.alias("user_name")).\
           agg(F.max(dataset.created_at).alias("created_at"))
    
dsUser = dataset.select(dataset.user.screen_name.alias("user_name"),\
                 dataset.user.followers_count,\
                 dataset.user.favourites_count,\
                 dataset.user.statuses_count,\
                 dataset.user.listed_count,\
                 dataset.created_at.alias("created_at"))

dsUserMin = dsMin.join(dsMax,((dsMin.user_name == dsMax.user_name)\
                                          & (dsMin.created_at == dsMax.created_at)),'inner')\
.drop(dsMin.user_name).drop(dsMin.created_at)
    
dsUserMax = dsUser.join(dsMax,((dsUser.user_name == dsMax.user_name)\
                                     & (dsUser.created_at == dsMax.created_at)),'inner')\
.drop(dsUser.user_name).drop(dsUser.created_at)

dsUserMin.join(dsUserMax,dsUserMin.user_name == dsUserMax.user_name,'inner').show(20,False)

+---------------+----------------+--------------+---------------+------------+
|followers_count|favourites_count|statuses_count|    screen_name|listed_count|
+---------------+----------------+--------------+---------------+------------+
|            114|             679|         47133|     Nat_ta_gun|           2|
|            151|             121|          9281|   Carterwade99|           0|
|            201|             333|          1410|       MircheBg|           6|
|             81|              23|           853|     zoltanmora|           1|
|             81|              23|           853|     zoltanmora|           1|
|             75|               0|           655|        yenda_m|           1|
|             81|              23|           853|     zoltanmora|           1|
|            137|              76|          8231|        BaderJr|           0|
|            215|              15|          1091|      paulpasia|           4|
|            130|             135|         22372|   

In [50]:

dsMin = dataset.groupBy(dataset.user.screen_name).agg(F.min(dataset.created_at)).show()
dsMax = dataset.groupBy(dataset.user.screen_name).agg(F.max(dataset.created_at)).show()

+-------------------+--------------------+
|user['screen_name']|     min(created_at)|
+-------------------+--------------------+
|    Charlie_Drummer|Sat Mar 02 00:12:...|
|         CoOLKuWaiT|Fri Mar 08 22:00:...|
|        HasanHYasar|Sat Mar 16 22:07:...|
|          JacksonSR|Mon Mar 04 21:53:...|
|            MWPHadi|Tue Mar 12 10:30:...|
|          Mr_Cheech|Mon Mar 11 04:51:...|
|          NisseBabe|Fri Mar 01 01:06:...|
|           Zandruch|Sun Mar 03 04:03:...|
|          cashkeith|Sat Mar 02 01:40:...|
|      damounnassehi|Sat Mar 16 22:40:...|
|            daslive|Wed Mar 20 03:20:...|
|            joebh2k|Sun Mar 17 14:59:...|
|          jonesn207|Wed Mar 06 02:33:...|
|       miquelrenoir|Tue Mar 12 14:01:...|
|          oisinsdad|Mon Mar 18 19:43:...|
|           rutter78|Sat Mar 09 22:19:...|
|          saidozcan|Sat Mar 09 15:43:...|
|          vdswouter|Sat Mar 02 01:32:...|
|       BriMontinaro|Fri Mar 08 16:57:...|
|         CagriOksuz|Sat Mar 09 14:33:...|
+----------

In [48]:
dsUser = dataset.select(dataset.user.screen_name,\
                        dataset.user.followers_count,\
                        dataset.user.favourites_count,\
                        dataset.user.listed_count,\
                        dataset.user.statuses_count,\
                        dataset.created_at).show()

In [49]:
dsMinJoin = dsMin.join(dsMax,((dsMin.user.screen_name == dsMax.user.screen_name)\
                               & (dsMin.created_at == dsMax.created_at)),'inner')\
                  .drop(dsMin.user.screen_name).drop(dsMin.created_at).show()
    

AttributeError: 'DataFrame' object has no attribute 'user'

In [56]:
#top 20 popular movies

from operator import add

def interactions(field1,field2):
    inter = field1 + field2
    return inter

tmp = dataset.select(dataset.entities.urls[0].display_url.alias("movie"),
               dataset.favorite_count.alias("favorite_count"),
               dataset.retweet_count.alias("retweet_count"))\
.rdd.map(lambda x: ((x[0]),interactions(x[1],x[2])))\
.reduceByKey(add)
tmp.takeOrdered(20, lambda x: -x[1])

[(u'imdb.com/title/tt0434139', 61),
 (u'imdb.com/title/tt1045658', 19),
 (u'imdb.com/title/tt1623205', 18),
 (u'imdb.com/title/tt1707386', 18),
 (u'imdb.com/title/tt0454876', 15),
 (u'imdb.com/title/tt1047011', 13),
 (u'imdb.com/title/tt1024648', 11),
 (u'imdb.com/title/tt1659337', 9),
 (u'imdb.com/title/tt0840361', 9),
 (u'imdb.com/title/tt1389096', 9),
 (u'imdb.com/title/tt1966604', 7),
 (u'imdb.com/title/tt0151804', 6),
 (u'imdb.com/title/tt1637725', 5),
 (u'imdb.com/title/tt1772341', 5),
 (u'imdb.com/title/tt1673434', 5),
 (u'imdb.com/title/tt1853728', 5),
 (u'imdb.com/title/tt0838283', 5),
 (u'imdb.com/title/tt0110912', 5),
 (u'imdb.com/title/tt2053463', 4),
 (u'imdb.com/title/tt1371111', 4)]

In [57]:
#month that had most interactions
from datetime import datetime
from operator import add


def convert_time(field):
    date_parts = field.split()
    date = "%s/%s/%s:%s" % (date_parts[2],
                            date_parts[1],
                            date_parts[5],
                            date_parts[3])
    return datetime.strptime(date, '%d/%b/%Y:%H:%M:%S')

def interactions(field1,field2):
    inter = field1 + field2
    return inter

dataset.select(dataset.created_at.alias("tweet"),
               interactions(dataset.retweet_count,dataset.favorite_count).alias("interactions"))\
.rdd.map(lambda x: (convert_time(x[0]).strftime("%b"),x[1]))\
.reduceByKey(add).takeOrdered(20,lambda x: -x[1])

[('Mar', 472), ('Feb', 7)]

In [60]:
#most popular movie in spanish speakers

spark.sql("select lang,entities.urls[0].expanded_url as URL, count(1) as cnt "+\
               "from tweets "+\
               "where lang = 'es' "+\
               "group by lang,entities.urls[0].expanded_url "+\
               "order by cnt desc").show(1,False)

+----+-----------------------------------+---+
|lang|URL                                |cnt|
+----+-----------------------------------+---+
|es  |http://www.imdb.com/title/tt1623205|4  |
+----+-----------------------------------+---+
only showing top 1 row



In [64]:
#What are the users with the most changes in numbers of followers between frst
#and the last tweet?

from pyspark.sql import functions as F
dsMin = dataset.groupBy(dataset.user.screen_name.alias("user")).\
               agg(F.min(dataset.created_at).alias("min_created_at"))
    
dsMax = dataset.groupBy(dataset.user.screen_name.alias("user")).\
           agg(F.max(dataset.created_at).alias("max_created_at"))
    
dsUser = dataset.select(dataset.user.screen_name.alias("user"),\
                 dataset.user.followers_count.alias("followers_count"),\
                 dataset.created_at.alias("created_at"))

dsUserMin = dsMin.join(dsUser,((dsMin.user == dsUser.user) & (dsMin.min_created_at == dsUser.created_at)),'inner')\
    .drop(dsMin.user).drop(dsMin.min_created_at)
    
dsUserMax = dsMax.join(dsUser,((dsMax.user == dsUser.user) & (dsMax.max_created_at == dsUser.created_at)),'inner')\
    .drop(dsMax.user).drop(dsMax.max_created_at)
    
dsUserMax.registerTempTable("max")
dsUserMin.registerTempTable("min")

spark.sql("select max.user,(max.followers_count-min.followers_count)\
           from max \
            inner join min \
                  on max.user == min.user \
              group by max.user,(max.followers_count-min.followers_count)\
          order by (max.followers_count-min.followers_count) desc").show()

+---------------+-----------------------------------+
|           user|(followers_count - followers_count)|
+---------------+-----------------------------------+
|  DevilsBallBag|                                 16|
|   TheArsenal77|                                  9|
|     carlosshue|                                  7|
|MaxLikesNOODLES|                                  5|
| stephenjcleary|                                  5|
| worldperformer|                                  4|
|     rhcp011235|                                  4|
|    VindiTweets|                                  4|
|     Jerryslife|                                  4|
|     dougjumper|                                  4|
|       ohsotony|                                  4|
|official_lostie|                                  3|
|       TheBimal|                                  3|
|   REDDEVILT101|                                  3|
|    Wigrens2002|                                  3|
|    alanocturna|           