# Datos MotoGP Qatar 2014

In [2]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is not tied to one machine; fall back to the original local
# install path when the variable is not set.
findspark.init(os.environ.get(
    "SPARK_HOME",
    "/home/jfelipe/Development/spark/spark-2.0.2-bin-hadoop2.7/"))

# pyspark is imported only after findspark.init(), keeping the original order.
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) a session that runs locally with master "local[*]".
spark = (SparkSession.builder
    .master("local[*]")
    .config("spark.driver.cores", 1)
    .appName("understanding_sparksession")
    .getOrCreate() )
# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext
print(spark)
print(sc)

<pyspark.sql.session.SparkSession object at 0x7f6db45b91d0>
<pyspark.context.SparkContext object at 0x7f6dd41cfb38>


## Lectura de eventos

En primer lugar, definimos los tipos de datos específicos para cada campo con un esquema personalizado.

In [3]:
# NOTE: later cells rely on these star imports (e.g. max, count, dayofmonth,
# hour). After this cell, `max` refers to the Spark SQL function rather than
# the Python builtin.
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Explicit schema for the tweet export: (column name, Spark type) pairs in the
# order they are declared. Every column is marked as nullable.
_schema_columns = [
    ("Id", LongType()),
    ("Parent_sys_id", StringType()),
    ("Source", StringType()),
    ("Mentions", StringType()),
    ("Target", StringType()),
    ("Name_source", StringType()),
    ("Body", StringType()),
    ("Pub_date", TimestampType()),
    ("URLs", StringType()),
    ("Tipe_action", StringType()),
    ("Link", StringType()),
    ("Has_link", ByteType()),
    ("Has_picture", ByteType()),
    ("Website", StringType()),
    ("Country", StringType()),
    ("Activity", LongType()),
    ("Followers", LongType()),
    ("Following", LongType()),
    ("Location", StringType()),
]
customSchema = StructType(
    [StructField(field_name, field_type, True)
     for field_name, field_type in _schema_columns]
)

A continuación, usamos el esquema personalizado para la lectura de los datos. Hay que tener en cuenta que puede que haya campos con datos mal formados (*malformed data lines*).

En la página de documentación del método `read.csv` (DataFrameReader csv), encontramos que el **modo por defecto de lectura de los datos es `PERMISSIVE`**. Este modo pone a `null` todos los campos del esquema cuando encuentra un registro con algún valor corrupto, y puede llegar a hacer que no leamos alguna línea completa del archivo. En nuestro caso particular, en el campo del texto del tweet aparece en ciertas ocasiones el carácter `\` como parte de emoticonos textuales o por otros motivos, justo al final del string del campo. Eso puede hacer que el parser se confunda, interpretando erróneamente `\"` como un intento de escapar la comilla doble, cuando en realidad se trata del carácter `\` seguido de la comilla de cierre del campo de texto. También podemos controlar en la exportación de datos que se escape explícitamente cualquier carácter especial como `\`, pero este posible fallo nos da la oportunidad de usar un mecanismo para resolverlo.

Si cambiamos el modo de lectura a `mode="FAILFAST"`, eso hará que **el parser falle de inmediato ante una línea con campos corruptos** (respecto del esquema que hemos proporcionado). Buscando en la traza de error, veremos la línea que produjo la excepción y podemos buscarla en el fichero de datos para solucionar el error.

In [4]:
# Load the tweet export into a DataFrame using the explicit schema. The first
# line of the file is a header and timestamps look like "23/03/2014 22:32".
csv_path = "data/ALTO-DATABASE/DATASET-Twitter-23-26-Mar-2014-MotoGP-Qatar.csv"
events = spark.read.csv(
    csv_path,
    schema=customSchema,
    header=True,
    timestampFormat="dd/MM/yyyy HH:mm",
)
# To debug malformed lines, add mode="FAILFAST" to the call above so the
# parser stops at the first corrupt record.

In [5]:
# Print column names, types and nullability to confirm the custom schema was applied.
events.printSchema()

root
 |-- Id: long (nullable = true)
 |-- Parent_sys_id: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Mentions: string (nullable = true)
 |-- Target: string (nullable = true)
 |-- Name_source: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- Pub_date: timestamp (nullable = true)
 |-- URLs: string (nullable = true)
 |-- Tipe_action: string (nullable = true)
 |-- Link: string (nullable = true)
 |-- Has_link: byte (nullable = true)
 |-- Has_picture: byte (nullable = true)
 |-- Website: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Activity: long (nullable = true)
 |-- Followers: long (nullable = true)
 |-- Following: long (nullable = true)
 |-- Location: string (nullable = true)



## Análisis descriptivo

### Conjunto de datos y valores

In [6]:
# Number of rows in the events DataFrame.
events.count()

257680

In [7]:
# Peek at the first five rows, returned as a list of Row objects.
events.take(5)

[Row(Id=600621, Parent_sys_id='sin padre', Source='Henrito_fx3', Mentions=None, Target=None, Name_source='Henrit?. ?', Body="- #henritofresh alex márquez (honda): 'he cometido un error de novato': el piloto español de moto3 alex márque... http://bit.ly/olxd7p", Pub_date=datetime.datetime(2014, 3, 23, 22, 32), URLs='http://bit.ly/olxd7p', Tipe_action='TW', Link='http://twitter.com/Henrito_fx3/statuses/447848348674838531', Has_link=1, Has_picture=0, Website='http://twitter.com/Henrito_fx3', Country='not public', Activity=82710, Followers=1485, Following=477, Location='???????? '),
 Row(Id=604076, Parent_sys_id='sin padre', Source='Henrito_fx3', Mentions=None, Target=None, Name_source='Henrit?. ?', Body='- #henritofresh con márquez, todo igual: los focos, puestos en él desde que debutara en el mundial en 2013, vo... http://bit.ly/1deycvz', Pub_date=datetime.datetime(2014, 3, 23, 22, 48), URLs='http://bit.ly/1deycvz', Tipe_action='TW', Link='http://twitter.com/Henrito_fx3/statuses/44785235

### Resumen de datos (DataFrames)

***Timestamp* y contenido del último tweet**

In [99]:
# Newest publication timestamp in the dataset. `max` is the Spark SQL
# aggregate here (brought in by the star import), and the aggregate yields a
# single row.
max_ts = events.agg(max("Pub_date")).first()[0]

# Show every tweet published at exactly that timestamp.
published_last = events["Pub_date"] == max_ts
events.select("Pub_date", "Body").filter(published_last).show()

+--------------------+--------------------+
|            Pub_date|                Body|
+--------------------+--------------------+
|2014-03-26 14:15:...|@marcmarquez93 go...|
|2014-03-26 14:15:...|@marcmarquez93 si...|
|2014-03-26 14:15:...|rt @_supportvale4...|
|2014-03-26 14:15:...|rt @jessansan: es...|
|2014-03-26 14:15:...|rt @marcmarquez93...|
|2014-03-26 14:15:...|rt @marcmarquez93...|
|2014-03-26 14:15:...|rt @marcmarquez93...|
|2014-03-26 14:15:...|rt @marcmarquez93...|
|2014-03-26 14:15:...|rt @marcmarquezte...|
|2014-03-26 14:15:...|rt @marcmarquezte...|
|2014-03-26 14:15:...|rt @motogp: rt @m...|
|2014-03-26 14:15:...|rt @motogp: so wh...|
|2014-03-26 14:15:...|rt @motogp: so wh...|
|2014-03-26 14:15:...|rt @motogp: so wh...|
|2014-03-26 14:15:...|rt @motogp: so wh...|
|2014-03-26 14:15:...|rt @motorpasion: ...|
|2014-03-26 14:15:...|rt @scottredding4...|
+--------------------+--------------------+



**Usuario con más *followers***

In [20]:
# Largest follower count seen in the dataset (Spark SQL `max`, single-row aggregate).
max_followers = events.agg(max("Followers")).first()[0]

# Account(s) holding that follower count; distinct() collapses the repeated
# rows produced when the same account appears in several tweets.
has_most_followers = events["Followers"] == max_followers
(events.select("Source", "Followers")
 .where(has_most_followers)
 .distinct()
 .show())

+------------+---------+
|      Source|Followers|
+------------+---------+
|justinbieber| 51669863|
+------------+---------+



**5 países con más tweets**

In [92]:
# Tweets per country, ignoring rows whose country is the placeholder
# "not public".
tweets_per_country = (
    events.where(events["Country"] != "not public")
    .groupBy("Country")
    .agg(count("Id").alias("tweets"))
)
# Keep the five countries with the most tweets.
tweets_per_country.orderBy(desc("tweets")).limit(5).show()

+-------+------+
|Country|tweets|
+-------+------+
|     es|172577|
|     us| 12722|
|     gb| 12588|
|     id|  8725|
|     it|  1843|
+-------+------+



**10 usuarios con más tweets**

In [97]:
# Ten accounts with the most tweets. Ordering by the tweet count alone leaves
# ties in an unspecified order, so the listing can differ from run to run;
# `Source` is added as a secondary key to make the result deterministic.
(events
 .groupBy("Source")
 .agg(count("Id").alias("tweets"))
 .orderBy(["tweets", "Source"], ascending=[False, True])
 .limit(10).show())

+------------+------+
|      Source|tweets|
+------------+------+
|  m_azharaji|   486|
|  twitMOTOGP|   401|
|   johnbokke|   297|
|qatarflights|   283|
|  box_repsol|   267|
| yolandaa_95|   185|
| AlessiaPont|   182|
| motomatters|   169|
|  MM93Lovers|   169|
|  Sonic_Moto|   165|
+------------+------+



### Estadísticas temporales (DataFrames)

**Número de tweets por día**

In [88]:
# Tweets per day of the month, in chronological order.
day_of_month = dayofmonth("Pub_date").alias("day_month")
tweets_per_day = events.groupBy(day_of_month).agg(count("Id").alias("tweets"))
tweets_per_day.orderBy("day_month").show()

+---------+------+
|day_month|tweets|
+---------+------+
|       23|154395|
|       24| 67945|
|       25| 20557|
|       26| 14783|
+---------+------+



**Número de tweets por día y hora**

In [31]:
# Tweets per (day of month, hour) bucket, in chronological order.
day_of_month = dayofmonth("Pub_date").alias("day_month")
hour_of_day = hour("Pub_date").alias("hour")
tweets_per_hour = (
    events.select("Id", "Pub_date")
    .groupBy(day_of_month, hour_of_day)
    .agg(count("Id").alias("tweets"))
)
tweets_per_hour.orderBy("day_month", "hour").show()

+---------+----+------+
|day_month|hour|tweets|
+---------+----+------+
|       23|   0|  1628|
|       23|   1|  1216|
|       23|   2|   960|
|       23|   3|   751|
|       23|   4|   595|
|       23|   5|   427|
|       23|   6|   503|
|       23|   7|   455|
|       23|   8|   971|
|       23|   9|  1355|
|       23|  10|  2080|
|       23|  11|  4733|
|       23|  12|  5194|
|       23|  13|  3745|
|       23|  14|  3436|
|       23|  15|  3645|
|       23|  16|  6977|
|       23|  17| 14959|
|       23|  18|  7289|
|       23|  19| 11940|
+---------+----+------+
only showing top 20 rows



**Número de tweets por día y hora, a partir de 2014-03-24 05:00**

In [36]:
# Tweets per (day of month, hour) bucket from 2014-03-24 05:00 onwards.
# The lower bound is compared against the full timestamp rather than against
# day-of-month and hour separately, so the filter stays correct even if the
# data were to span more than one month.
start_ts = lit("2014-03-24 05:00:00").cast("timestamp")
(events.select("Id", "Pub_date")
 .filter(col("Pub_date") >= start_ts)
 .groupBy(dayofmonth("Pub_date").alias("day_month"),
          hour("Pub_date").alias("hour"))
 .agg(count("Id").alias("tweets"))
 .orderBy("day_month", "hour").show(30) )

+---------+----+------+
|day_month|hour|tweets|
+---------+----+------+
|       24|   5|   970|
|       24|   6|  1162|
|       24|   7|  1831|
|       24|   8|  1852|
|       24|   9|  1774|
|       24|  10|  2415|
|       24|  11|  3376|
|       24|  12|  2590|
|       24|  13|  5332|
|       24|  14|  3984|
|       24|  15|  2903|
|       24|  16|  2277|
|       24|  17|  2678|
|       24|  18|  1830|
|       24|  19|  1597|
|       24|  20|  1746|
|       24|  21|  2303|
|       24|  22|  1679|
|       24|  23|  1124|
|       25|   0|  1002|
|       25|   1|   582|
|       25|   2|   382|
|       25|   3|   369|
|       25|   4|   348|
|       25|   5|   340|
|       25|   6|   385|
|       25|   7|   526|
|       25|   8|   560|
|       25|   9|   613|
|       25|  10|   941|
+---------+----+------+
only showing top 30 rows



### Resumen de datos (SQL)

In [8]:
# Register the DataFrame as a temporary view named "events" so it can be queried with spark.sql.
events.createOrReplaceTempView("events")

***Timestamp* y contenido del último tweet**

In [12]:
# Tweets published at the newest timestamp in the view, capped at five rows.
latest_tweet_query = """
SELECT Pub_date, Body
FROM events
WHERE Pub_date = (SELECT MAX(Pub_date) from events)
LIMIT 5
"""
spark.sql(latest_tweet_query).show()

+--------------------+--------------------+
|            Pub_date|                Body|
+--------------------+--------------------+
|2014-03-26 14:15:...|@marcmarquez93 go...|
|2014-03-26 14:15:...|@marcmarquez93 si...|
|2014-03-26 14:15:...|rt @_supportvale4...|
|2014-03-26 14:15:...|rt @jessansan: es...|
|2014-03-26 14:15:...|rt @marcmarquez93...|
+--------------------+--------------------+



**Usuario con más *followers***

In [86]:
# Account(s) with the highest follower count. DISTINCT applies to the whole
# selected row (Source, Followers), not to a single column, so it is written
# without function-call parentheses.
spark.sql("""
SELECT DISTINCT Source, Followers
FROM events
WHERE Followers = (SELECT MAX(Followers) FROM events)
""").show()

+------------+---------+
|      Source|Followers|
+------------+---------+
|justinbieber| 51669863|
+------------+---------+



**5 países con más tweets**

In [94]:
# Five countries with the most tweets, skipping rows whose country is the
# placeholder "not public".
top_countries_query = """
SELECT Country, COUNT(Id) AS `tweets`
FROM events
WHERE Country != "not public"
GROUP BY Country
ORDER BY tweets DESC
LIMIT 5
"""
spark.sql(top_countries_query).show()

+-------+------+
|Country|tweets|
+-------+------+
|     es|172577|
|     us| 12722|
|     gb| 12588|
|     id|  8725|
|     it|  1843|
+-------+------+



**10 usuarios con más tweets**

In [95]:
# Ten accounts with the most tweets. Source is used as a secondary sort key so
# that accounts tied on tweet count are always listed in the same order.
spark.sql("""
SELECT Source, COUNT(Id) AS `tweets`
FROM events
GROUP BY Source
ORDER BY tweets DESC, Source ASC
LIMIT 10
""").show()

+------------+------+
|      Source|tweets|
+------------+------+
|  m_azharaji|   486|
|  twitMOTOGP|   401|
|   johnbokke|   297|
|qatarflights|   283|
|  box_repsol|   267|
| yolandaa_95|   185|
| AlessiaPont|   182|
|  MM93Lovers|   169|
| motomatters|   169|
|  Sonic_Moto|   165|
+------------+------+



### Estadísticas temporales (SQL)

**Número de tweets por día**

In [27]:
# Tweets per day of the month, in chronological order.
tweets_per_day_query = """
SELECT DAYOFMONTH(Pub_date) AS `day_month`,
COUNT(id) AS `tweets`
FROM events
GROUP BY DAYOFMONTH(Pub_date)
ORDER BY DAYOFMONTH(Pub_date)
"""
spark.sql(tweets_per_day_query).show()

+---------+------+
|day_month|tweets|
+---------+------+
|       23|154395|
|       24| 67945|
|       25| 20557|
|       26| 14783|
+---------+------+



**Número de tweets por día y hora**

In [26]:
# Tweets per (day of month, hour) bucket, in chronological order.
tweets_per_hour_query = """
SELECT DAYOFMONTH(Pub_date) AS `day_month`, HOUR(Pub_date) as `hour`,
COUNT(id) AS `tweets`
FROM events
GROUP BY DAYOFMONTH(Pub_date), HOUR(Pub_date)
ORDER BY DAYOFMONTH(Pub_date), HOUR(Pub_date)
"""
spark.sql(tweets_per_hour_query).show()

+---------+----+------+
|day_month|hour|tweets|
+---------+----+------+
|       23|   0|  1628|
|       23|   1|  1216|
|       23|   2|   960|
|       23|   3|   751|
|       23|   4|   595|
|       23|   5|   427|
|       23|   6|   503|
|       23|   7|   455|
|       23|   8|   971|
|       23|   9|  1355|
|       23|  10|  2080|
|       23|  11|  4733|
|       23|  12|  5194|
|       23|  13|  3745|
|       23|  14|  3436|
|       23|  15|  3645|
|       23|  16|  6977|
|       23|  17| 14959|
|       23|  18|  7289|
|       23|  19| 11940|
+---------+----+------+
only showing top 20 rows



**Número de tweets por día y hora, a partir de 2014-03-24 05:00**

In [33]:
# Tweets per (day of month, hour) bucket from 2014-03-24 05:00 onwards.
# The lower bound is compared against the full timestamp rather than against
# day-of-month and hour separately, so the filter stays correct even if the
# data were to span more than one month.
spark.sql("""
SELECT DAYOFMONTH(Pub_date) AS `day_month`, HOUR(Pub_date) as `hour`,
COUNT(Id) AS `tweets`
FROM events
WHERE Pub_date >= CAST('2014-03-24 05:00:00' AS TIMESTAMP)
GROUP BY DAYOFMONTH(Pub_date), HOUR(Pub_date)
ORDER BY DAYOFMONTH(Pub_date), HOUR(Pub_date)
""").show(30)

+---------+----+------+
|day_month|hour|tweets|
+---------+----+------+
|       24|   5|   970|
|       24|   6|  1162|
|       24|   7|  1831|
|       24|   8|  1852|
|       24|   9|  1774|
|       24|  10|  2415|
|       24|  11|  3376|
|       24|  12|  2590|
|       24|  13|  5332|
|       24|  14|  3984|
|       24|  15|  2903|
|       24|  16|  2277|
|       24|  17|  2678|
|       24|  18|  1830|
|       24|  19|  1597|
|       24|  20|  1746|
|       24|  21|  2303|
|       24|  22|  1679|
|       24|  23|  1124|
|       25|   0|  1002|
|       25|   1|   582|
|       25|   2|   382|
|       25|   3|   369|
|       25|   4|   348|
|       25|   5|   340|
|       25|   6|   385|
|       25|   7|   526|
|       25|   8|   560|
|       25|   9|   613|
|       25|  10|   941|
+---------+----+------+
only showing top 30 rows

