In [1]:
from pyspark.sql import Row
import datetime
import re

month_map = {'Jan': 1, 'Feb': 2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7,
    'Aug':8,  'Sep': 9, 'Oct':10, 'Nov': 11, 'Dec': 12}

def parse_apache_time(s):
    return datetime.datetime(int(s[7:11]),
                             month_map[s[3:6]],
                             int(s[0:2]),
                             int(s[12:14]),
                             int(s[15:17]),
                             int(s[18:20]))

In [2]:
APACHE_ACCESS_LOG_PATTERN =\
    '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\d+|-)'

def parseApacheLogLine(logline):
    
    match = re.search(APACHE_ACCESS_LOG_PATTERN, logline)
    if match is None:
        return (logline, 0)
    size_field = match.group(9)
    if size_field == '-':
        size = int(0)
    else:
        size = int(match.group(9))
    return (Row(
        ip                = match.group(1),
        cliente           = match.group(2),
        id_usuario        = match.group(3),
        fecha_hora        = parse_apache_time(match.group(4)),
        metodo            = match.group(5),
        url               = match.group(6),
        protocolo         = match.group(7),
        cod_respuesta     = int(match.group(8)),
        tamano_contenido  = size
    ), 1)

In [3]:
from pyspark import SparkContext
sc = SparkContext()

In [4]:
def parseLogs():
    """ Read and parse log file """
    parsed_logs = (sc
                   .textFile('access_log_Aug95')
                   .map(parseApacheLogLine)
                   .cache())

    access_logs = (parsed_logs
                   .filter(lambda s: s[1] == 1)
                   .map(lambda s: s[0])
                   .cache())

    failed_logs = (parsed_logs
                   .filter(lambda s: s[1] == 0)
                   .map(lambda s: s[0])
                   .cache())  
    
    failed_logs_count = failed_logs.count()

    print ('Read %d lines, successfully parsed %d lines, failed to parse %d lines' % 
           (parsed_logs.count(), access_logs.count(), failed_logs.count()))
    
    parsed_logs.unpersist()
    failed_logs.unpersist()
    
    return access_logs

In [5]:
from pyspark.sql import SparkSession
#create a SparkSession
spark = (SparkSession
    .builder
    .appName("Un poquito de Spark")
    .getOrCreate())

In [6]:
acces_log = parseLogs()

df = spark.createDataFrame(acces_log).cache()

acces_log.unpersist()

Read 1569898 lines, successfully parsed 1569003 lines, failed to parse 895 lines


PythonRDD[3] at RDD at PythonRDD.scala:53

In [7]:
df.printSchema()

root
 |-- ip: string (nullable = true)
 |-- cliente: string (nullable = true)
 |-- id_usuario: string (nullable = true)
 |-- fecha_hora: timestamp (nullable = true)
 |-- metodo: string (nullable = true)
 |-- url: string (nullable = true)
 |-- protocolo: string (nullable = true)
 |-- cod_respuesta: long (nullable = true)
 |-- tamano_contenido: long (nullable = true)



#### Guardaremos nuestro nuevo DataFrame ya estructurado en formato parquet. Y de este leeremos para realizar nuestro análisis.

In [8]:
(df.write
  .mode('overwrite') # or append  
  .format('parquet') # this is optional, parquet is default
  .option('path', "Resultados/df_parquet")
  .save())

In [9]:
(spark
   .read.parquet("Resultados/df_parquet")
   .createOrReplaceTempView("A"))

# Analisis

### ¿Cuáles son los distintos protocolos web utilizados? Agrúpalos.

#### PySpark

In [10]:
from pyspark.sql import functions as F

(df
 .groupBy("protocolo")
 .count()
 .orderBy("count",ascending = False)
 .where(F.col("protocolo") != "")
 .show())

+---------+-------+
|protocolo|  count|
+---------+-------+
| HTTP/1.0|1566969|
|HTTP/V1.0|    163|
|        a|      1|
+---------+-------+



In [11]:
df.where(F.col("protocolo") == "a").toPandas()

Unnamed: 0,ip,cliente,id_usuario,fecha_hora,metodo,url,protocolo,cod_respuesta,tamano_contenido
0,dsl.rhilinet.gov,-,-,1995-08-16 11:17:54,GET,"/software/winvn/winvn.html>WinVN</a>,",a,404,0


#### SQL

In [12]:
spark.sql("""select protocolo,
                    count(protocolo) as n_registros
             from A 
             where protocolo != ''
             group by protocolo
             order by n_registros desc
             
             """).show()

+---------+-----------+
|protocolo|n_registros|
+---------+-----------+
| HTTP/1.0|    1566969|
|HTTP/V1.0|        163|
|        a|          1|
+---------+-----------+



In [13]:
spark.sql("""select * 
             from A 
             where protocolo == 'a' 
             """).show()

+----------------+-------+----------+-------------------+------+--------------------+---------+-------------+----------------+
|              ip|cliente|id_usuario|         fecha_hora|metodo|                 url|protocolo|cod_respuesta|tamano_contenido|
+----------------+-------+----------+-------------------+------+--------------------+---------+-------------+----------------+
|dsl.rhilinet.gov|      -|         -|1995-08-16 11:17:54|   GET|/software/winvn/w...|        a|          404|               0|
+----------------+-------+----------+-------------------+------+--------------------+---------+-------------+----------------+



### ¿Cuáles son los códigos de estado más comunes en la web? Agrúpalos y ordénalos para ver cuál es el más común.

#### PySpark

In [14]:
(df
 .groupBy("cod_respuesta")
 .count()
 .orderBy("count",ascending = False)
 .toPandas()
)

Unnamed: 0,cod_respuesta,count
0,200,1398207
1,304,134138
2,302,26437
3,404,10020
4,403,171
5,501,27
6,500,3


#### SQL

In [15]:
spark.sql("""select cod_respuesta,
                    count(cod_respuesta) as n_registros
             from A
             group by cod_respuesta
             order By n_registros desc""").show()

+-------------+-----------+
|cod_respuesta|n_registros|
+-------------+-----------+
|          200|    1398207|
|          304|     134138|
|          302|      26437|
|          404|      10020|
|          403|        171|
|          501|         27|
|          500|          3|
+-------------+-----------+



### ¿Y los métodos de petición (verbos) más utilizados?

#### PySpark

In [16]:
(df
 .groupBy("metodo")
 .count()
 .orderBy("metodo")
 .toPandas()
)

Unnamed: 0,metodo,count
0,GET,1564927
1,HEAD,3965
2,POST,111


#### SQL

In [17]:
spark.sql("""select metodo,
                    count(metodo) as n_registros
             from A
             group by metodo
             order By n_registros desc""").show()

+------+-----------+
|metodo|n_registros|
+------+-----------+
|   GET|    1564927|
|  HEAD|       3965|
|  POST|        111|
+------+-----------+



### ¿Qué recurso tuvo la mayor transferencia de bytes de la página web?

#### PySpark

In [18]:
(df
 .groupBy("url")
 .agg({'url': 'count',
      'tamano_contenido': 'sum'})
 .orderBy("sum(tamano_contenido)"
                  ,ascending = False)
 .select("url",
         F.col("count(url)").alias("n_registros"),
         F.col("sum(tamano_contenido)").alias("total_transferencia"))
 .toPandas()
)

Unnamed: 0,url,n_registros,total_transferencia
0,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,2158,1639380464
1,/shuttle/missions/sts-69/count69.gif,24381,1005927794
2,/shuttle/missions/sts-69/movies/sts-69-rollbac...,729,512058235
3,/shuttle/technology/sts-newsref/stsref-toc.html,6516,493211198
4,/shuttle/missions/sts-69/movies/ws-animation-d...,1392,464050354
...,...,...,...
15332,/shuttle/resources/orbiters/mpta-098-logo.gif,1,0
15333,/history/apollo/sa-6/docs/,3,0
15334,/apollo13,1,0
15335,/history/apollo-13/docs/,1,0


#### SQL

In [19]:
spark.sql("""select url,
                    count(*) as n_registros,
                    sum(tamano_contenido) as total_transferencia
             from A
             group by url
             order By total_transferencia desc""").toPandas()

Unnamed: 0,url,n_registros,total_transferencia
0,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,2158,1639380464
1,/shuttle/missions/sts-69/count69.gif,24381,1005927794
2,/shuttle/missions/sts-69/movies/sts-69-rollbac...,729,512058235
3,/shuttle/technology/sts-newsref/stsref-toc.html,6516,493211198
4,/shuttle/missions/sts-69/movies/ws-animation-d...,1392,464050354
...,...,...,...
15332,/shuttle/technology/images/apu_mods-small.gif,3,0
15333,/SIUE/at_work.gif,1,0
15334,/apollo13,1,0
15335,/history/apollo-13/docs/,1,0


In [20]:
spark.sql("""select url,
                    tamano_contenido 
             from A 
             where url == '/shuttle/missions/sts-71/movies/sts-71-launch.mpg'""").toPandas()

Unnamed: 0,url,tamano_contenido
0,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,1121554
1,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,147456
2,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,1121554
3,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,81920
4,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,1121554
...,...,...
2153,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,1121554
2154,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,1121554
2155,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,1121554
2156,/shuttle/missions/sts-71/movies/sts-71-launch.mpg,1121554


### Además, queremos saber que recurso de nuestra web es el que más tráfico recibe. Es decir, el recurso con más registros en nuestro log.

#### PySpark

In [21]:
(df.
 groupBy("url")
 .count().orderBy("count"
                  ,ascending = False)
 .select("url",F.col("count").alias("Nº de Registros"))
 .show(1,truncate = False)
)

+--------------------------+---------------+
|url                       |Nº de Registros|
+--------------------------+---------------+
|/images/NASA-logosmall.gif|97384          |
+--------------------------+---------------+
only showing top 1 row



#### SQL

In [22]:
spark.sql("""select url,
                    count(*) as n_registros
             from A
             group by url
             order By n_registros desc
             limit 1""").toPandas()

Unnamed: 0,url,n_registros
0,/images/NASA-logosmall.gif,97384


### ¿Qué días la web recibió más tráfico?

#### PySpark

In [23]:
from pyspark.sql.functions import to_date
df1 = (df.withColumn("fecha",to_date(F.col("fecha_hora")))
      .withColumn("Hora",F.substring("fecha_hora",12,15)))
df1.cache()

DataFrame[ip: string, cliente: string, id_usuario: string, fecha_hora: timestamp, metodo: string, url: string, protocolo: string, cod_respuesta: bigint, tamano_contenido: bigint, fecha: date, Hora: string]

In [24]:
(df1.write
  .mode('overwrite') # or append  
  .format('parquet') # this is optional, parquet is default
  .option('path', "Resultados/df1_parquet")
  .save())

In [25]:
df1.groupby("fecha").count().orderBy("count",ascending = False).show(5)

+----------+-----+
|     fecha|count|
+----------+-----+
|1995-08-31|89825|
|1995-08-30|80299|
|1995-08-29|67984|
|1995-08-11|61242|
|1995-08-10|61237|
+----------+-----+
only showing top 5 rows



#### SQL

In [26]:
(spark
   .read.parquet("Resultados/df1_parquet")
   .createOrReplaceTempView("B"))

In [27]:
spark.sql("""select fecha,
                    count("fecha") as count
             from B
             group by fecha
             order By count desc
             limit 5
             """).toPandas()

Unnamed: 0,fecha,count
0,1995-08-31,89825
1,1995-08-30,80299
2,1995-08-29,67984
3,1995-08-11,61242
4,1995-08-10,61237


### ¿Cuáles son los hosts son los más frecuentes?

#### PySpark

In [28]:
(df
 .groupby("ip")
 .count()
 .orderBy("count",ascending=False)
 .show(5))

+--------------------+-----+
|                  ip|count|
+--------------------+-----+
|  edams.ksc.nasa.gov| 6530|
|piweba4y.prodigy.com| 4846|
|        163.206.89.4| 4791|
|piweba5y.prodigy.com| 4607|
|piweba3y.prodigy.com| 4416|
+--------------------+-----+
only showing top 5 rows



#### SQL

In [29]:
spark.sql("""select ip,
                    count(*) as count
             from B
             group by ip
             order by count desc
             limit 5""").toPandas()

Unnamed: 0,ip,count
0,edams.ksc.nasa.gov,6530
1,piweba4y.prodigy.com,4846
2,163.206.89.4,4791
3,piweba5y.prodigy.com,4607
4,piweba3y.prodigy.com,4416


### ¿A qué horas se produce el mayor número de tráfico en la web?

#### PySpark

In [30]:
df1.groupby("Hora").count().orderBy("count",ascending = False).show(5)

+--------+-----+
|    Hora|count|
+--------+-----+
|14:28:00|   63|
|13:34:33|   60|
|15:30:46|   59|
|15:26:48|   57|
|15:48:25|   56|
+--------+-----+
only showing top 5 rows



### ¿Cuál es el número de errores 404 que ha habido cada día?

#### PySpark

In [31]:
(df1.where(F.col("cod_respuesta") == "404")
 .groupby("fecha")
 .count()
 .orderBy("count",ascending = False)
 .show(5))

+----------+-----+
|     fecha|count|
+----------+-----+
|1995-08-30|  567|
|1995-08-07|  532|
|1995-08-31|  526|
|1995-08-29|  420|
|1995-08-24|  420|
+----------+-----+
only showing top 5 rows



In [32]:
(df1.where((F.col("cod_respuesta") == "404") & (F.col("fecha") == '1995-08-01')).count())

243

In [33]:
(df1.where((F.col("cod_respuesta") == "404") & (F.col("fecha") == '1995-08-30')).count())

567

#### SQL

In [34]:
spark.sql("""select fecha,
                    count(*) as count
             from B
             where cod_respuesta == '404'
             group by fecha
             order By count desc
             limit 5
             """).toPandas()

Unnamed: 0,fecha,count
0,1995-08-30,567
1,1995-08-07,532
2,1995-08-31,526
3,1995-08-29,420
4,1995-08-24,420


In [35]:
df.unpersist()
df1.unpersist()

DataFrame[ip: string, cliente: string, id_usuario: string, fecha_hora: timestamp, metodo: string, url: string, protocolo: string, cod_respuesta: bigint, tamano_contenido: bigint, fecha: date, Hora: string]