In [54]:
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType};

def parseApacheLogLine(logline: String): Row = {

    val pattern = raw"""^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\d+|-)""".r
        
    logline match {
      case pattern(first, second, third, fourth, fith, sixed, seventh, eigth, nined) => 
            var size = 0
            if (nined == "-") {size = 0} else {size = nined.toInt}
            return(Row(first, second, third, fourth, fith, sixed, seventh, eigth.toInt, size))
      case _ => return(Row())
    }
}

import org.apache.spark.sql._
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
parseApacheLogLine: (logline: String)org.apache.spark.sql.Row


In [55]:
var logs = sc.textFile("access_log_Aug95")
                .map(parseApacheLogLine).filter(_ .length > 0).cache()                  

logs: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[93] at filter at <console>:68


In [58]:
val schema = StructType(
  List(
    StructField("ip", StringType, nullable=false),
    StructField("cliente", StringType, nullable=false),
    StructField("id_usuario", StringType, nullable=false),
    StructField("fecha_hora", StringType, nullable=false),
    StructField("metodo", StringType, nullable=false),
    StructField("url", StringType, nullable=false),
    StructField("protocolo", StringType, nullable=false),
    StructField("cod_respuesta", IntegerType, nullable=false),
    StructField("tamano_contenido", IntegerType, nullable=false)      
  )
)

val df = spark.createDataFrame(logs,schema)

df.printSchema()

df.show(2)

root
 |-- ip: string (nullable = false)
 |-- cliente: string (nullable = false)
 |-- id_usuario: string (nullable = false)
 |-- fecha_hora: string (nullable = false)
 |-- metodo: string (nullable = false)
 |-- url: string (nullable = false)
 |-- protocolo: string (nullable = false)
 |-- cod_respuesta: integer (nullable = false)
 |-- tamano_contenido: integer (nullable = false)

+-----------------+-------+----------+--------------------+------+--------------------+---------+-------------+----------------+
|               ip|cliente|id_usuario|          fecha_hora|metodo|                 url|protocolo|cod_respuesta|tamano_contenido|
+-----------------+-------+----------+--------------------+------+--------------------+---------+-------------+----------------+
|in24.inetnebr.com|      -|         -|01/Aug/1995:00:00...|   GET|/shuttle/missions...| HTTP/1.0|          200|            1839|
|  uplherc.upl.com|      -|         -|01/Aug/1995:00:00...|   GET|                   /| HTTP/1.0|      

schema: org.apache.spark.sql.types.StructType = StructType(StructField(ip,StringType,false), StructField(cliente,StringType,false), StructField(id_usuario,StringType,false), StructField(fecha_hora,StringType,false), StructField(metodo,StringType,false), StructField(url,StringType,false), StructField(protocolo,StringType,false), StructField(cod_respuesta,IntegerType,false), StructField(tamano_contenido,IntegerType,false))
df: org.apache.spark.sql.DataFrame = [ip: string, cliente: string ... 7 more fields]


#### Guardaremos nuestro nuevo DataFrame ya estructurado en formato parquet. Y de este leeremos para realizar nuestro análisis.

In [69]:
df.write.parquet("Resultados_Scala/df_parquet")

In [70]:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._

sqlContext.read.parquet("Resultados_Scala/df_parquet").createOrReplaceTempView("A")

sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@7f3f119
import sqlContext.implicits._


In [72]:
sqlContext.sql("""select * from A limit 2""").show()

+--------------------+-------+----------+--------------------+------+--------------------+---------+-------------+----------------+
|                  ip|cliente|id_usuario|          fecha_hora|metodo|                 url|protocolo|cod_respuesta|tamano_contenido|
+--------------------+-------+----------+--------------------+------+--------------------+---------+-------------+----------------+
|www-c2.proxy.aol.com|      -|         -|14/Aug/1995:21:51...|   GET|/images/lc39a-log...| HTTP/1.0|          200|           13116|
|  rbroome.pr.mcs.net|      -|         -|14/Aug/1995:21:51...|   GET|/images/NASA-logo...| HTTP/1.0|          304|               0|
+--------------------+-------+----------+--------------------+------+--------------------+---------+-------------+----------------+



# Analisis

### ¿Cuáles son los distintos protocolos web utilizados? Agrúpalos.

#### Scala

In [80]:
df
 .groupBy("protocolo")
 .count().sql("""select protocolo,
                    count(protocolo) as n_registros
             from A 
             where protocolo != ''
             group by protocolo
             order by n_registros desc
             
             """).show()
 .orderBy(desc("count"))
 .where($"protocolo" !== "")
 .show()

+---------+-------+
|protocolo|  count|
+---------+-------+
| HTTP/1.0|1566967|
|HTTP/V1.0|    163|
|        a|      1|
+---------+-------+



#### SQL

In [81]:
sqlContext.sql("""select protocolo,
                    count(protocolo) as n_registros
             from A 
             where protocolo != ''
             group by protocolo
             order by n_registros desc
             
             """).show()

+---------+-----------+
|protocolo|n_registros|
+---------+-----------+
| HTTP/1.0|    1566967|
|HTTP/V1.0|        163|
|        a|          1|
+---------+-----------+



### ¿Cuáles son los códigos de estado más comunes en la web? Agrúpalos y ordénalos para ver cuál es el más común.

In [85]:
df
 .groupBy("cod_respuesta")
 .count()
 .orderBy(desc("count"))
 .show()

+-------------+-------+
|cod_respuesta|  count|
+-------------+-------+
|          200|1398198|
|          304| 134138|
|          302|  26437|
|          404|  10020|
|          403|    171|
|          501|     27|
|          500|      3|
+-------------+-------+



### ¿Y los métodos de petición (verbos) más utilizados?

In [114]:
df
 .groupBy("metodo")
 .count()
 .orderBy("metodo")
 .show()

+------+-------+
|metodo|  count|
+------+-------+
|   GET|1564918|
|  HEAD|   3965|
|  POST|    111|
+------+-------+



### ¿Qué recurso tuvo la mayor transferencia de bytes de la página web?

In [133]:
df
 .groupBy("url")
 .agg(count("url").alias("n_registros"),
      sum("tamano_contenido").alias("total_transferencia"))
 .orderBy(desc("total_transferencia"))
 .select("url",
         "n_registros",
         "total_transferencia")
 .show(2,truncate = false)
 

+-------------------------------------------------+-----------+-------------------+
|url                                              |n_registros|total_transferencia|
+-------------------------------------------------+-----------+-------------------+
|/shuttle/missions/sts-71/movies/sts-71-launch.mpg|2158       |1639380464         |
|/shuttle/missions/sts-69/count69.gif             |24381      |1005927794         |
+-------------------------------------------------+-----------+-------------------+
only showing top 2 rows



### Además, queremos saber que recurso de nuestra web es el que más tráfico recibe. Es decir, el recurso con más registros en nuestro log.

In [159]:
df
 .groupBy("url")
 .count()
 .orderBy(desc("count"))
 .select("url","count")
 .show(1,truncate = false)

+--------------------------+-----+
|url                       |count|
+--------------------------+-----+
|/images/NASA-logosmall.gif|97383|
+--------------------------+-----+
only showing top 1 row



### ¿Qué días la web recibió más tráfico?

In [156]:
val df1 = df
            .withColumn("fecha",to_date($"fecha_hora","MM-dd-yyyy"))
            .withColumn("Hora",substring($"fecha_hora",13,8))
df1.cache().show(5)

+-----------------+-------+----------+--------------------+------+--------------------+---------+-------------+----------------+-----+--------+
|               ip|cliente|id_usuario|          fecha_hora|metodo|                 url|protocolo|cod_respuesta|tamano_contenido|fecha|    Hora|
+-----------------+-------+----------+--------------------+------+--------------------+---------+-------------+----------------+-----+--------+
|in24.inetnebr.com|      -|         -|01/Aug/1995:00:00...|   GET|/shuttle/missions...| HTTP/1.0|          200|            1839| null|00:00:01|
|  uplherc.upl.com|      -|         -|01/Aug/1995:00:00...|   GET|                   /| HTTP/1.0|          304|               0| null|00:00:07|
|  uplherc.upl.com|      -|         -|01/Aug/1995:00:00...|   GET|/images/ksclogo-m...| HTTP/1.0|          304|               0| null|00:00:08|
|  uplherc.upl.com|      -|         -|01/Aug/1995:00:00...|   GET|/images/MOSAIC-lo...| HTTP/1.0|          304|               0| null|00

df1: org.apache.spark.sql.DataFrame = [ip: string, cliente: string ... 9 more fields]


### ¿Cuáles son los hosts son los más frecuentes?

In [160]:
df
 .groupBy("ip")
 .count()
 .orderBy(desc("count"))
 .show(5)

+--------------------+-----+
|                  ip|count|
+--------------------+-----+
|  edams.ksc.nasa.gov| 6530|
|piweba4y.prodigy.com| 4846|
|        163.206.89.4| 4791|
|piweba5y.prodigy.com| 4607|
|piweba3y.prodigy.com| 4416|
+--------------------+-----+
only showing top 5 rows



### ¿A qué horas se produce el mayor número de tráfico en la web?

In [162]:
df1.groupBy("Hora").count().orderBy(desc("count")).show(5)

+--------+-----+
|    Hora|count|
+--------+-----+
|14:28:00|   63|
|13:34:33|   60|
|15:30:46|   59|
|15:26:48|   57|
|15:48:25|   56|
+--------+-----+
only showing top 5 rows



### ¿Cuál es el número de errores 404 que ha habido cada día?

In [165]:
df1.where($"cod_respuesta" === "404")
 .groupBy("fecha")
 .count()
 .orderBy(desc("count"))
 .show(5)

+-----+-----+
|fecha|count|
+-----+-----+
| null|10020|
+-----+-----+



In [167]:
df1.where($"cod_respuesta" === "404").count()

res127: Long = 10020
