# Seminario de Estadística II - Tarea 1 Parte 2

**Integrantes del equipo:**
* Azpeitia Medina Samuel
* Castro Pérez Juan Antonio
* Rodríguez Rodríguez Donovan Zuriel

Primero vamos a configurar nuestro espacio de trabajo en Unity Catalog usando SQL. Con esto nos aseguramos de tener listos el catálogo, la base de datos y el volumen donde estarán los archivos que vamos a usar a lo largo del ejercicio.

In [0]:
%sql
CREATE CATALOG IF NOT EXISTS dev;
CREATE DATABASE IF NOT EXISTS dev.ciencias_data;
CREATE VOLUME IF NOT EXISTS dev.ciencias_data.session_data;


### 1. Cree una tabla bronce en formato delta y particionado por hora para session_part1.csv.

**Solución:** 
Para este primer punto vamos a leer el archivo csv original indicando que el separador es el pipe `|`. Después, crearemos una columna nueva llamada `part_hour` que extraiga la fecha y la hora del campo `timestamp` para usarla como nuestra partición. Finalmente, guardamos el dataframe como una tabla en formato delta dentro de nuestra base de datos.

In [0]:
import ast
import re
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Leemos el archivo csv de la primera parte
df_part1 = spark.read.format("csv").option("sep", "|").option("header", "true").load("/Volumes/dev/ciencias_data/session_data/sessions_part1.csv")

# Creamos la columna para particionar por hora
df_bronce = df_part1.withColumn("part_hour", F.date_format(F.to_timestamp("timestamp"), "yyyy-MM-dd-HH"))

# Guardamos la tabla particionada en formato delta
df_bronce.write.format("delta").mode("overwrite").option("overwriteSchema", "true").partitionBy("part_hour").saveAsTable("dev.ciencias_data.bronze_sessions")

# Visualizamos la tabla bronce
df_bronce.limit(20).display()

data,timestamp,part_hour
"{""dstDataBytes"": 407, ""dstBytes"": 1522, ""packetLen"": [0, 82, 82, 82, 82, 76, 76, 76, 76, 708, 708, 76, 76, 322, 322, 76, 76, 76, 76, 225, 225, 76, 76, 76, 76, 76, 76, 76, 76], ""srcPort"": 56349, ""totPackets"": 28, ""packetPos"": [-509959, 4459307263, 4459314230, 4459570254, 4459583364, 4459779909, 4459781695, 4459782151, 4459783915, 4459784143, 4459788224, 4460109395, 4460119039, 4461092274, 4461098178, 4462286446, 4462288188, 4464062765, 4464064371, 4464064447, 4464066284, 4464159634, 4464159710, 4464163243, 4464163319, 4464167891, 4464170139, 4464778857, 4464793098], ""srcPayload8"": ""474554202f617869"", ""segmentCnt"": 1, ""srcPackets"": 16, ""protocol"": [""http"", ""tcp""], ""lastPacket"": 1585023572472, ""dstPort"": 10003, ""dstASN"": ""AS17072 TOTAL PLAY TELECOMUNICACIONES SA DE CV"", ""communityId"": ""1:d0ANsAeV6L6uP/gD4Xp2iK52CUM="", ""dstPayload8"": ""485454502f312e31"", ""timestamp"": 1585023578003, ""srcBytes"": 2236, ""dstMacCnt"": 1, ""initRTT"": 14, ""srcIp"": ""192.151.112.163"", ""dstGEO"": ""MX"", ""firstPacket"": 1585023572196, ""srcDataBytes"": 638, ""dstMac"": [""00:09:0f:09:02:08""], ""length"": 275, ""srcMacCnt"": 1, ""totDataBytes"": 1045, ""ipProtocol"": 6, ""node"": ""localhost"", ""dstPackets"": 12, ""tcpflags"": {""rst"": 0, ""psh"": 6, ""dstZero"": 0, ""ack"": 14, ""syn"": 2, ""fin"": 4, ""urg"": 0, ""srcZero"": 0, ""syn-ack"": 2}, ""http"": {""serverVersion"": [""1.1""], ""bodyMagic"": [""text/plain""], ""uriCnt"": 1, ""methodCnt"": 1, ""useragent"": [""HTTP Image Reader""], ""responseHeader"": [""content-type"", ""authentication-info"", ""accept-ranges"", ""date"", ""connection""], ""clientVersion"": [""1.1""], ""statuscodeCnt"": 1, ""statuscode"": [200], ""md5Cnt"": 1, ""path"": [""/axis-cgi/admin/param.cgi""], ""authTypeCnt"": 1, ""pathCnt"": 1, ""keyCnt"": 3, ""host"": [""189.203.246.14""], ""serverVersionCnt"": 1, ""authType"": [""digest""], ""hostCnt"": 1, ""bodyMagicCnt"": 1, ""useragentCnt"": 1, ""key"": [""usergroup"", ""group"", ""action""], ""method"": [""GET""], ""userCnt"": 1, ""uri"": [""189.203.246.14/axis-cgi/admin/param.cgi?usergroup=admin&action=list&group=Brand.ProdNbr,Brand.ProdFullName,Network.eth0.MACAddress,Properties.Firmware.Version""], ""requestHeaderCnt"": 4, ""clientVersionCnt"": 1, ""requestHeader"": [""user-agent"", ""connection"", ""host"", ""authorization""], ""responseHeaderCnt"": 5, ""user"": [""root""], ""md5"": [""baf009daeea7b16fb8811fdd8b3c0197""]}, ""protocolCnt"": 2, ""totBytes"": 3758, ""srcMac"": [""00:08:e3:ff:fc:28""], ""dstIp"": ""189.203.246.14"", ""srcGEO"": ""US"", ""fileId"": [509959]}",2020-03-23 22:19:38.003,2020-03-23-22
"{""dstDataBytes"": 0, ""srcASN"": ""AS18734 Operbes, S.A. de C.V."", ""dstBytes"": 132, ""packetLen"": [0, 82, 82, 82, 82, 82, 82], ""srcPort"": 41240, ""totPackets"": 6, ""packetPos"": [-509815, 10777144, 10777226, 15088073, 15088155, 15110048, 15112068], ""segmentCnt"": 1, ""srcPackets"": 4, ""protocol"": [""tcp""], ""lastPacket"": 1585009201984, ""dstPort"": 443, ""dstASN"": ""AS6453 TATA COMMUNICATIONS (AMERICA) INC"", ""communityId"": ""1:3VSgW21ZKK4G8tQ8b6qs37ZDSpc="", ""timestamp"": 1585009207002, ""srcBytes"": 264, ""dstMacCnt"": 1, ""srcIp"": ""201.140.104.57"", ""dstGEO"": ""CA"", ""firstPacket"": 1585009201920, ""srcDataBytes"": 0, ""dstMac"": [""2c:21:72:d0:78:f6""], ""length"": 63, ""srcMacCnt"": 1, ""totDataBytes"": 0, ""ipProtocol"": 6, ""node"": ""localhost"", ""dstPackets"": 2, ""tcpflags"": {""rst"": 0, ""psh"": 0, ""dstZero"": 0, ""ack"": 2, ""syn"": 0, ""fin"": 4, ""urg"": 0, ""srcZero"": 0, ""syn-ack"": 0}, ""protocolCnt"": 1, ""totBytes"": 396, ""srcMac"": [""44:2b:03:53:d7:80""], ""dstIp"": ""66.110.49.32"", ""srcGEO"": ""MX"", ""fileId"": [509815]}",2020-03-23 18:20:07.002,2020-03-23-18
"{""dstDataBytes"": 0, ""srcASN"": ""AS132203 Tencent Building, Kejizhongyi Avenue"", ""dstBytes"": 212, ""packetLen"": [0, 122, 122, 122, 122], ""srcPort"": 0, ""totPackets"": 4, ""packetPos"": [-509957, 5130581059, 5130581181, 5130581803, 5130581925], ""icmp"": {""code"": [0], ""type"": [8, 0]}, ""segmentCnt"": 1, ""srcPackets"": 2, ""protocol"": [""icmp""], ""lastPacket"": 1585023568783, ""dstPort"": 0, ""dstASN"": ""AS18734 Operbes, S.A. de C.V."", ""timestamp"": 1585023579005, ""srcBytes"": 212, ""dstMacCnt"": 1, ""srcIp"": ""49.51.84.154"", ""dstGEO"": ""MX"", ""firstPacket"": 1585023568783, ""srcDataBytes"": 0, ""dstMac"": [""44:2b:03:53:d7:80""], ""length"": 0, ""srcMacCnt"": 1, ""totDataBytes"": 0, ""ipProtocol"": 1, ""node"": ""localhost"", ""dstPackets"": 2, ""protocolCnt"": 1, ""totBytes"": 424, ""srcMac"": [""2c:21:72:d0:78:f6""], ""dstIp"": ""201.140.104.1"", ""srcGEO"": ""US"", ""fileId"": [509957]}",2020-03-23 22:19:39.005,2020-03-23-22
"{""dstDataBytes"": 0, ""dstBytes"": 120, ""packetLen"": [0, 76, 76, 76, 76, 76, 76], ""srcPort"": 49732, ""totPackets"": 6, ""packetPos"": [-509816, 41607922, 41608074, 41609397, 41609783, 41624790, 41633411], ""segmentCnt"": 1, ""srcPackets"": 4, ""protocol"": [""tcp""], ""lastPacket"": 1585009202730, ""dstPort"": 443, ""dstASN"": ""AS8075 Microsoft Corporation"", ""communityId"": ""1:5mujtPj1V3BQr5dQiLmclZ5J2fw="", ""timestamp"": 1585009208001, ""srcBytes"": 240, ""dstMacCnt"": 1, ""srcIp"": ""10.33.130.66"", ""dstGEO"": ""US"", ""firstPacket"": 1585009202729, ""srcDataBytes"": 0, ""dstMac"": [""00:09:0f:09:02:08""], ""length"": 0, ""srcMacCnt"": 1, ""totDataBytes"": 0, ""ipProtocol"": 6, ""node"": ""localhost"", ""dstPackets"": 2, ""tcpflags"": {""rst"": 0, ""psh"": 0, ""dstZero"": 0, ""ack"": 2, ""syn"": 0, ""fin"": 4, ""urg"": 0, ""srcZero"": 0, ""syn-ack"": 0}, ""protocolCnt"": 1, ""totBytes"": 360, ""srcMac"": [""00:08:e3:ff:fc:28""], ""dstIp"": ""52.109.2.20"", ""fileId"": [509816]}",2020-03-23 18:20:08.001,2020-03-23-18
"{""dstDataBytes"": 172, ""dstBytes"": 188, ""packetLen"": [0, 94, 94, 110, 110], ""srcPort"": 52806, ""totPackets"": 4, ""packetPos"": [-509957, 4748727799, 4748727893, 4748914341, 4748914451], ""srcPayload8"": ""04d2010000010000"", ""segmentCnt"": 1, ""srcPackets"": 2, ""protocol"": [""udp"", ""dns""], ""lastPacket"": 1585023548679, ""dstPort"": 53, ""dstASN"": ""AS15169 Google LLC"", ""communityId"": ""1:TKvfxJDM61MS9n6wX6J59Pcent4="", ""dstPayload8"": ""04d2818000010001"", ""timestamp"": 1585023579005, ""srcBytes"": 156, ""dstMacCnt"": 1, ""srcIp"": ""192.168.1.2"", ""dstGEO"": ""US"", ""firstPacket"": 1585023548670, ""srcDataBytes"": 140, ""dstMac"": [""f4:c6:13:e7:b2:20""], ""length"": 8, ""srcMacCnt"": 1, ""dns"": {""qt"": [""A""], ""ip"": [""198.41.0.4""], ""qtCnt"": 1, ""opcodeCnt"": 1, ""opcode"": [""QUERY""], ""statusCnt"": 1, ""GEO"": [""US""], ""qcCnt"": 1, ""qc"": [""IN""], ""host"": [""a.root-servers.net""], ""ipCnt"": 1, ""RIR"": [null], ""ASN"": [""AS20172 VeriSign Global Registry Services""], ""hostCnt"": 1, ""status"": [""NOERROR""]}, ""totDataBytes"": 312, ""ipProtocol"": 17, ""node"": ""localhost"", ""dstPackets"": 2, ""protocolCnt"": 2, ""totBytes"": 344, ""srcMac"": [""60:e3:27:46:10:29""], ""dstIp"": ""8.8.8.8"", ""fileId"": [509957]}",2020-03-23 22:19:39.005,2020-03-23-22
"{""dstDataBytes"": 0, ""srcASN"": ""AS8151 Uninet S.A. de C.V."", ""dstBytes"": 360, ""packetLen"": [0, 101, 101, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76], ""srcPort"": 50271, ""totPackets"": 14, ""packetPos"": [-509816, 66293772, 66293873, 66303940, 66304118, 66308498, 66308676, 66313136, 66313212, 66342584, 66347035, 67331074, 67331150, 67386764, 67386840], ""segmentCnt"": 1, ""srcPackets"": 8, ""protocol"": [""tcp""], ""lastPacket"": 1585009203407, ""dstPort"": 443, ""dstASN"": ""AS18734 Operbes, S.A. de C.V."", ""communityId"": ""1:iyvKid86f+nBUU69sTL3SfLKngA="", ""timestamp"": 1585009209002, ""srcBytes"": 530, ""dstMacCnt"": 1, ""srcIp"": ""187.151.20.154"", ""dstGEO"": ""MX"", ""firstPacket"": 1585009203382, ""srcDataBytes"": 0, ""dstMac"": [""44:2b:03:53:d7:80""], ""length"": 24, ""srcMacCnt"": 1, ""totDataBytes"": 0, ""ipProtocol"": 6, ""node"": ""localhost"", ""dstPackets"": 6, ""tcpflags"": {""rst"": 2, ""psh"": 2, ""dstZero"": 0, ""ack"": 6, ""syn"": 0, ""fin"": 4, ""urg"": 0, ""srcZero"": 0, ""syn-ack"": 0}, ""protocolCnt"": 1, ""totBytes"": 890, ""srcMac"": [""2c:21:72:d0:78:f6""], ""dstIp"": ""201.140.104.13"", ""srcGEO"": ""MX"", ""fileId"": [509816]}",2020-03-23 18:20:09.002,2020-03-23-18
"{""dstDataBytes"": 83, ""dstBytes"": 91, ""packetLen"": [0, 103, 107], ""srcPort"": 41517, ""totPackets"": 2, ""packetPos"": [-509959, 4002064563, 4002071340], ""srcPayload8"": ""302b020101040661"", ""segmentCnt"": 1, ""srcPackets"": 1, ""protocol"": [""udp"", ""snmp""], ""lastPacket"": 1585023548542, ""dstPort"": 161, ""communityId"": ""1:rxWvyYQ2zsmIc5m8r39ri1npg98="", ""dstPayload8"": ""302f020101040661"", ""timestamp"": 1585023579005, ""srcBytes"": 87, ""dstMacCnt"": 1, ""srcIp"": ""10.10.16.197"", ""firstPacket"": 1585023548542, ""srcDataBytes"": 79, ""dstMac"": [""00:c8:8b:d3:ba:cd""], ""length"": 0, ""srcMacCnt"": 1, ""totDataBytes"": 162, ""ipProtocol"": 17, ""node"": ""localhost"", ""dstPackets"": 1, ""protocolCnt"": 2, ""totBytes"": 178, ""srcMac"": [""00:08:e3:ff:fc:28""], ""dstIp"": ""10.33.255.98"", ""fileId"": [509959]}",2020-03-23 22:19:39.005,2020-03-23-22
"{""dstDataBytes"": 5251, ""dstBytes"": 11360, ""packetLen"": [0, 82, 82, 76, 76, 587, 76, 1458, 1530, 1318, 76, 1458, 1530, 1318, 76, 1225, 76, 1225, 76, 196, 76, 196, 76, 76, 76, 76, 76, 76, 76], ""cert"": [{""issuerCN"": [""digicert global root ca""], ""subjectON"": [""DigiCert Inc""], ""notAfter"": 1825503825000, ""serial"": ""0546fe1823f7e1941da39fce14c46173"", ""remainingDays"": 2783, ""issuerON"": [""DigiCert Inc""], ""hash"": ""7c:cc:2a:87:e3:94:9f:20:57:2b:18:48:29:80:50:5f:a9:0c:ac:3b"", ""subjectCN"": [""geotrust rsa ca 2018""], ""notBefore"": 1509971025000, ""validDays"": 3652}, {""issuerCN"": [""geotrust rsa ca 2018""], ""subjectON"": [""Webedia S.A.""], ""notAfter"": 1593345600000, ""serial"": ""02ae8e830b4f47fe2b67b35617521352"", ""alt"": [""web.crea.acsta.net"", ""admin.tastelyapp.com"", ""m.hm.tudogostoso.com.br"", ""tr.web.img3.acsta.net"", ""api.origin.tudogostoso.com.br"", ""fr.web.img2.acsta.net"", ""api.sensacine.com"", ""hm.tudogostoso.com.br"", ""www.tudogostoso.com.br"", ""api.tudogostoso.com.br"", ""br.web.img2.acsta.net"", ""blog.hm.tudogostoso.com.br"", ""www.segredosdesalao.com.br"", ""www.allocine.fr"", ""www.conquistesuavida.com.br"", ""tr.web.img1.acsta.net"", ""admin.tudogostoso.com.br"", ""m.origin.tudogostoso.com.br"", ""www.purepeople.com.br"", ""amp.purepeople.com.br"", ""br.web.img3.acsta.net"", ""de.web.img3.acsta.net"", ""www.sensacine.lat"", ""dermaclub.com.br"", ""amp.tudogostoso.com.br"", ""api.adorocinema.com"", ""www.sensacine.com.mx"", ""www.sorrisologia.com.br"", ""www.illicofresco.com"", ""tr.web.img2.acsta.net"", ""br.web.img4.acsta.net"", ""www.casapraticaqualita.com.br"", ""blog.tudogostoso.com.br"", ""www.sensacine.cl"", ""api.filmstarts.de"", ""br.web.img1.acsta.net"", ""amp.hm.tudogostoso.com.br"", ""www.purebreak.com.br"", ""api.webediamovies.pro"", ""api.tastelyapp.com"", ""amp.purebreak.com.br"", ""es.web.img3.acsta.net"", ""api.beyazperde.com"", ""de.web.img4.acsta.net"", ""www.tastelyapp.com"", ""www.sensacine.com"", ""de.web.img2.acsta.net"", ""www.sensacine.com.co"", ""webedia.mgr.consensu.org"", ""www.semprepronta.com"", ""www.webediamovies.pro"", ""www.millenium.us.org"", ""semprepronta.com"", ""fr.web.img3.acsta.net"", ""tr.web.img4.acsta.net"", ""es.web.img1.acsta.net"", ""www.preparadopravaler.com.br"", ""fr.web.img4.acsta.net"", ""www.sensacine.com.ar"", ""api.allocine.fr"", ""origin.tudogostoso.com.br"", ""de.web.img1.acsta.net"", ""cmp.webedia.mgr.consensu.org"", ""www.alemdasuperficie.org"", ""m.tudogostoso.com.br"", ""api.sensacine.com.mx"", ""fr.web.img1.acsta.net"", ""es.web.img2.acsta.net"", ""es.web.img4.acsta.net"", ""www.dermaclub.com.br""], ""remainingDays"": 96, ""issuerON"": [""DigiCert Inc""], ""altCnt"": 70, ""hash"": ""da:72:ca:ba:d8:94:0c:f1:49:7c:44:64:dd:23:8c:9d:3a:22:ae:84"", ""subjectCN"": [""web.crea.acsta.net""], ""notBefore"": 1562889600000, ""validDays"": 352}], ""srcPort"": 54867, ""totPackets"": 28, ""packetPos"": [-509815, 76540573, 76552643, 76554752, 76558624, 76621651, 76646109, 80683235, 80684693, 80686223, 80744454, 80810905, 80812967, 80814877, 80823531, 80826955, 80829636, 80862580, 80879126, 81004125, 81004599, 81010761, 81011235, 81013814, 81013890, 81018745, 81030370, 81030446, 81031171], ""srcPayload8"": ""1603010200010001"", ""segmentCnt"": 1, ""srcPackets"": 13, ""protocol"": [""tls"", ""tcp""], ""lastPacket"": 1585009203359, ""dstPort"": 443, ""dstASN"": ""AS16625 Akamai Technologies, Inc."", ""communityId"": ""1:HFICA9UWiHl1zHrTVhrKXBBcqyA="", ""dstPayload8"": ""160303004e020000"", ""timestamp"": 1585009209004, ""srcBytes"": 1537, ""dstMacCnt"": 1, ""initRTT"": 0, ""srcIp"": ""10.33.224.109"", ""dstGEO"": ""US"", ""firstPacket"": 1585009203286, ""srcDataBytes"": 643, ""dstMac"": [""00:09:0f:09:02:08""], ""length"": 73, ""srcMacCnt"": 1, ""tagsCnt"": 1, ""totDataBytes"": 5894, ""ipProtocol"": 6, ""tags"": [""acked-unseen-segment-dst""], ""node"": ""localhost"", ""dstPackets"": 15, ""tcpflags"": {""rst"": 0, ""psh"": 9, ""dstZero"": 0, ""ack"": 13, ""syn"": 1, ""fin"": 4, ""urg"": 0, ""srcZero"": 0, ""syn-ack"": 1}, ""http"": {""host"": [""www.sensacine.com.mx""], ""hostCnt"": 1}, ""protocolCnt"": 2, ""totBytes"": 12897, ""srcMac"": [""00:08:e3:ff:fc:28""], ""tls"": {""cipher"": [""TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384""], ""srcSessionId"": [""2728bdc70277c1b6aa8070eaa358b4b45a55c03a1b1d8899f1bfa81b2e4bd6e4""], ""cipherCnt"": 1, ""ja3s"": [""0aaef804a3cc5ed491dc0bb3e151a547""], ""versionCnt"": 1, ""ja3sCnt"": 1, ""version"": [""TLSv1.2""], ""ja3Cnt"": 1, ""ja3"": [""66918128f1b9b03303d77c6f2eefd128""]}, ""dstIp"": ""23.3.183.64"", ""certCnt"": 2, ""fileId"": [509815]}",2020-03-23 18:20:09.004,2020-03-23-18
"{""dstDataBytes"": 0, ""srcASN"": ""AS8151 Uninet S.A. de C.V."", ""dstBytes"": 32146, ""packetLen"": [0, 739, 739, 482, 482, 82, 82, 767, 767, 82, 82, 1430, 1430, 1430, 1430, 1430, 1430, 1430, 1430, 82, 82, 82, 82, 82, 82, 1430, 1430, 1430, 1430, 1430, 1430, 1430, 254, 1430, 254, 82, 82, 82, 82, 82, 82, 718, 718, 82, 82, 1430, 1430, 1430, 1430, 1047, 1047, 82, 82, 82, 82, 82, 82, 82, 82], ""srcPort"": 58906, ""totPackets"": 58, ""packetPos"": [-509957, 2777173177, 2777173916, 2777230404, 2777230886, 2779125432, 2779127310, 2921600208, 2921600975, 2922578516, 2922580128, 2922938554, 2922939984, 2922942944, 2922944374, 2922945804, 2922947234, 2922948664, 2922950094, 2925266832, 2925266914, 2925266996, 2925267078, 2925273356, 2925273438, 2925306883, 2925309843, 2925316141, 2925319101, 2925326929, 2925331419, 2925332849, 2925334279, 2925334533, 2925335963, 2928234671, 2928234753, 2928237895, 2928237977, 2928369941, 2928370023, 2929216078, 2929216796, 2929269617, 2929269699, 2929421610, 2929423040, 2929426000, 2929427430, 2929428860, 2929429907, 2932231130, 2932231212, 5234621475, 5234621557, 5234644514, 5234644596, 5236257060, 5236257224], ""segmentCnt"": 1, ""srcPackets"": 26, ""protocol"": [""tcp""], ""lastPacket"": 1585023574401, ""dstPort"": 80, ""dstASN"": ""AS18734 Operbes, S.A. de C.V."", ""communityId"": ""1:NJ8e8j8HvHgwJlga+4FUGq1e6AY="", ""timestamp"": 1585023580004, ""srcBytes"": 5672, ""dstMacCnt"": 1, ""srcIp"": ""187.237.231.13"", ""dstGEO"": ""MX"", ""firstPacket"": 1585023449623, ""srcDataBytes"": 0, ""dstMac"": [""44:2b:03:53:d7:80""], ""length"": 124778, ""srcMacCnt"": 1, ""totDataBytes"": 0, ""ipProtocol"": 6, ""node"": ""localhost"", ""dstPackets"": 32, ""tcpflags"": {""rst"": 0, ""psh"": 12, ""dstZero"": 0, ""ack"": 42, ""syn"": 0, ""fin"": 4, ""urg"": 0, ""srcZero"": 0, ""syn-ack"": 0}, ""protocolCnt"": 1, ""totBytes"": 37818, ""srcMac"": [""2c:21:72:d0:78:f6""], ""dstIp"": ""201.140.104.13"", ""srcGEO"": ""MX"", ""fileId"": [509957]}",2020-03-23 22:19:40.004,2020-03-23-22
"{""dstDataBytes"": 0, ""srcASN"": ""AS8151 Uninet S.A. de C.V."", ""dstBytes"": 296, ""packetLen"": [0, 121, 121, 90, 90, 90, 90, 90, 90, 90, 90], ""srcPort"": 55303, ""totPackets"": 10, ""packetPos"": [-509815, 150882764, 150883191, 150892405, 150892495, 150943076, 150943166, 150948380, 150948470, 163353892, 163354177], ""segmentCnt"": 1, ""srcPackets"": 6, ""protocol"": [""tcp""], ""lastPacket"": 1585009205316, ""dstPort"": 443, ""dstASN"": ""AS262922 Gobierno del Estado de Mexico"", ""communityId"": ""1:vk7v4g6Ol6PAz2ozXr8+IaQhn30="", ""timestamp"": 1585009211001, ""srcBytes"": 506, ""dstMacCnt"": 1, ""srcIp"": ""2806:104e:16:1d32:71e6:ee76:a9eb:64bf"", ""dstGEO"": ""MX"", ""firstPacket"": 1585009205014, ""srcDataBytes"": 0, ""dstMac"": [""44:2b:03:53:d7:80""], ""length"": 302, ""srcMacCnt"": 1, ""totDataBytes"": 0, ""ipProtocol"": 6, ""node"": ""localhost"", ""dstPackets"": 4, ""tcpflags"": {""rst"": 0, ""psh"": 2, ""dstZero"": 0, ""ack"": 2, ""syn"": 0, ""fin"": 6, ""urg"": 0, ""srcZero"": 0, ""syn-ack"": 0}, ""protocolCnt"": 1, ""totBytes"": 802, ""srcMac"": [""2c:21:72:d0:78:f6""], ""dstIp"": ""2801:c4:15:200::84"", ""srcGEO"": ""MX"", ""fileId"": [509815]}",2020-03-23 18:20:11.001,2020-03-23-18


### 2. El dataset de session part1.csv corresponden a datos de tráfico de red realizados por el sniffer Arkime, cada fila es una sesión. A continuación realice lo siguiente:

**Investigue el posible significado de cada campo y redacte un posible diccionario de datos.**

**Solución:**
Buscando información sobre el sniffer Arkime, armamos este diccionario de datos básico:
* **timestamp**: Es la fecha y hora exacta en la que se capturó la sesión.
* **srcIp / dstIp**: La dirección IP de origen (quien inició la conexión) y la de destino.
* **srcPort / dstPort**: Los puertos que se usaron para la comunicación.
* **srcMac / dstMac**: Arreglo con las direcciones MAC físicas por donde pasaron los datos.
* **protocol**: Arreglo de los protocolos de red identificados (por ejemplo: tcp, udp, dns).
* **totPackets / srcPackets / dstPackets**: El número de paquetes que se mandaron y recibieron en total.
* **totBytes**: El tamaño total de toda la sesión en bytes.
* **totDataBytes**: El tamaño en bytes pero solo de los datos útiles (sin contar encabezados de red).
* **firstPacket / lastPacket**: El momento (en epoch) en el que se vio el primer y el último paquete.
* **packetLen**: Un arreglo que dice cuánto pesó cada uno de los paquetes enviados.
* **srcGEO / dstGEO**: El país de donde viene y a donde va la conexión, sacado por la IP.
* **http**: Datos extra si la conexión fue a una página web, como el host o URL.

**Estructure la información en un dataframe donde cada columna corresponda una clave del json y cada fila son los valores del dataframe, puede ignorar las siguientes claves del json cert y packetPos**

**Solución:** 
Para extraer la información del JSON de forma segura y evitar que algún error de formato en el texto rompa el proceso, vamos a crear una función (UDF) apoyándonos en la librería `ast` de Python. 

Para cumplir con la instrucción de ignorar las claves `cert` y `packetPos`, lo que haremos será no incluirlas cuando definamos nuestro esquema (`StructType`). De esta manera, al momento en que PySpark aplane los datos, descartará automáticamente esos dos campos sin necesidad de borrarlos después.

In [0]:
import ast
from pyspark.sql.types import *
from pyspark.sql.functions import col, udf, explode, size

# Armamos el esquema sin poner cert ni packetPos
esquema_json = StructType([
    StructField("srcIp", StringType(), True),
    StructField("dstIp", StringType(), True),
    StructField("srcPort", LongType(), True),
    StructField("dstPort", LongType(), True),
    StructField("srcMac", ArrayType(StringType()), True),
    StructField("dstMac", ArrayType(StringType()), True),
    StructField("protocol", ArrayType(StringType()), True),
    StructField("totPackets", LongType(), True),
    StructField("srcPackets", LongType(), True),
    StructField("dstPackets", LongType(), True),
    StructField("totBytes", LongType(), True),
    StructField("srcBytes", LongType(), True),
    StructField("dstBytes", LongType(), True),
    StructField("totDataBytes", StringType(), True),
    StructField("firstPacket", LongType(), True),
    StructField("lastPacket", LongType(), True),
    StructField("srcGEO", StringType(), True),
    StructField("dstGEO", StringType(), True),
    StructField("packetLen", ArrayType(LongType()), True),
    StructField("http", StructType([StructField("host", ArrayType(StringType()), True)]), True)
])

esquema_salida = ArrayType(esquema_json)

# Funcion para parsear el json
def parseo_seguro(data_str):
    if data_str is None:
        return[]
    try:
        parsed = ast.literal_eval(data_str)
        if isinstance(parsed, list):
            return parsed
        elif isinstance(parsed, dict):
            return [parsed]
        return[]
    except Exception as e:
        return[]

udf_parseo = udf(parseo_seguro, esquema_salida)

# Aplicamos la funcion a la tabla bronce que hicimos en el paso 1
df_parseado = df_bronce.withColumn("data_parsed", udf_parseo(col("data")))
df_parseado = df_parseado.filter(size(col("data_parsed")) > 0)
df_explotado = df_parseado.select("timestamp", "part_hour", explode(col("data_parsed")).alias("columna_json"))

# Aplanamos la estructura para que queden como columnas normales
df_estructurado = df_explotado.select("timestamp", "part_hour", "columna_json.*")

#Visualizamos la tabla
df_estructurado.limit(20).display()

timestamp,part_hour,srcIp,dstIp,srcPort,dstPort,srcMac,dstMac,protocol,totPackets,srcPackets,dstPackets,totBytes,srcBytes,dstBytes,totDataBytes,firstPacket,lastPacket,srcGEO,dstGEO,packetLen,http
2020-03-23 22:19:38.003,2020-03-23-22,192.151.112.163,189.203.246.14,56349,10003,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),"List(http, tcp)",28,16,12,3758,2236,1522,1045,1585023572196,1585023572472,US,MX,"List(0, 82, 82, 82, 82, 76, 76, 76, 76, 708, 708, 76, 76, 322, 322, 76, 76, 76, 76, 225, 225, 76, 76, 76, 76, 76, 76, 76, 76)",List(List(189.203.246.14))
2020-03-23 18:20:07.002,2020-03-23-18,201.140.104.57,66.110.49.32,41240,443,List(44:2b:03:53:d7:80),List(2c:21:72:d0:78:f6),List(tcp),6,4,2,396,264,132,0,1585009201920,1585009201984,MX,CA,"List(0, 82, 82, 82, 82, 82, 82)",
2020-03-23 22:19:39.005,2020-03-23-22,49.51.84.154,201.140.104.1,0,0,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(icmp),4,2,2,424,212,212,0,1585023568783,1585023568783,US,MX,"List(0, 122, 122, 122, 122)",
2020-03-23 18:20:08.001,2020-03-23-18,10.33.130.66,52.109.2.20,49732,443,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),List(tcp),6,4,2,360,240,120,0,1585009202729,1585009202730,,US,"List(0, 76, 76, 76, 76, 76, 76)",
2020-03-23 18:20:09.002,2020-03-23-18,187.151.20.154,201.140.104.13,50271,443,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(tcp),14,8,6,890,530,360,0,1585009203382,1585009203407,MX,MX,"List(0, 101, 101, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76)",
2020-03-23 22:19:39.005,2020-03-23-22,10.10.16.197,10.33.255.98,41517,161,List(00:08:e3:ff:fc:28),List(00:c8:8b:d3:ba:cd),"List(udp, snmp)",2,1,1,178,87,91,162,1585023548542,1585023548542,,,"List(0, 103, 107)",
2020-03-23 18:20:09.004,2020-03-23-18,10.33.224.109,23.3.183.64,54867,443,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),"List(tls, tcp)",28,13,15,12897,1537,11360,5894,1585009203286,1585009203359,,US,"List(0, 82, 82, 76, 76, 587, 76, 1458, 1530, 1318, 76, 1458, 1530, 1318, 76, 1225, 76, 1225, 76, 196, 76, 196, 76, 76, 76, 76, 76, 76, 76)",List(List(www.sensacine.com.mx))
2020-03-23 22:19:40.004,2020-03-23-22,187.237.231.13,201.140.104.13,58906,80,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(tcp),58,26,32,37818,5672,32146,0,1585023449623,1585023574401,MX,MX,"List(0, 739, 739, 482, 482, 82, 82, 767, 767, 82, 82, 1430, 1430, 1430, 1430, 1430, 1430, 1430, 1430, 82, 82, 82, 82, 82, 82, 1430, 1430, 1430, 1430, 1430, 1430, 1430, 254, 1430, 254, 82, 82, 82, 82, 82, 82, 718, 718, 82, 82, 1430, 1430, 1430, 1430, 1047, 1047, 82, 82, 82, 82, 82, 82, 82, 82)",
2020-03-23 18:20:11.001,2020-03-23-18,2806:104e:16:1d32:71e6:ee76:a9eb:64bf,2801:c4:15:200::84,55303,443,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(tcp),10,6,4,802,506,296,0,1585009205014,1585009205316,MX,MX,"List(0, 121, 121, 90, 90, 90, 90, 90, 90, 90, 90)",
2020-03-23 22:19:40.004,2020-03-23-22,10.0.53.229,10.0.6.113,19390,514,List(00:08:e3:ff:fc:28),List(0c:c4:7a:fb:e3:b8),"List(tls, tcp)",23,12,11,8869,4206,4663,7353,1585023574634,1585023574668,,,"List(0, 90, 90, 82, 522, 82, 1430, 1337, 82, 82, 1430, 1105, 82, 1316, 646, 161, 113, 113, 76, 82, 76, 82, 76, 82)",List(List(fortinet-ca2.fortinet.com))


**Sustituya la columna packetLen por los sumarizados: suma total, media, mínimo y máximo. Y asigne el tipo correcto a los datos por ejemplo lastPacket debe ser transformado a fecha.**

**Solución:** 
Primero vamos a convertir los campos `firstPacket` y `lastPacket` a formato Timestamp. Como vienen en milisegundos, tenemos que dividirlos entre 1000. También aprovechamos para castear `totDataBytes` que a veces viene como string vacío.
Después, usaremos funciones de arreglos de Spark para sacar las estadísticas de `packetLen` y borraremos la columna original.

In [0]:
import pyspark.sql.functions as F

# 1. Arreglamos los tipos de datos y fechas
df_tipos = df_estructurado.withColumn("totDataBytes", F.when(F.col("totDataBytes") == "", None).otherwise(F.col("totDataBytes")).cast(LongType()))
df_tipos = df_tipos.withColumn("firstPacket", (F.col("firstPacket") / 1000).cast(TimestampType()))
df_tipos = df_tipos.withColumn("lastPacket", (F.col("lastPacket") / 1000).cast(TimestampType()))

# 2. Sacamos las estadisticas del arreglo packetLen
df_tipos = df_tipos.withColumn("tamano_array", F.size(F.col("packetLen")))
df_tipos = df_tipos.withColumn("packet_len_suma", F.expr("aggregate(packetLen, 0L, (acc, x) -> acc + x)"))
df_tipos = df_tipos.withColumn("packet_len_min", F.array_min(F.col("packetLen")))
df_tipos = df_tipos.withColumn("packet_len_max", F.array_max(F.col("packetLen")))
df_tipos = df_tipos.withColumn("packet_len_media", F.col("packet_len_suma") / F.col("tamano_array"))

# Borramos la columna original como pide la instruccion
df_transformado = df_tipos.drop("packetLen", "tamano_array")

#Visualizamos la tabla
df_transformado.limit(20).display()

timestamp,part_hour,srcIp,dstIp,srcPort,dstPort,srcMac,dstMac,protocol,totPackets,srcPackets,dstPackets,totBytes,srcBytes,dstBytes,totDataBytes,firstPacket,lastPacket,srcGEO,dstGEO,http,packet_len_suma,packet_len_min,packet_len_max,packet_len_media
2020-03-23 22:19:38.003,2020-03-23-22,192.151.112.163,189.203.246.14,56349,10003,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),"List(http, tcp)",28,16,12,3758,2236,1522,1045,2020-03-24T04:19:32.196Z,2020-03-24T04:19:32.472Z,US,MX,List(List(189.203.246.14)),4206,0,708,145.0344827586207
2020-03-23 18:20:07.002,2020-03-23-18,201.140.104.57,66.110.49.32,41240,443,List(44:2b:03:53:d7:80),List(2c:21:72:d0:78:f6),List(tcp),6,4,2,396,264,132,0,2020-03-24T00:20:01.920Z,2020-03-24T00:20:01.984Z,MX,CA,,492,0,82,70.28571428571429
2020-03-23 22:19:39.005,2020-03-23-22,49.51.84.154,201.140.104.1,0,0,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(icmp),4,2,2,424,212,212,0,2020-03-24T04:19:28.783Z,2020-03-24T04:19:28.783Z,US,MX,,488,0,122,97.6
2020-03-23 18:20:08.001,2020-03-23-18,10.33.130.66,52.109.2.20,49732,443,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),List(tcp),6,4,2,360,240,120,0,2020-03-24T00:20:02.729Z,2020-03-24T00:20:02.730Z,,US,,456,0,76,65.14285714285714
2020-03-23 18:20:09.002,2020-03-23-18,187.151.20.154,201.140.104.13,50271,443,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(tcp),14,8,6,890,530,360,0,2020-03-24T00:20:03.382Z,2020-03-24T00:20:03.407Z,MX,MX,,1114,0,101,74.26666666666667
2020-03-23 22:19:39.005,2020-03-23-22,10.10.16.197,10.33.255.98,41517,161,List(00:08:e3:ff:fc:28),List(00:c8:8b:d3:ba:cd),"List(udp, snmp)",2,1,1,178,87,91,162,2020-03-24T04:19:08.542Z,2020-03-24T04:19:08.542Z,,,,210,0,107,70.0
2020-03-23 18:20:09.004,2020-03-23-18,10.33.224.109,23.3.183.64,54867,443,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),"List(tls, tcp)",28,13,15,12897,1537,11360,5894,2020-03-24T00:20:03.286Z,2020-03-24T00:20:03.359Z,,US,List(List(www.sensacine.com.mx)),13345,0,1530,460.1724137931034
2020-03-23 22:19:40.004,2020-03-23-22,187.237.231.13,201.140.104.13,58906,80,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(tcp),58,26,32,37818,5672,32146,0,2020-03-24T04:17:29.623Z,2020-03-24T04:19:34.401Z,MX,MX,,38746,0,1430,656.7118644067797
2020-03-23 18:20:11.001,2020-03-23-18,2806:104e:16:1d32:71e6:ee76:a9eb:64bf,2801:c4:15:200::84,55303,443,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(tcp),10,6,4,802,506,296,0,2020-03-24T00:20:05.014Z,2020-03-24T00:20:05.316Z,MX,MX,,962,0,121,87.45454545454545
2020-03-23 22:19:40.004,2020-03-23-22,10.0.53.229,10.0.6.113,19390,514,List(00:08:e3:ff:fc:28),List(0c:c4:7a:fb:e3:b8),"List(tls, tcp)",23,12,11,8869,4206,4663,7353,2020-03-24T04:19:34.634Z,2020-03-24T04:19:34.668Z,,,List(List(fortinet-ca2.fortinet.com)),9237,0,1430,384.875


**Cree una tabla silver con la información transformada, los nombres de los campos deben estar en snake_case y carga únicamente la información de session_part1.csv**

**Solución:** 
Para no cambiar los nombres uno por uno a mano, vamos a hacer un ciclo `for` que pase por todas las columnas y use una expresión regular para convertir las mayúsculas en minúsculas separadas por un guión bajo (snake_case). Al final, guardamos todo en nuestra base de datos.

In [0]:
import re

# Funcion sencilla en python para cambiar formato a snake_case
def convertir_snake_case(nombre):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', nombre)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

df_silver_final = df_transformado

# Recorremos todas las columnas y las renombramos
for columna in df_silver_final.columns:
    df_silver_final = df_silver_final.withColumnRenamed(columna, convertir_snake_case(columna))

# Guardamos como tabla silver 
df_silver_final.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("dev.ciencias_data.silver_sessions")

#Visualizamos la tabla
df_silver_final.limit(20).display()

timestamp,part_hour,src_ip,dst_ip,src_port,dst_port,src_mac,dst_mac,protocol,tot_packets,src_packets,dst_packets,tot_bytes,src_bytes,dst_bytes,tot_data_bytes,first_packet,last_packet,src_geo,dst_geo,http,packet_len_suma,packet_len_min,packet_len_max,packet_len_media
2020-03-23 22:19:38.003,2020-03-23-22,192.151.112.163,189.203.246.14,56349,10003,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),"List(http, tcp)",28,16,12,3758,2236,1522,1045,2020-03-24T04:19:32.196Z,2020-03-24T04:19:32.472Z,US,MX,List(List(189.203.246.14)),4206,0,708,145.0344827586207
2020-03-23 18:20:07.002,2020-03-23-18,201.140.104.57,66.110.49.32,41240,443,List(44:2b:03:53:d7:80),List(2c:21:72:d0:78:f6),List(tcp),6,4,2,396,264,132,0,2020-03-24T00:20:01.920Z,2020-03-24T00:20:01.984Z,MX,CA,,492,0,82,70.28571428571429
2020-03-23 22:19:39.005,2020-03-23-22,49.51.84.154,201.140.104.1,0,0,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(icmp),4,2,2,424,212,212,0,2020-03-24T04:19:28.783Z,2020-03-24T04:19:28.783Z,US,MX,,488,0,122,97.6
2020-03-23 18:20:08.001,2020-03-23-18,10.33.130.66,52.109.2.20,49732,443,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),List(tcp),6,4,2,360,240,120,0,2020-03-24T00:20:02.729Z,2020-03-24T00:20:02.730Z,,US,,456,0,76,65.14285714285714
2020-03-23 18:20:09.002,2020-03-23-18,187.151.20.154,201.140.104.13,50271,443,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(tcp),14,8,6,890,530,360,0,2020-03-24T00:20:03.382Z,2020-03-24T00:20:03.407Z,MX,MX,,1114,0,101,74.26666666666667
2020-03-23 22:19:39.005,2020-03-23-22,10.10.16.197,10.33.255.98,41517,161,List(00:08:e3:ff:fc:28),List(00:c8:8b:d3:ba:cd),"List(udp, snmp)",2,1,1,178,87,91,162,2020-03-24T04:19:08.542Z,2020-03-24T04:19:08.542Z,,,,210,0,107,70.0
2020-03-23 18:20:09.004,2020-03-23-18,10.33.224.109,23.3.183.64,54867,443,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),"List(tls, tcp)",28,13,15,12897,1537,11360,5894,2020-03-24T00:20:03.286Z,2020-03-24T00:20:03.359Z,,US,List(List(www.sensacine.com.mx)),13345,0,1530,460.1724137931034
2020-03-23 22:19:40.004,2020-03-23-22,187.237.231.13,201.140.104.13,58906,80,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(tcp),58,26,32,37818,5672,32146,0,2020-03-24T04:17:29.623Z,2020-03-24T04:19:34.401Z,MX,MX,,38746,0,1430,656.7118644067797
2020-03-23 18:20:11.001,2020-03-23-18,2806:104e:16:1d32:71e6:ee76:a9eb:64bf,2801:c4:15:200::84,55303,443,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(tcp),10,6,4,802,506,296,0,2020-03-24T00:20:05.014Z,2020-03-24T00:20:05.316Z,MX,MX,,962,0,121,87.45454545454545
2020-03-23 22:19:40.004,2020-03-23-22,10.0.53.229,10.0.6.113,19390,514,List(00:08:e3:ff:fc:28),List(0c:c4:7a:fb:e3:b8),"List(tls, tcp)",23,12,11,8869,4206,4663,7353,2020-03-24T04:19:34.634Z,2020-03-24T04:19:34.668Z,,,List(List(fortinet-ca2.fortinet.com)),9237,0,1430,384.875


### 3. Diseñe un proceso que realice una carga incremental y cargue los datos de session_part2.csv en la tabla anteriormente creada, para ello debera aplicar las mismas transformaciones y agregaciones realizadas para la parte 1.

**Solución:** 
Para hacer la carga incremental, vamos a leer el segundo archivo (`sessions_part2.csv`). Primero le agregaremos la partición por hora y luego le aplicaremos exactamente la misma secuencia de transformaciones que usamos en el paso anterior (el parseo con la UDF, el casteo de fechas, los cálculos estadísticos de `packetLen` y los nombres en snake_case). 

La clave de este paso es que al momento de guardar el dataframe en nuestra tabla silver, usaremos el modo `append` en lugar de `overwrite`. Esto hará que los datos nuevos se sumen a los que ya teníamos de la parte 1.

In [0]:
# Leemos el segundo archivo csv
df_part2 = spark.read.format("csv").option("sep", "|").option("header", "true").load("/Volumes/dev/ciencias_data/session_data/sessions_part2.csv")

# Agregamos la partición por hora
df_bronce_p2 = df_part2.withColumn("part_hour", F.date_format(F.to_timestamp("timestamp"), "yyyy-MM-dd-HH"))

# Parseamos el JSON con la UDF que ya teníamos registrada arriba
df_parseado_p2 = df_bronce_p2.withColumn("data_parsed", udf_parseo(F.col("data")))
df_parseado_p2 = df_parseado_p2.filter(F.size(F.col("data_parsed")) > 0)
df_explotado_p2 = df_parseado_p2.select("timestamp", "part_hour", F.explode(F.col("data_parsed")).alias("columna_json"))

# Aplanamos la estructura
df_estructurado_p2 = df_explotado_p2.select("timestamp", "part_hour", "columna_json.*")

# Transformaciones de tipos y packetLen (igual que en la parte 1)
df_tipos_p2 = df_estructurado_p2.withColumn("totDataBytes", F.when(F.col("totDataBytes") == "", None).otherwise(F.col("totDataBytes")).cast(LongType()))
df_tipos_p2 = df_tipos_p2.withColumn("firstPacket", (F.col("firstPacket") / 1000).cast(TimestampType()))
df_tipos_p2 = df_tipos_p2.withColumn("lastPacket", (F.col("lastPacket") / 1000).cast(TimestampType()))

df_tipos_p2 = df_tipos_p2.withColumn("tamano_array", F.size(F.col("packetLen")))
df_tipos_p2 = df_tipos_p2.withColumn("packet_len_suma", F.expr("aggregate(packetLen, 0L, (acc, x) -> acc + x)"))
df_tipos_p2 = df_tipos_p2.withColumn("packet_len_min", F.array_min(F.col("packetLen")))
df_tipos_p2 = df_tipos_p2.withColumn("packet_len_max", F.array_max(F.col("packetLen")))
df_tipos_p2 = df_tipos_p2.withColumn("packet_len_media", F.col("packet_len_suma") / F.col("tamano_array"))

df_transformado_p2 = df_tipos_p2.drop("packetLen", "tamano_array")

# Pasamos las columnas a snake_case usando nuestra funcion de python
df_silver_p2 = df_transformado_p2
for columna in df_silver_p2.columns:
    df_silver_p2 = df_silver_p2.withColumnRenamed(columna, convertir_snake_case(columna))

# Guardamos en la misma tabla pero con modo APPEND para la carga incremental
df_silver_p2.write.format("delta").mode("append").saveAsTable("dev.ciencias_data.silver_sessions")

# Cargamos nuestra tabla silver completa
df_silver = spark.sql("SELECT * FROM dev.ciencias_data.silver_sessions")

#Visualizamos la tabla
df_silver.limit(20).display()

timestamp,part_hour,src_ip,dst_ip,src_port,dst_port,src_mac,dst_mac,protocol,tot_packets,src_packets,dst_packets,tot_bytes,src_bytes,dst_bytes,tot_data_bytes,first_packet,last_packet,src_geo,dst_geo,http,packet_len_suma,packet_len_min,packet_len_max,packet_len_media
2020-03-23 18:05:49.004,2020-03-23-18,192.151.112.163,187.190.36.232,60547,10001,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),"List(http, tcp)",24,14,10,3188,1502,1686,874,2020-03-24T00:05:43.067Z,2020-03-24T00:05:43.716Z,US,MX,List(null),3572,0,613,142.88
2020-03-23 23:04:44.006,2020-03-23-23,201.140.104.57,172.217.5.174,48752,443,List(44:2b:03:53:d7:80),List(2c:21:72:d0:78:f6),List(tcp),12,6,6,824,412,412,0,2020-03-24T05:04:28.884Z,2020-03-24T05:04:38.913Z,MX,US,,1016,0,90,78.15384615384616
2020-03-23 18:05:49.005,2020-03-23-18,10.10.16.197,199.7.91.13,37999,53,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),"List(udp, dns)",2,2,0,232,232,0,216,2020-03-24T00:05:18.661Z,2020-03-24T00:05:18.662Z,,US,,264,0,132,88.0
2020-03-23 23:04:45.004,2020-03-23-23,2806:104e:13:3c81:2588:909e:ebf3:80e9,2801:c4:15:200::84,50730,80,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),"List(http, tcp)",346,118,228,332472,15550,316922,97900,2020-03-24T05:04:29.980Z,2020-03-24T05:04:39.287Z,MX,MX,List(null),338008,0,1522,974.086455331412
2020-03-23 18:05:49.005,2020-03-23-18,10.10.18.92,10.10.16.209,54946,443,List(00:50:56:a7:5b:eb),List(00:08:e3:ff:fc:28),"List(tls, tcp)",12,7,5,1398,946,452,676,2020-03-24T00:05:43.843Z,2020-03-24T00:05:43.846Z,,,,1590,0,587,122.3076923076923
2020-03-23 23:04:45.005,2020-03-23-23,10.10.72.2,10.3.68.119,0,0,List(00:08:e3:ff:fc:28),List(00:af:1f:60:9a:cd),List(icmp),2,1,1,130,65,65,0,2020-03-24T05:04:34.119Z,2020-03-24T05:04:34.120Z,,,,162,0,81,54.0
2020-03-23 18:05:49.005,2020-03-23-18,3.224.34.30,201.140.104.57,443,61069,List(2c:21:72:d0:78:f6),List(44:2b:03:53:d7:80),List(tcp),10,4,6,722,326,396,0,2020-03-24T00:05:39.527Z,2020-03-24T00:05:43.849Z,US,MX,,882,0,113,80.18181818181819
2020-03-23 18:05:55.002,2020-03-23-18,10.10.18.88,10.10.18.11,0,0,"List(00:08:e3:ff:fc:28, 00:0c:29:68:d5:9a)","List(20:04:0f:f0:9d:d8, 00:08:e3:ff:fc:28)",List(icmp),4,2,2,680,340,340,0,2020-03-24T00:05:44.413Z,2020-03-24T00:05:44.413Z,,,,744,0,186,148.8
2020-03-23 23:04:46.005,2020-03-23-23,10.33.181.44,10.1.0.74,5246,5246,"List(90:6c:ac:42:83:ac, 40:ce:24:8a:ea:80)","List(40:ce:24:8a:ea:80, 00:09:0f:09:03:11)",List(udp),4,2,2,1132,634,498,1100,2020-03-24T05:04:15.830Z,2020-03-24T05:04:15.833Z,,,,1196,0,333,239.2
2020-03-23 18:05:55.003,2020-03-23-18,10.10.16.197,192.36.148.17,47928,53,List(00:08:e3:ff:fc:28),List(00:09:0f:09:02:08),"List(udp, dns)",1,1,0,102,102,0,94,2020-03-24T00:05:24.196Z,2020-03-24T00:05:24.196Z,,SE,,118,0,118,59.0


## 4. Una vez que tenga la tabla silver estructurada realice lo siguiente:

**Obtenga el número de sesiones por países destino y países origen.**

**Solución:** 
Usaremos las columnas `src_geo` y `dst_geo`. Solo necesitamos agrupar por ambos campos, contarlos y ordenarlos de mayor a menor para ver las rutas más comunes.

In [0]:
df_paises = df_silver.groupBy("src_geo", "dst_geo").count().orderBy(F.desc("count"))
df_paises.limit(20).display()

src_geo,dst_geo,count
,,11211
,US,4349
US,MX,2696
MX,MX,2579
MX,US,661
US,,599
FR,,570
MX,,269
US,US,254
,MX,251


**Obtenga el número de sesiones por srcIP y dstIP, así también totBytes, totDataBytes y totPackets por srcIP y Protocolo.**

**Solución:**
Esta pregunta nos pide dos cosas distintas, así que haremos dos cálculos en esta celda:
1. Primero, agrupamos por IP de origen (`src_ip`) y de destino (`dst_ip`) y contamos cuántas sesiones hay entre ellas.
2. Segundo, para los totales por protocolo, como la columna `protocol` es un arreglo, usamos `explode` para separar cada protocolo en una fila. Luego agrupamos por la IP de origen y el protocolo individual para sumar los bytes y paquetes.

In [0]:
# Parte 1: Sesiones por IPs
df_ips = df_silver.groupBy("src_ip", "dst_ip").count().orderBy(F.desc("count"))

print("1. Número de sesiones por IP Origen y Destino:")
df_ips.limit(20).display()

# Parte 2: Totales por IP y Protocolo
# Separamos el arreglo de protocolos
df_protocolos = df_silver.withColumn("protocolo_ind", F.explode(F.col("protocol")))

# Agrupamos y sumamos
df_metricas_proto = df_protocolos.groupBy("src_ip", "protocolo_ind").agg(F.sum("tot_bytes"), F.sum("tot_data_bytes"), F.sum("tot_packets"))

print("2. Totales por SrcIP y Protocolo:")
df_metricas_proto.limit(20).display()

1. Número de sesiones por IP Origen y Destino:


src_ip,dst_ip,count
10.1.0.93,10.0.6.113,632
2.5.5.2,10.128.0.1,543
10.10.16.197,10.33.191.126,432
10.10.16.197,10.1.0.77,319
192.151.112.162,187.188.92.141,299
192.151.112.163,189.203.246.14,298
10.34.91.250,10.6.1.240,224
192.151.112.162,187.189.183.202,217
10.33.174.233,104.16.107.144,173
192.151.112.162,187.188.87.225,171


2. Totales por SrcIP y Protocolo:


src_ip,protocolo_ind,sum(tot_bytes),sum(tot_data_bytes),sum(tot_packets)
10.10.8.22,snmp,44924,41276,456
10.8.131.9,tcp,150836,61878,494
10.33.191.126,snmp,5034,4778,32
fe80::3492:9d40:4bd8:a889,udp,26374,24150,278
10.33.181.69,udp,4528,4400,16
10.65.118.2,tcp,176370,47351,641
10.33.191.80,tcp,173274,44595,1224
172.35.168.6,tcp,378441,96062,1649
10.33.167.199,udp,2480,2304,22
10.33.120.175,tls,703662,193110,2278


**Obtenga el totBytes, totDataBytes y totPackets por srcMac y dstMac.**

**Solución:**
Las columnas de MAC (`src_mac` y `dst_mac`) son arreglos porque los paquetes pasan por varios dispositivos. Spark no deja agrupar por arreglos, así que primero los convertimos a texto separándolos por comas (`concat_ws`) y luego ya podemos agrupar y sumar los totales.

In [0]:
# Convertimos los arreglos a texto para poder agrupar
df_macs_texto = df_silver.withColumn("src_mac_txt", F.concat_ws(",", F.col("src_mac")))
df_macs_texto = df_macs_texto.withColumn("dst_mac_txt", F.concat_ws(",", F.col("dst_mac")))

df_totales_mac = df_macs_texto.groupBy("src_mac_txt", "dst_mac_txt").agg(F.sum("tot_bytes"), F.sum("tot_data_bytes"), F.sum("tot_packets"))

df_totales_mac.limit(20).display()

src_mac_txt,dst_mac_txt,sum(tot_bytes),sum(tot_data_bytes),sum(tot_packets)
00:09:0f:09:00:09,00:09:0f:09:02:09,14767708,14548753,22649
54:e1:ad:6d:59:12,01:00:5e:00:00:fc,600,536,8
00:08:e3:ff:fc:28,00:23:ac:05:4c:c1,35518,30626,274
"00:08:e3:ff:fc:28,00:0c:29:0f:67:f4","00:08:e3:ff:fc:28,00:09:0f:09:02:08",67578,20558,268
"40:ce:24:8a:ea:80,00:09:0f:09:03:10","68:ca:e4:df:b0:d4,40:ce:24:8a:ea:80",57302,50966,342
00:15:99:7f:da:71,01:00:5e:7f:ff:fa,33706,32946,95
00:08:e3:ff:fc:28,00:09:0f:09:14:22,31995603,102285,41087
68:ca:e4:df:b0:d4,40:ce:24:8a:ea:80,536,488,6
04:d5:90:04:34:c2,00:25:46:09:bd:c1,11951,6869,120
6c:2b:59:ed:de:5f,33:33:00:01:00:03,168,152,2


**Obtenga los valores mínimo, máximo y promedio para los totBytes, totDataBytes y totPackets por srcIp, dstIp, srcMac y dstMac**

**Solución:**
Vamos a usar el dataframe anterior que ya tiene las MACs como texto (`df_macs_texto`). Agrupamos por las 4 columnas que nos piden (IPs y MACs) y calculamos el `min`, `max` y `avg` (promedio) de cada métrica.

In [0]:
df_estadisticas = df_macs_texto.groupBy("src_ip", "dst_ip", "src_mac_txt", "dst_mac_txt").agg(
    F.min("tot_bytes"), F.max("tot_bytes"), F.avg("tot_bytes"),
    F.min("tot_data_bytes"), F.max("tot_data_bytes"), F.avg("tot_data_bytes"),
    F.min("tot_packets"), F.max("tot_packets"), F.avg("tot_packets")
)

df_estadisticas.limit(20).display()

src_ip,dst_ip,src_mac_txt,dst_mac_txt,min(tot_bytes),max(tot_bytes),avg(tot_bytes),min(tot_data_bytes),max(tot_data_bytes),avg(tot_data_bytes),min(tot_packets),max(tot_packets),avg(tot_packets)
10.40.130.27,172.217.7.42,"00:08:e3:ff:fc:28,00:af:1f:60:9a:cd","00:08:e3:ff:fc:28,00:09:0f:09:02:08",17616,17616,17616.0,17424,17424,17424.0,24,24,24.0
10.10.16.202,10.20.1.253,00:08:e3:ff:fc:28,00:7e:95:e1:ea:46,379,379,379.0,0,0,0.0,1,1,1.0
10.10.16.197,10.33.191.117,00:09:0f:09:03:11,40:ce:24:8a:ea:80,176,176,176.0,160,160,160.0,2,2,2.0
10.33.170.206,96.45.33.73,6c:0b:84:66:7c:e4,e8:1c:ba:f7:5f:72,368,368,368.0,336,336,336.0,4,4,4.0
10.10.16.197,10.33.255.106,00:08:e3:ff:fc:28,40:55:39:af:8c:c1,88,2335,218.8771929824561,80,2239,201.6140350877193,1,12,2.1578947368421053
2806:2f0:9005:d41b:99e2:9516:53f7:e0fe,2801:c4:15:200::20,2c:21:72:d0:78:f6,44:2b:03:53:d7:80,8248,8248,8248.0,2560,2560,2560.0,36,36,36.0
10.33.225.70,172.217.9.3,00:08:e3:ff:fc:28,00:09:0f:09:02:08,612,612,612.0,0,0,0.0,10,10,10.0
10.9.132.196,224.0.0.252,6c:2b:59:ef:b0:5b,01:00:5e:00:00:fc,144,146,145.0,128,130,129.0,2,2,2.0
10.10.8.22,10.6.1.236,00:08:e3:ff:fc:28,28:80:23:af:91:a6,980,980,980.0,0,0,0.0,10,10,10.0
172.35.168.29,72.21.81.240,"00:08:e3:ff:fc:28,00:7e:95:e1:ea:46","00:08:e3:ff:fc:28,00:09:0f:09:02:08",5580,5580,5580.0,1320,1320,1320.0,27,27,27.0


**Haga un top 5 de srcIp, srcMac con mayor número de sesiones.**

**Solución:**
Haremos dos conteos rápidos. Uno agrupando solo por la IP de origen y otro por la MAC de origen (usando la versión de texto). Ordenamos de mayor a menor y limitamos a los primeros 5 resultados.

In [0]:
print("Top 5 srcIP:")
df_silver.groupBy("src_ip").count().orderBy(F.desc("count")).limit(5).display()

print("Top 5 srcMac:")
df_macs_texto.groupBy("src_mac_txt").count().orderBy(F.desc("count")).limit(5).display()

Top 5 srcIP:


src_ip,count
10.10.16.197,3096
10.33.176.35,1125
192.151.112.162,1037
192.151.112.163,936
10.1.0.93,632


Top 5 srcMac:


src_mac_txt,count
00:08:e3:ff:fc:28,7333
2c:21:72:d0:78:f6,3158
44:2b:03:53:d7:80,1406
"40:ce:24:8a:ea:80,d0:27:88:42:f4:8a",899
"00:08:e3:ff:fc:28,00:af:1f:60:9a:cd",761


**Cuente el número de srcMac, dstMac involucradas de cada sesión, es decir el tamaño del array de esos campos.**

**Solución:**
Simplemente usamos la función `size` sobre las columnas originales que tienen los arreglos para saber cuántas MACs hay en cada lista.

In [0]:
df_tamano_macs = df_silver.select("src_ip", "dst_ip", F.size(F.col("src_mac")).alias("conteo_src_mac"), F.size(F.col("dst_mac")).alias("conteo_dst_mac"))
df_tamano_macs.limit(20).display()

src_ip,dst_ip,conteo_src_mac,conteo_dst_mac
192.151.112.163,187.190.36.232,1,1
201.140.104.57,172.217.5.174,1,1
10.10.16.197,199.7.91.13,1,1
2806:104e:13:3c81:2588:909e:ebf3:80e9,2801:c4:15:200::84,1,1
10.10.18.92,10.10.16.209,1,1
10.10.72.2,10.3.68.119,1,1
3.224.34.30,201.140.104.57,1,1
10.10.18.88,10.10.18.11,2,2
10.33.181.44,10.1.0.74,2,2
10.10.16.197,192.36.148.17,1,1


**¿Cuáles son los protocolos de red más usados?**

**Solución:**
Usamos de nuevo el `explode` en la columna `protocol` para contar cada uno por separado, ordenamos por cantidad y mostramos los más frecuentes.

In [0]:
df_protocolos_top = df_silver.select(F.explode(F.col("protocol")).alias("protocolo")).groupBy("protocolo").count().orderBy(F.desc("count"))
df_protocolos_top.limit(20).display()

protocolo,count
tcp,12133
udp,10771
tls,4771
snmp,4372
http,4072
dns,2868
icmp,2773
llmnr,686
ssdp,392
ldap,273


**¿Cuáles son las páginas web más visitadas?**

**Solución:**
El host está dentro de la estructura `http`. Primero filtramos para quedarnos solo con los registros que no sean nulos (que sí sean tráfico web) y luego explotamos el arreglo `http.host` para contar las páginas individuales.

In [0]:
# Filtramos nulos y explotamos
df_webs = df_silver.filter(F.col("http.host").isNotNull()).select(F.explode(F.col("http.host")).alias("pagina_web"))

df_webs_top = df_webs.groupBy("pagina_web").count().orderBy(F.desc("count"))
df_webs_top.limit(20).display()


pagina_web,count
j5yrru.manage.trendmicro.com,519
187.189.183.202,352
187.188.92.141,292
189.203.246.14,248
9gag.com,227
chat2.sat.gob.mx,217
189.207.250.121,206
sat.gob.mx,169
elb-nvi-amz.nimbus.bitdefender.net,166
172.16.239.8,160
