In [51]:
import boto3
import os,sys
import datetime as dt
import glob
import pandas as pd
pd.set_option('display.max_rows', 1000)
import hvplot.pandas
import holoviews as hv
import subprocess as sp

In [52]:
os.makedirs('/dev/shm/logs',exist_ok=True)
os.chdir('/dev/shm/logs')

In [53]:
!aws s3 sync s3://earthbigdata/logs/ /dev/shm/logs --quiet

In [54]:
def make_entry(ip_dict,date_dict,logfile):  
    with open(logfile,"r") as f:
        lines=f.readlines()
    date = lines[0].split('[')[1].split('+')[0].strip()
    date = dt.datetime.strptime(date,'%d/%b/%Y:%H:%M:%S')
    ipaddr=lines[0].split(']')[1].split('-')[0].strip()
    if not ipaddr in ip_dict:
        ip_dict[ipaddr]=[]
    if not date.date() in date_dict:
        date_dict[date.date()]=0
    ip_dict[ipaddr].append(date)
    date_dict[date.date()]+=1
    

In [55]:
ip_dict={}
date_dict={}

for log in glob.glob('ebd-covid19*'):
    make_entry(ip_dict,date_dict,log)

In [56]:
access=pd.DataFrame.from_dict(date_dict,orient='index')

In [57]:
access.sort_index(inplace=True,ascending=False)

In [58]:
access.hvplot.barh(padding=0.1,grid=True,ylabel='Access Times per day',xlabel='',title='EBD COVID 19 Access')

In [43]:
ip_count={}
for i in ip_dict:
    ip_count[i]=len(ip_dict[i])

In [44]:
access_count=pd.DataFrame.from_dict(ip_count,orient='index')

access_count=access_count[access_count>=5].dropna()

access_count = access_count.sort_values(0,ascending=False)

access_count['nslookup']=''
for i in access_count.index:
    cmd='nslookup {}'.format(i)
    try:
        ret= sp.check_output(cmd.split())
        nslookup = ret.decode().split('name = ')[1].split('\n')[0].strip('.')
        # print(nslookup)
        access_count.loc[i,'nslookup']=nslookup
    except:
        pass

In [45]:
access_count.head(40)

Unnamed: 0,0,nslookup
24.218.227.219,536.0,c-24-218-227-219.hsd1.ma.comcast.net
52.44.93.197,141.0,ec2-52-44-93-197.compute-1.amazonaws.com
73.23.120.180,112.0,c-73-23-120-180.hsd1.fl.comcast.net
34.231.157.157,111.0,ec2-34-231-157-157.compute-1.amazonaws.com
34.232.127.140,106.0,ec2-34-232-127-140.compute-1.amazonaws.com
93.218.246.5,71.0,p5DDAF605.dip0.t-ipconnect.de
72.205.80.22,65.0,ip72-205-80-22.sb.sd.cox.net
71.232.190.220,64.0,c-71-232-190-220.hsd1.ma.comcast.net
107.193.48.19,61.0,107-193-48-19.lightspeed.sndgca.sbcglobal.net
73.143.234.232,60.0,c-73-143-234-232.hsd1.ma.comcast.net


In [46]:
len(list(ip_dict.keys()))

2267

In [47]:
list(ip_dict.keys())[:10]

['72.205.80.22',
 '173.252.95.43',
 '3.86.107.205 arn:aws:iam::811071659227:user/josefk C525E534DD5C55F7 REST.PUT.OBJECT index.html "PUT /ebd',
 '24.218.227.219',
 '67.171.185.11',
 '76.80.178.3',
 '96.35.76.104',
 '66.220.149.11',
 '68.48.240.2',
 '98.250.114.207']

In [48]:
for i in list(ip_dict.keys()):
    if ip_dict[i][-1].date() == dt.datetime.now().date():
        if i in list(access_count.index):
            print(ip_dict[i][-1].date(),access_count.loc[i][0],'\t',access_count.loc[i].nslookup)

In [49]:
access.tail()

Unnamed: 0,0
2020-03-24,721
2020-03-23,1745
2020-03-22,637
2020-03-21,800
2020-03-20,277


In [50]:
ip_dict

{'72.205.80.22': [datetime.datetime(2020, 4, 3, 0, 47, 27),
  datetime.datetime(2020, 4, 2, 15, 33, 25),
  datetime.datetime(2020, 4, 2, 15, 33, 25),
  datetime.datetime(2020, 4, 2, 15, 33, 32),
  datetime.datetime(2020, 3, 30, 22, 24, 41),
  datetime.datetime(2020, 3, 30, 22, 24, 40),
  datetime.datetime(2020, 3, 30, 22, 25, 16),
  datetime.datetime(2020, 3, 30, 19, 38, 56),
  datetime.datetime(2020, 3, 30, 19, 38, 56),
  datetime.datetime(2020, 3, 29, 22, 11, 23),
  datetime.datetime(2020, 3, 29, 22, 11, 38),
  datetime.datetime(2020, 3, 29, 22, 7, 38),
  datetime.datetime(2020, 3, 29, 22, 11, 23),
  datetime.datetime(2020, 3, 29, 22, 7, 38),
  datetime.datetime(2020, 3, 29, 5, 24, 33),
  datetime.datetime(2020, 3, 29, 5, 26, 18),
  datetime.datetime(2020, 3, 29, 5, 26, 19),
  datetime.datetime(2020, 3, 29, 5, 24, 32),
  datetime.datetime(2020, 3, 28, 14, 16, 22),
  datetime.datetime(2020, 3, 28, 14, 16, 21),
  datetime.datetime(2020, 3, 28, 14, 16, 30),
  datetime.datetime(2020, 3, 