In [30]:
import boto3
import os,sys
import datetime as dt
import glob
import pandas as pd
pd.set_option('display.max_rows', 1000)
import hvplot.pandas
import holoviews as hv
import subprocess as sp

In [31]:
os.makedirs('/dev/shm/logs',exist_ok=True)
os.chdir('/dev/shm/logs')

In [32]:
!aws s3 sync s3://earthbigdata/logs/ /dev/shm/logs --quiet

In [33]:
def make_entry(ip_dict,date_dict,logfile):  
    with open(logfile,"r") as f:
        lines=f.readlines()
    date = lines[0].split('[')[1].split('+')[0].strip()
    date = dt.datetime.strptime(date,'%d/%b/%Y:%H:%M:%S')
    ipaddr=lines[0].split(']')[1].split('-')[0].strip()
    if not ipaddr in ip_dict:
        ip_dict[ipaddr]=[]
    if not date.date() in date_dict:
        date_dict[date.date()]=0
    ip_dict[ipaddr].append(date)
    date_dict[date.date()]+=1
    

In [34]:
ip_dict={}
date_dict={}

for log in glob.glob('ebd-covid19*'):
    make_entry(ip_dict,date_dict,log)

In [35]:
access=pd.DataFrame.from_dict(date_dict,orient='index')

In [36]:
access.sort_index(inplace=True,ascending=False)

In [37]:
access.hvplot.barh(padding=0.1,grid=True,ylabel='Access Times per day',xlabel='',title='EBD COVID 19 Access')

In [38]:
ip_count={}
for i in ip_dict:
    ip_count[i]=len(ip_dict[i])

In [39]:
access_count=pd.DataFrame.from_dict(ip_count,orient='index')

access_count=access_count[access_count>=5].dropna()

access_count = access_count.sort_values(0,ascending=False)

access_count['nslookup']=''
for i in access_count.index:
    cmd='nslookup {}'.format(i)
    try:
        ret= sp.check_output(cmd.split())
        nslookup = ret.decode().split('name = ')[1].split('\n')[0].strip('.')
        # print(nslookup)
        access_count.loc[i,'nslookup']=nslookup
    except:
        pass

In [40]:
access_count.head(40)

Unnamed: 0,0,nslookup
24.218.227.219,502.0,c-24-218-227-219.hsd1.ma.comcast.net
52.44.93.197,131.0,ec2-52-44-93-197.compute-1.amazonaws.com
73.23.120.180,112.0,c-73-23-120-180.hsd1.fl.comcast.net
34.231.157.157,111.0,ec2-34-231-157-157.compute-1.amazonaws.com
34.232.127.140,106.0,ec2-34-232-127-140.compute-1.amazonaws.com
93.218.246.5,71.0,p5DDAF605.dip0.t-ipconnect.de
71.232.190.220,64.0,c-71-232-190-220.hsd1.ma.comcast.net
72.205.80.22,61.0,ip72-205-80-22.sb.sd.cox.net
107.193.48.19,59.0,107-193-48-19.lightspeed.sndgca.sbcglobal.net
68.48.240.2,58.0,c-68-48-240-2.hsd1.mi.comcast.net


In [41]:
len(list(ip_dict.keys()))

2211

In [42]:
list(ip_dict.keys())[:10]

['50.105.116.149',
 '107.5.111.149',
 '66.188.172.228',
 '76.80.178.3',
 '173.63.26.215',
 '3.86.107.205 arn:aws:iam::811071659227:user/josefk 6B42681EE02A1ADE REST.PUT.OBJECT index.html "PUT /ebd',
 '196.52.2.78',
 '128.149.248.89',
 '108.7.205.117',
 '68.36.48.51']

In [43]:
for i in list(ip_dict.keys()):
    if ip_dict[i][-1].date() == dt.datetime.now().date():
        if i in list(access_count.index):
            print(ip_dict[i][-1].date(),access_count.loc[i][0],'\t',access_count.loc[i].nslookup)

2020-04-02 9.0 	 
2020-04-02 6.0 	 66-188-172-228.dhcp.stcd.mn.charter.com
2020-04-02 16.0 	 pool-108-7-205-117.bstnma.fios.verizon.net
2020-04-02 6.0 	 c-68-36-48-51.hsd1.mi.comcast.net
2020-04-02 10.0 	 bwhmaincampuspat5.partners.org
2020-04-02 5.0 	 c-73-145-131-117.hsd1.mi.comcast.net
2020-04-02 6.0 	 162-201-0-35.lightspeed.livnmi.sbcglobal.net
