In [14]:
import boto3
import os,sys
import datetime as dt
import glob
import pandas as pd
pd.set_option('display.max_rows', 1000)
import hvplot.pandas
import holoviews as hv
import subprocess as sp

In [15]:
os.makedirs('/dev/shm/logs',exist_ok=True)
os.chdir('/dev/shm/logs')

In [20]:
!aws s3 sync s3://earthbigdata/logs/ /dev/shm/logs --quiet

In [21]:
def make_entry(ip_dict,date_dict,logfile):  
    with open(logfile,"r") as f:
        lines=f.readlines()
    date = lines[0].split('[')[1].split('+')[0].strip()
    date = dt.datetime.strptime(date,'%d/%b/%Y:%H:%M:%S')
    ipaddr=lines[0].split(']')[1].split('-')[0].strip()
    if not ipaddr in ip_dict:
        ip_dict[ipaddr]=[]
    if not date.date() in date_dict:
        date_dict[date.date()]=0
    ip_dict[ipaddr].append(date)
    date_dict[date.date()]+=1
    

In [22]:
ip_dict={}
date_dict={}

for log in glob.glob('ebd-covid19*'):
    make_entry(ip_dict,date_dict,log)

In [23]:
access=pd.DataFrame.from_dict(date_dict,orient='index')

In [24]:
access.sort_index(inplace=True,ascending=False)

In [25]:
access.hvplot.barh(padding=0.1,grid=True,ylabel='Access Times per day',xlabel='',title='EBD COVID 19 Access')

In [26]:
access.head()

Unnamed: 0,0
2020-04-14,700
2020-04-13,216
2020-04-12,236
2020-04-11,219
2020-04-10,255


In [27]:
ip_count={}
for i in ip_dict:
    ip_count[i]=len(ip_dict[i])

In [28]:
access_count=pd.DataFrame.from_dict(ip_count,orient='index')

access_count=access_count[access_count>=5].dropna()

access_count = access_count.sort_values(0,ascending=False)

access_count['nslookup']=''
for i in access_count.index:
    cmd='nslookup {}'.format(i)
    try:
        ret= sp.check_output(cmd.split())
        nslookup = ret.decode().split('name = ')[1].split('\n')[0].strip('.')
        # print(nslookup)
        access_count.loc[i,'nslookup']=nslookup
    except:
        pass

In [29]:
access_count.head(40)

Unnamed: 0,0,nslookup
24.218.227.219,850.0,c-24-218-227-219.hsd1.ma.comcast.net
52.44.93.197,229.0,ec2-52-44-93-197.compute-1.amazonaws.com
34.232.127.140,181.0,ec2-34-232-127-140.compute-1.amazonaws.com
34.231.157.157,146.0,ec2-34-231-157-157.compute-1.amazonaws.com
107.193.48.19,134.0,107-193-48-19.lightspeed.sndgca.sbcglobal.net
93.218.246.5,128.0,p5DDAF605.dip0.t-ipconnect.de
73.23.120.180,112.0,c-73-23-120-180.hsd1.fl.comcast.net
98.250.114.207,101.0,c-98-250-114-207.hsd1.mi.comcast.net
108.218.159.111,100.0,108-218-159-111.lightspeed.irvnca.sbcglobal.net
72.205.80.22,96.0,ip72-205-80-22.sb.sd.cox.net


In [30]:
len(list(ip_dict.keys()))

3088

In [31]:
list(ip_dict.keys())[:10]

['38.126.101.143',
 '174.196.202.168',
 '107.193.48.19',
 '75.67.167.199',
 '66.249.83.56',
 '100.37.154.209',
 '38.106.159.21',
 '130.50.200.1',
 '24.218.225.203',
 '66.249.83.48']

In [32]:
for i in list(ip_dict.keys()):
    if ip_dict[i][-1].date() == dt.datetime.now().date():
        if i in list(access_count.index):
            print(ip_dict[i][-1].date(),access_count.loc[i][0],'\t',access_count.loc[i].nslookup)

2020-04-14 5.0 	 
2020-04-14 6.0 	 c-75-67-167-199.hsd1.ma.comcast.net
2020-04-14 18.0 	 c-73-38-72-25.hsd1.ma.comcast.net
2020-04-14 11.0 	 c-73-49-152-8.hsd1.fl.comcast.net
2020-04-14 8.0 	 c-71-192-100-250.hsd1.ma.comcast.net
2020-04-14 10.0 	 c-73-227-60-125.hsd1.ma.comcast.net
2020-04-14 13.0 	 c-71-192-101-137.hsd1.ma.comcast.net
2020-04-14 5.0 	 70-59-18-109.hlrn.qwest.net
2020-04-14 17.0 	 c-75-67-137-71.hsd1.ma.comcast.net
2020-04-14 6.0 	 c-174-63-127-91.hsd1.ma.comcast.net
2020-04-14 5.0 	 mobile-166-137-102-099.mycingular.net
2020-04-14 6.0 	 c-75-67-160-137.hsd1.ma.comcast.net
2020-04-14 11.0 	 c-73-227-226-51.hsd1.ma.comcast.net
2020-04-14 13.0 	 c-67-169-169-137.hsd1.ca.comcast.net
2020-04-14 5.0 	 c-73-186-210-20.hsd1.ma.comcast.net
2020-04-14 7.0 	 c-71-192-100-75.hsd1.ma.comcast.net
2020-04-14 6.0 	 c-24-63-44-165.hsd1.ma.comcast.net
2020-04-14 12.0 	 c-75-67-160-210.hsd1.ma.comcast.net
2020-04-14 7.0 	 pool-98-109-33-131.nwrknj.fios.verizon.net
2020-04-14 5.0 	 c-73-

In [33]:
access_list=[]
for i in ip_dict:
    last_access=ip_dict[i][-1].date()
    if i in list(access_count.index):
        a = int(access_count.loc[i][0])
        who = access_count.loc[i].nslookup
        access_list.append((last_access,a,i,who))
access_list.sort(reverse=True)
for i in access_list:
    print('{} {:5d}  {:17s} {}'.format(*i))

2020-04-14    18  73.38.72.25       c-73-38-72-25.hsd1.ma.comcast.net
2020-04-14    17  75.67.137.71      c-75-67-137-71.hsd1.ma.comcast.net
2020-04-14    13  71.192.101.137    c-71-192-101-137.hsd1.ma.comcast.net
2020-04-14    13  67.169.169.137    c-67-169-169-137.hsd1.ca.comcast.net
2020-04-14    13  166.137.12.124    mobile-166-137-012-124.mycingular.net
2020-04-14    12  75.67.160.210     c-75-67-160-210.hsd1.ma.comcast.net
2020-04-14    11  73.49.152.8       c-73-49-152-8.hsd1.fl.comcast.net
2020-04-14    11  73.227.226.51     c-73-227-226-51.hsd1.ma.comcast.net
2020-04-14    10  73.227.60.125     c-73-227-60-125.hsd1.ma.comcast.net
2020-04-14    10  184.102.117.148   184-102-117-148.cltn.qwest.net
2020-04-14     9  24.63.44.205      c-24-63-44-205.hsd1.ma.comcast.net
2020-04-14     8  73.159.110.168    c-73-159-110-168.hsd1.ma.comcast.net
2020-04-14     8  71.192.100.250    c-71-192-100-250.hsd1.ma.comcast.net
2020-04-14     7  98.109.33.131     pool-98-109-33-131.nwrknj.fios.ve