In [229]:
import os
import pandas as pd
import numpy as np
import json
import pickle
from datetime import datetime as dt, timedelta

import matplotlib.pyplot as plt
import geopy.distance
import gmplot 
import time

from ipywidgets import interact
import ipywidgets as widgets

from IPython.display import IFrame

In [8]:
# Directory the notebook was started from; the data folder, the pickles and
# the generated HTML maps are all resolved relative to it.
BASE_PATH = os.path.abspath(os.curdir)

In [226]:
class test_person():
    '''
    Estimate a user's exposure to COVID-19 (or any other communicable disease) by
    comparing the user's GPS history with the GPS history of a person confirmed
    with the infection.

    Both histories are read from ``<BASE_PATH>/data/<filename>`` as JSON holding a
    top-level "locations" list whose entries carry "timestampMs", "latitudeE7" and
    "longitudeE7" (presumably a Google location-history export - confirm against
    the data you feed in). Only the last ``cutoff_days`` days are kept, reduced to
    at most one record per minute bucket per day.

    Keyword arguments (all optional; unknown names are reported and ignored):
        test_filename (str, 'Test.json'): JSON file of the person being tested.
        infected_filename (str, 'Infected.json'): JSON file of the infected person.
        plot (bool, False): stored on the instance; not used by any method here.
        pathMap_name (str, 'pathMap.HTML'): HTML file written by plotPath().
        cutoff_days (int, 14): number of days of history to keep, counted back from today.
        drop_columns (bool, True): drop the raw 'locations' and 'timestamp_ms' columns.
        pickle_dataset (bool, False): pickle each dataset after it has been parsed.
        load_test_pickle (bool, False): load the test dataset from its pickle.
        test_pickle_name (str, 'test.pickle'): pickle file of the test dataset.
        safe_distance (float, 20.0): distance in metres below which two records count as a contact.
        max_score (float, 100.0): daily score that maps to a probability of 1.
        load_infected_pickle (bool, False): load the infected dataset from its pickle.
        infected_pickle_name (str, 'infected.pickle'): pickle file of the infected dataset.
        resultMap_name (str, 'result.HTML'): HTML file written by plotResult().

    Attributes set by the constructor:
        infected_df, df: transformed infected / test datasets.
        result: a hint string until test() is called, afterwards a DataFrame with
            one row per contact minute.
    '''
    def __init__(self,**kwargs):

        print('Initializing Arguments...\n')
        default_args = {
                         "test_filename": 'Test.json',
                         "infected_filename": 'Infected.json',
                         "plot": False,
                         "pathMap_name": 'pathMap.HTML',
                         "cutoff_days": 14,
                         "drop_columns": True,
                         "pickle_dataset": False,
                         "load_test_pickle": False,
                         "test_pickle_name": 'test.pickle',
                         "safe_distance": 20.0,
                         "max_score": 100.0,
                         "load_infected_pickle": False,
                         "infected_pickle_name": 'infected.pickle',
                         "resultMap_name": 'result.HTML'
                        }

        # A misspelt flag (e.g. load_test_pikle=True) would otherwise silently fall
        # back to the default and trigger a slow re-parse of the raw JSON.
        unknown_args = sorted(set(kwargs) - set(default_args))
        if(unknown_args):
            print('Warning: ignoring unknown argument(s): {}'.format(', '.join(unknown_args)))

        for (arg,default) in default_args.items():
            setattr(self, arg, kwargs.get(arg,default))

        # need to read infected dataset as pickle not available
        if(not self.load_infected_pickle):
            tick = time.time()
            self.infected_df = self.loadData()
            tock = time.time()
            print('Time taken to read and transform Infected Dataset: {} seconds\n'.format((tock-tick)))
            # loadData() picks the dataset from this flag: once the infected
            # dataset is in memory every further call must read the test dataset.
            self.load_infected_pickle = True
        else:
            # loading infected pickled file
            print('Loading infected pickled file...')
            self.infected_df = self._load_pickle(self.infected_pickle_name)
            print('Completed\n')

        if(not self.load_test_pickle):
            tick = time.time()
            self.df = self.loadData()
            tock = time.time()
            print('Time taken to read and transform Test Dataset: {} seconds\n'.format((tock-tick)))
        else:
            # loading test pickled file
            print('Loading test pickled file...')
            self.df = self._load_pickle(self.test_pickle_name)
            print('Completed\n')

        self.result = 'Please call the test function to calculate the result!'

    def _load_pickle(self, pickle_name):
        '''Return the object stored in the pickle file <BASE_PATH>/<pickle_name>.'''
        # SECURITY: unpickling can execute arbitrary code - only load pickle
        # files that were written by this class on a machine you trust.
        with open(os.path.join(BASE_PATH,pickle_name), 'rb') as handle:
            return(pickle.load(handle))

    def _transform(self, data):
        '''
        Turn the raw frame returned by pd.read_json into the per-minute cutoff dataset.

        Adds 'timestamp_ms' (seconds since the epoch, despite the name), 'datetime'
        (local time, whole seconds), 'date', 'lat'/'lon' (decimal degrees),
        'timeMinutes' (minutes since local midnight) and 'timeBucket' (1-minute
        interval), keeps only rows from the last `cutoff_days` days and only the
        first row per ('date', 'timeBucket').
        '''
        # parsing timestamp (milliseconds in the file, seconds from here on)
        seconds = data['locations'].map(lambda x: x['timestampMs']).astype(float)/1000
        data = data.assign(timestamp_ms=seconds)

        # local wall-clock time truncated to whole seconds; pd.to_datetime keeps
        # the column datetime-typed even when the dataset is empty
        data['datetime'] = pd.to_datetime(data['timestamp_ms'].map(lambda x: dt.fromtimestamp(x).replace(microsecond=0)))
        data['date'] = data['datetime'].dt.date

        cutoff_date = (dt.today() - timedelta(days=self.cutoff_days)).date()
        cutoff_data = data[data['date'] >= cutoff_date].reset_index(drop = True)

        # parse lat/lon from the dict inside the locations column and convert
        # the E7 integers to decimal degrees (plain column assignment, so the
        # float result never has to be squeezed into an integer column)
        cutoff_data['lat'] = cutoff_data['locations'].map(lambda x: x['latitudeE7']).astype(float) / 10.**7
        cutoff_data['lon'] = cutoff_data['locations'].map(lambda x: x['longitudeE7']).astype(float) / 10.**7

        # convert time of day in minutes
        when = cutoff_data['datetime'].dt
        cutoff_data['timeMinutes'] = (when.hour*60)+(when.minute)+(when.second/60)

        # bucket time in 1 minute intervals and keep only one record per interval on a day
        # NOTE: the buckets are right-closed, so a record at exactly hh:mm:00 lands
        # in the previous minute and 00:00:00 gets no bucket (NaN). This is kept
        # unchanged on purpose: datasets pickled earlier are merged on these exact
        # intervals in test().
        cutoff_data['timeBucket'] = pd.cut(cutoff_data['timeMinutes'],bins=range(0,1441,1))
        cutoff_data = cutoff_data.drop_duplicates(['date','timeBucket'])

        return(cutoff_data)

    def loadData(self):
        '''
        Read one raw JSON history from <BASE_PATH>/data and return its cutoff dataset.

        Which file is read depends on `self.load_infected_pickle`: while it is
        False the infected dataset is read, otherwise the test dataset (the
        constructor flips the flag after the infected dataset has been loaded).
        When `pickle_dataset` is set the result is also pickled into BASE_PATH.
        '''
        reading_test = bool(self.load_infected_pickle)

        if(reading_test):
            # loading test dataset
            print('Reading Test Dataset. This may take a while depending on the size of the dataset!')
            filename, pickle_name, label = self.test_filename, self.test_pickle_name, 'Test'
        else:
            # loading infected dataset
            print('Reading Infected Dataset...\nThis may take a while depending on the size of the dataset!')
            filename, pickle_name, label = self.infected_filename, self.infected_pickle_name, 'Infected'

        # os.path.join with separate components instead of a hard-coded '\\'
        # so the data folder is found on every operating system
        file = os.path.join(BASE_PATH,'data',filename)
        data = pd.read_json(file)
        print('Read complete')

        # printing only when reading test dataset
        if(reading_test):
            print('There are total {:,} rows in your location history'.format(len(data)))

        cutoff_data = self._transform(data)

        if(self.drop_columns):
            print('Unwanted columns dropped.')
            cutoff_data = cutoff_data.drop(columns=['locations','timestamp_ms'])

        if(reading_test):
            print('There are {:,} rows in the cutoff dataset'.format(len(cutoff_data)))
            # min()/max() are undefined when nothing falls inside the cutoff window
            if(len(cutoff_data) > 0):
                print('Start Date: {}\nEnd Date: {}'.format(cutoff_data['date'].min().strftime('%d %b,%Y'),cutoff_data['date'].max().strftime('%d %b,%Y')))

        if(self.pickle_dataset):
            # written next to where _load_pickle() reads from, not into whatever
            # the current working directory happens to be
            with open(os.path.join(BASE_PATH,pickle_name), 'wb') as handle:
                pickle.dump(cutoff_data, handle)
            print('{} dataset pickled successfully.'.format(label))

        return(cutoff_data)

    def test(self):
        '''
        Compare the test history with the infected history and store the contacts.

        Records of both people that share a date and minute bucket are paired and
        their geodesic distance (metres) is computed. Pairs closer than
        `safe_distance` are contacts; each contact minute scores 1/(distance+0.1)
        and the daily score sum divided by `max_score` is reported as probability.
        `self.result` becomes a DataFrame with columns date, timeBucket, lat_test,
        lon_test, distance, score and time ('HH:MM'); it is empty when no contact
        was found. The summary is printed, nothing is returned.
        '''
        result_columns = ['date','timeBucket','lat_test','lon_test','distance','score','time']

        merged_df = self.infected_df.merge(self.df,how='inner',on=['date','timeBucket'],suffixes=('_infected','_test'))

        # built row by row from plain columns so that an empty merge simply
        # yields an empty distance column
        distances = [geopy.distance.distance((lat_i,lon_i),(lat_t,lon_t)).m
                     for (lat_i,lon_i,lat_t,lon_t) in zip(merged_df['lat_infected'],merged_df['lon_infected'],merged_df['lat_test'],merged_df['lon_test'])]
        merged_df['distance'] = pd.Series(distances,index=merged_df.index,dtype=float)

        contacts_df = merged_df[merged_df['distance'] < self.safe_distance]
        if(contacts_df.empty):
            self.result = pd.DataFrame(columns=result_columns)
            print('No contact with an infected person within {} metres was found in the last {} days.'.format(self.safe_distance,self.cutoff_days))
            print('Your infection probability: 0.0% ')
            return

        # observed=True: group only on the minute buckets that actually occur
        # instead of materialising all 1440 categories and dropping the empty ones
        dateTime_df = contacts_df.groupby(['date','timeBucket','lat_test','lon_test'],observed=True).aggregate({'distance':'mean'}).dropna().reset_index()
        dateTime_df['score'] = 1/(dateTime_df['distance']+0.1)

        result = dateTime_df.groupby(['date'])['score'].sum().reset_index()
        # NOTE: not capped - many close contact minutes on one day can push this above 1
        result['probability'] = result['score']/self.max_score

        self.result = result[['date']].merge(dateTime_df,how='left',on=['date'])
        # zero-padded so that e.g. 22:05 is not rendered as the ambiguous '22:5'
        self.result['time'] = self.result['timeBucket'].apply(lambda x: '{:02d}:{:02d}'.format(int(x.left)//60,int(x.left)%60))

        contact_dates = [day.strftime('%d %b,%Y') for day in result['date']]
        print('You may have come in contact with an infected person on the following dates: {}'.format(contact_dates))
        print('Your infection probability: {:.3}% '.format(result['probability'].sum()*100))


    def plotPath(self):
        '''
        Show an interactive map of the test user's path with a date dropdown.

        The map for the selected date is written to <BASE_PATH>/<pathMap_name>
        and displayed in an IFrame.
        '''
        dates = self.df['date'].unique().tolist()
        if(not dates):
            print('There is no location history to plot.')
            return

        @interact
        def plot(date=dates):

            # the direct call below passes the whole option list, the widget a single date
            selected_date = date[0] if isinstance(date,list) else date
            plotData = self.df[self.df['date'] == selected_date]

            center_lat = plotData['lat'].mean()
            center_lon = plotData['lon'].mean()
            gmap = gmplot.GoogleMapPlotter(center_lat,center_lon,zoom=12)

            # '#FF0000' (no blank after '#') is the valid hex colour
            gmap.scatter(plotData['lat'], plotData['lon'], '#FF0000',size = 40, marker = False)
            gmap.plot(plotData['lat'], plotData['lon'],'cornflowerblue', edge_width = 2.5)
            gmap.draw(os.path.join(BASE_PATH,self.pathMap_name))

            return(IFrame(src='./{}'.format(self.pathMap_name), width=800, height=400))

        plot()

    def plotResult(self):
        '''
        Show an interactive map of the contact points found by test().

        Date and time are chosen with dropdowns; the map is written to
        <BASE_PATH>/<resultMap_name> and displayed in an IFrame. Prints a hint
        and returns when test() has not been run or found no contact.
        '''
        if(not isinstance(self.result,pd.DataFrame)):
            # still the hint string set by the constructor
            print(self.result)
            return
        if(self.result.empty):
            print('There are no contact points to plot.')
            return

        @interact
        def plot(date = self.result['date'].unique().tolist(),time = self.result['time'].unique().tolist()):

            # the direct call below passes the option lists, the widgets single values
            if(isinstance(date,list)):
                selected_date, selected_time = date[0], time[0]
            else:
                selected_date, selected_time = date, time

            # one combined mask instead of chained boolean indexing
            selected = (self.result['date'] == selected_date) & (self.result['time'] == selected_time)
            plotData = self.result[selected]

            # centre on the selected contact; at zoom 16 the mean of all contacts
            # can lie off-screen. Fall back to it when the combination has no rows.
            center_source = plotData if(not plotData.empty) else self.result
            center_lat = center_source['lat_test'].mean()
            center_lon = center_source['lon_test'].mean()

            gmap = gmplot.GoogleMapPlotter(center_lat,center_lon,zoom=16)

            gmap.scatter(plotData['lat_test'], plotData['lon_test'],'red',size = 10, marker = False)
            gmap.draw(os.path.join(BASE_PATH,self.resultMap_name))

            return(IFrame(src='./{}'.format(self.resultMap_name), width=800, height=400))

        plot()

In [19]:
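# First run only (slow): parse the raw JSON exports and cache both datasets as pickles.
# Kept commented out so that re-running the notebook uses the cached pickles below.
# vb = test_person(pickle_dataset=True)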

Initializing Arguments...
Reading Infected Dataset...
This may take a while depending on the size of the dataset!
Read complete.
Time taken to read Infected Dataset: 380.2379813194275 seconds
Reading Test Dataset. This may take a while depending on the size of the dataset!
Read complete.
There are total 764,848 rows in your location history
There are 296 rows in the cutoff dataset
Start Date: 07 Mar,2020
End Date: 20 Mar,2020
Time taken to read Test Dataset: 63.825989723205566 seconds


In [227]:
# Build the tester from the previously pickled datasets instead of re-parsing the raw JSON.
vb = test_person(load_test_pickle = True,load_infected_pickle=True)

Initializing Arguments...

Loading infected pickled file...
Completed

Loading test pickled file...
Completed



In [228]:
# Interactive map of the test user's path, one date at a time.
vb.plotPath()

interactive(children=(Dropdown(description='date', options=(datetime.date(2020, 3, 7), datetime.date(2020, 3, …

In [217]:
# Compare both histories; prints the contact dates and the estimated probability.
vb.test()

You may have come in contact with an infected person on the following dates: ['07 Mar,2020']
Your infection probability: 0.185% 


In [218]:
# Contact records stored by test() (one row per date / minute bucket / test position).
vb.result

Unnamed: 0,date,timeBucket,lat_test,lon_test,distance,score,time
0,2020-03-07,"(1352, 1353]",28.454372,77.049336,8.244519,0.119839,22:32
1,2020-03-07,"(1409, 1410]",28.45458,77.04929,15.321866,0.064843,23:29


In [219]:
# Interactive map of the contact points stored in vb.result.
vb.plotResult()

interactive(children=(Dropdown(description='date', options=(datetime.date(2020, 3, 7),), value=datetime.date(2…