# Packages used in the analysis

In [1]:
# Data reading
import pandas as pd 

# Directory manipulation
import os

# Date wrangling
import datetime 

# Newest data download

In [2]:
# Downloading the newest data by running the project's dataDownload.py script in a shell.
# NOTE(review): judging by the script's log it saves into a dated folder under data/ — confirm
!python3 dataDownload.py

Read municipality data in 0.62 seconds
Rows read: 13613
Read patient data in 0.89 seconds
Rows read: 31872
Data saved in data/2020-11-13


# Reading the newest data 

In [3]:
# Listing the entries of the data folder and keeping only the sub-directories whose
# name is a valid YYYY-MM-DD date. Anything else that happens to live there
# (.DS_Store, .ipynb_checkpoints, stray files, ...) is skipped instead of crashing strptime
dataFolders = []
for folderName in os.listdir('data'):
    if not os.path.isdir(os.path.join('data', folderName)):
        continue
    try:
        dataFolders.append(datetime.datetime.strptime(folderName, '%Y-%m-%d'))
    except ValueError:
        # Not a date-named folder, so it cannot be a data snapshot
        continue

# Failing early with a readable message when there is no snapshot to load
if not dataFolders:
    raise FileNotFoundError("No YYYY-MM-DD dated folders found in 'data'")

# Sorting as datetimes (not as raw strings), so the newest date is
# guaranteed to be the last element of the list
dataFolders.sort()

# Extracting the newest date, formatted back into the folder name
newestDate = dataFolders[-1].strftime('%Y-%m-%d')

# Reading the newest files
dMunicipality = pd.read_csv(f'data/{newestDate}/municipality_data.csv')
dPatient = pd.read_csv(f'data/{newestDate}/patient_data.csv')

# Data cleaning and engineering 

## Municipality data 

In [4]:
# Dimensions of the municipality table as (rows, columns)
print(dMunicipality.shape)

(13613, 7)


In [5]:
# Previewing the first rows of the municipality table (head() defaults to 5 rows)
print(dMunicipality.head())

          day administrative_level_3  tests_negative  tests_positive  \
0  2020-03-19             Alytaus m.               2               0   
1  2020-03-19              Elektrėnų               1               0   
2  2020-03-19               Kauno m.              20               0   
3  2020-03-19               Kauno r.               1               0   
4  2020-03-19           Panevėžio r.               1               0   

   tests_positive_repeated  tests_positive_new  tests_total  
0                        0                   0            2  
1                        0                   0            1  
2                        0                   0           20  
3                        0                   0            1  
4                        0                   0            1  


In [6]:
# Column types as inferred by read_csv (no dtypes were passed when reading the file)
print(dMunicipality.dtypes)

day                        object
administrative_level_3     object
tests_negative              int64
tests_positive              int64
tests_positive_repeated     int64
tests_positive_new          int64
tests_total                 int64
dtype: object


In [7]:
# Converting the day column to date format.
# pd.to_datetime parses the whole column in one vectorized call instead of a per-row
# Python loop; the explicit format keeps the YYYY-MM-DD parsing, and .dt.date leaves the
# column holding plain datetime.date objects, exactly as before
dMunicipality['day'] = pd.to_datetime(dMunicipality['day'], format='%Y-%m-%d').dt.date

In [8]:
# Ordering the rows chronologically, then discarding the shuffled index
# in favour of a fresh default one
dMunicipality.sort_values(by='day', inplace=True)
dMunicipality.reset_index(drop=True, inplace=True)

# Getting the distinct days present in the data (set() removes the repeats)
uniqueDates = [*set(dMunicipality['day'])]

In [9]:
# Summarising the date coverage: number of distinct days, plus the earliest and latest day
print(f'Number of unique days in the dataset: {len(uniqueDates)}')
print(f'First day in data: {min(uniqueDates)}')
print(f'Newest day in data: {max(uniqueDates)}')

Number of unique days in the dataset: 237
First day in data: 2020-03-19
Newest day in data: 2020-11-10


In [10]:
# Getting the distinct municipality names (set() removes the repeats)
uniqueMun = [*set(dMunicipality['administrative_level_3'])]

In [11]:
# Reporting how many distinct municipalities appear in the data
print(f'Number of unique municipalities in the dataset: {len(uniqueMun)}')

Number of unique municipalities in the dataset: 60
