# The Big Data Challenge

# Import packages

In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
import requests


# Stringency data by country

In [31]:
# Load the OxCGRT stringency CSV, keeping only the columns used in this notebook
file = "Big Data Challenge_ Data and Challenge Statements/Data/Stringency data/OxCGRT_latest_hackathon.csv"
stringency = pd.read_csv(
    file,
    usecols=['CountryName', 'Date', 'StringencyIndex', 'ConfirmedCases', 'ConfirmedDeaths'],
    parse_dates=['Date'],  # parse the Date column as datetime on load
)
# Preview the first rows to confirm the load worked
print(stringency.head())

  CountryName       Date  ConfirmedCases  ConfirmedDeaths  StringencyIndex
0       Aruba 2020-01-01             NaN              NaN              0.0
1       Aruba 2020-01-02             NaN              NaN              0.0
2       Aruba 2020-01-03             NaN              NaN              0.0
3       Aruba 2020-01-04             NaN              NaN              0.0
4       Aruba 2020-01-05             NaN              NaN              0.0


In [32]:
# Count the missing values in each stringency column
print(stringency.isnull().sum())

CountryName            0
Date                   0
ConfirmedCases     10426
ConfirmedDeaths    10426
StringencyIndex     1382
dtype: int64


## Reformat data

In [33]:
# Split the Date column into Year, Month (as a month name) and day-of-month columns
stringency['Year'] = stringency['Date'].dt.year
stringency['Month'] = stringency['Date'].dt.month_name()
stringency['day'] = stringency['Date'].dt.day
print(stringency.head())


  CountryName       Date  ConfirmedCases  ConfirmedDeaths  StringencyIndex  \
0       Aruba 2020-01-01             NaN              NaN              0.0   
1       Aruba 2020-01-02             NaN              NaN              0.0   
2       Aruba 2020-01-03             NaN              NaN              0.0   
3       Aruba 2020-01-04             NaN              NaN              0.0   
4       Aruba 2020-01-05             NaN              NaN              0.0   

   Year    Month  day  
0  2020  January    1  
1  2020  January    2  
2  2020  January    3  
3  2020  January    4  
4  2020  January    5  


# Electricity data for the UK

In [34]:
# Load the monthly electricity figures from the "Month" sheet of the workbook
file3 = "Big Data Challenge_ Data and Challenge Statements/Data/Electricity data/ET_5.5_AUG_20.xls"

electricity = pd.read_excel(
    file3,
    sheet_name='Month',
    header=[7],                                  # column names are taken from row index 7
    skiprows=list(range(8, 273)),                # skip row indices 8..272 that follow the header
    usecols=['YEAR', 'MONTH', 'electricity '],   # note: the source column name has a trailing space
)
print(electricity)

                       YEAR      MONTH  electricity 
0                      2017    January     29.860327
1                      2017   February     26.207024
2                      2017      March     26.369292
3                      2017      April     24.043366
4                      2017        May     24.157889
5                      2017       June     22.589158
6                      2017       July     23.254473
7                      2017     August     23.576351
8                      2017  September     23.683497
9                      2017    October     25.338122
10                     2017   November     27.786430
11                     2017   December     29.302651
12                     2018    January     28.582218
13                     2018   February     27.061782
14                     2018      March     29.225837
15                     2018      April     25.640237
16                     2018        May     23.697955
17                     2018       June     22.

In [35]:
# Give the electricity columns descriptive names that match the stringency frame
electricity = electricity.rename(
    columns={
        'YEAR': 'Year',
        'MONTH': 'Month',
        'electricity ': 'Total consumption of electricity used (all providers) TWh',
    }
)
print(electricity.head())

   Year     Month  Total consumption of electricity used (all providers) TWh
0  2017   January                                          29.860327        
1  2017  February                                          26.207024        
2  2017     March                                          26.369292        
3  2017     April                                          24.043366        
4  2017       May                                          24.157889        


In [36]:
# Keep only the rows whose Month value is present (drops rows with NaN in Month)
electricity = electricity.loc[electricity['Month'].notna()]

In [37]:
# Count the missing values in each electricity column
print(electricity.isnull().sum())

Year                                                         0
Month                                                        0
Total consumption of electricity used (all providers) TWh    0
dtype: int64


In [38]:
## Build a Date column so the monthly electricity data can be merged on date.
## NOTE THIS IS NOT A REAL DATE: the 15th is only a mid-month placeholder; only MONTH AND YEAR are correct
electricity['day'] = '15'
# Join year, month and placeholder day into a single "year-month-day" string,
# then convert that string column to datetime
electricity['Date'] = (
    electricity['Year'].astype(str).str.zfill(4)
    + "-"
    + electricity['Month'].astype(str).str.zfill(2)
    + "-"
    + electricity['day'].astype(str).str.zfill(2)
).astype('datetime64[ns]')

print(electricity['Date'])


0    2017-01-15
1    2017-02-15
2    2017-03-15
3    2017-04-15
4    2017-05-15
5    2017-06-15
6    2017-07-15
7    2017-08-15
8    2017-09-15
9    2017-10-15
10   2017-11-15
11   2017-12-15
12   2018-01-15
13   2018-02-15
14   2018-03-15
15   2018-04-15
16   2018-05-15
17   2018-06-15
18   2018-07-15
19   2018-08-15
20   2018-09-15
21   2018-10-15
22   2018-11-15
23   2018-12-15
24   2019-01-15
25   2019-02-15
26   2019-03-15
27   2019-04-15
28   2019-05-15
29   2019-06-15
30   2019-07-15
31   2019-08-15
32   2019-09-15
33   2019-10-15
34   2019-11-15
35   2019-12-15
36   2020-01-15
37   2020-02-15
38   2020-03-15
39   2020-04-15
40   2020-05-15
41   2020-06-15
Name: Date, dtype: datetime64[ns]


In [39]:
# Summarise the stringency frame: column dtypes, non-null counts and memory usage
stringency.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42920 entries, 0 to 42919
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   CountryName      42920 non-null  object        
 1   Date             42920 non-null  datetime64[ns]
 2   ConfirmedCases   32494 non-null  float64       
 3   ConfirmedDeaths  32494 non-null  float64       
 4   StringencyIndex  41538 non-null  float64       
 5   Year             42920 non-null  int64         
 6   Month            42920 non-null  object        
 7   day              42920 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(2), object(2)
memory usage: 2.6+ MB


In [40]:
# Show (rows, columns) for both frames before merging, one per line
print(electricity.shape, stringency.shape, sep="\n")

(42, 5)
(42920, 8)


##  Merge country data 

In [41]:
# Store the stringency day-of-month as text, like the electricity 'day' column
stringency['day'] = stringency['day'].astype(str)

# Outer-join the two frames on Date so that rows present in only one of them are kept;
# columns that share a name receive the default _x / _y suffixes
country_data = pd.merge(electricity, stringency, on='Date', how='outer')
print(country_data)
# Save the merged frame alongside the input data
country_data.to_csv("Big Data Challenge_ Data and Challenge Statements/Data/country_data.csv")

      Year_x   Month_x  \
0       2017   January   
1       2017  February   
2       2017     March   
3       2017     April   
4       2017       May   
...      ...       ...   
42951    NaN       NaN   
42952    NaN       NaN   
42953    NaN       NaN   
42954    NaN       NaN   
42955    NaN       NaN   

       Total consumption of electricity used (all providers) TWh day_x  \
0                                              29.860327            15   
1                                              26.207024            15   
2                                              26.369292            15   
3                                              24.043366            15   
4                                              24.157889            15   
...                                                  ...           ...   
42951                                                NaN           NaN   
42952                                                NaN           NaN   
42953                

# Road traffic data by city

In [42]:
# Read in traffic data from ONS
file2 = "Traffic_ONS/10septembertrafficcamerasdataset.xlsx"
traffic = pd.read_excel(
    file2,
    sheet_name='Seasonally adjusted',
    index_col=0,       # first column becomes the row index
    header=[0, 1],     # two header rows -> two-level column index
    skiprows=[2, 3],
)
# Preview the first rows to confirm the load worked
print(traffic.head())

Daily counts London                                                      \
               Cars Motorbikes Buses Trucks Vans Pedestrians & cyclists   
2020-03-01        *          *     *      *    *                      *   
2020-03-02        *          *     *      *    *                      *   
2020-03-03        *          *     *      *    *                      *   
2020-03-04        *          *     *      *    *                      *   
2020-03-05        *          *     *      *    *                      *   

Daily counts North East                          ... Greater Manchester  \
                   Cars Motorbikes Buses Trucks  ...              Buses   
2020-03-01        55378         18  2249   1130  ...                  *   
2020-03-02        54773         24  2279   1159  ...                  *   
2020-03-03        56045         29  2283   1168  ...                  *   
2020-03-04        53209         20  2262   1150  ...                  *   
2020-03-05        52897 

## Handle missing data

In [None]:
# Replace the asterisk placeholder with NaN (a real missing value) instead of an
# empty string: an empty string would leave text in the count columns, so they
# could not be treated as numbers when aggregating or plotting later.
traffic = traffic.replace('*', np.nan)
# Convert every column to a numeric dtype now that the placeholder is gone.
# pd.to_numeric raises if any other non-numeric text remains, which surfaces
# unexpected markers instead of silently keeping them.
traffic = traffic.apply(pd.to_numeric)
print(traffic)

## Reformat data

In [None]:
# Reshape from wide (one column per city / transport-mode pair) to long format,
# keeping the original row index
long_traffic = traffic.melt(
    var_name=['City', 'Transport mode'],
    value_name='Count',
    ignore_index=False,
)
print(long_traffic)

# Move the index into a regular column and name that column "Date"
long_traffic = long_traffic.reset_index().rename(columns={"index": "Date"})
print(long_traffic)

In [None]:
# Derive numeric Year and Month columns from the Date column
long_traffic.loc[:, 'Year'] = long_traffic['Date'].dt.year
long_traffic.loc[:, 'Month'] = long_traffic['Date'].dt.month
print(long_traffic.head())


# Cast both derived columns to int, then report the resulting column dtypes
long_traffic.loc[:, 'Year'] = long_traffic['Year'].astype(int)
long_traffic.loc[:, 'Month'] = long_traffic['Month'].astype(int)
print(long_traffic.info())

In [None]:
## Check it can plot - just for de-bugging
# `aspect` is a figure-level option: the axes-level sns.lineplot does not accept it
# and would forward it to matplotlib, which rejects it. The figure-level
# sns.relplot(kind='line') draws the same line plot and supports `aspect`.
sns.relplot(
    data=long_traffic,
    x='Month',
    y='Count',
    kind='line',
    aspect=3,   # make the figure three times as wide as it is tall
)

# Air quality

In [47]:
# Load the air quality readings CSV
file_air = "Big Data Challenge_ Data and Challenge Statements/Data/Air Quality /LaqnData.csv"

ND_Lewisham = pd.read_csv(file_air)
# Print the loaded frame (pandas truncates the middle rows of a long frame)
print(ND_Lewisham)

     Site Species   ReadingDateTime  Value   Units Provisional or Ratified
0     LW1     NO2  01/01/2017 00:00   31.8  ug m-3                       R
1     LW1     NO2  02/01/2017 00:00   40.4  ug m-3                       R
2     LW1     NO2  03/01/2017 00:00   65.8  ug m-3                       R
3     LW1     NO2  04/01/2017 00:00   45.8  ug m-3                       R
4     LW1     NO2  05/01/2017 00:00  107.8  ug m-3                       R
...   ...     ...               ...    ...     ...                     ...
5470  HP1     NO2  27/12/2019 00:00   33.7  ug m-3                       P
5471  HP1     NO2  28/12/2019 00:00   28.0  ug m-3                       P
5472  HP1     NO2  29/12/2019 00:00   25.8  ug m-3                       P
5473  HP1     NO2  30/12/2019 00:00   40.4  ug m-3                       P
5474  HP1     NO2  31/12/2019 00:00   41.5  ug m-3                       P

[5475 rows x 6 columns]
