In [1]:
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()

'C:\\Users\\stoic\\Desktop\\spark-3.2.0-bin-hadoop3.2'

In [2]:
#importing all the required libraries
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import FloatType, StringType
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from datetime import datetime
from geopy.geocoders import Nominatim
import pycountry_convert as pc
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.regression import LinearRegression
import numpy as np
import pandas as pd
import time

spark = SparkSession.builder.appName("query2").getOrCreate()

# Preaparing the Dataset
Before applying the linear regression on monthly increase in the no of cases we need to prepare the dataset accordingly. Following steps have been taken for adtaset preparation:

1. Conversion of the dates into single format of mm/dd/yyyy and renaming all the columns of the dataframe using datetime library and 'col' function from pyspark.
2. Creating a new Column 'States_New' Column which contains the state or otherwise country.
3. Calculating the daily increasse in cases by subtaracting the cummulative cases of consecutive dates by transposing the dataframe in pandas and creating index against each date to solve the time series analysis.

In [3]:
start_time = time.time()
df = spark.read.csv('time_series_covid19_confirmed_global.csv',header = True)
df.columns

['Province/State',
 'Country/Region',
 'Lat',
 'Long',
 '1/22/20',
 '1/23/20',
 '1/24/20',
 '1/25/20',
 '1/26/20',
 '1/27/20',
 '1/28/20',
 '1/29/20',
 '1/30/20',
 '1/31/20',
 '02/01/2020',
 '02/02/2020',
 '02/03/2020',
 '02/04/2020',
 '02/05/2020',
 '02/06/2020',
 '02/07/2020',
 '02/08/2020',
 '02/09/2020',
 '02/10/2020',
 '02/11/2020',
 '02/12/2020',
 '2/13/20',
 '2/14/20',
 '2/15/20',
 '2/16/20',
 '2/17/20',
 '2/18/20',
 '2/19/20',
 '2/20/20',
 '2/21/20',
 '2/22/20',
 '2/23/20',
 '2/24/20',
 '2/25/20',
 '2/26/20',
 '2/27/20',
 '2/28/20',
 '2/29/20',
 '03/01/2020',
 '03/02/2020',
 '03/03/2020',
 '03/04/2020',
 '03/05/2020',
 '03/06/2020',
 '03/07/2020',
 '03/08/2020',
 '03/09/2020',
 '03/10/2020',
 '03/11/2020',
 '03/12/2020',
 '3/13/20',
 '3/14/20',
 '3/15/20',
 '3/16/20',
 '3/17/20',
 '3/18/20',
 '3/19/20',
 '3/20/20',
 '3/21/20',
 '3/22/20',
 '3/23/20',
 '3/24/20',
 '3/25/20',
 '3/26/20',
 '3/27/20',
 '3/28/20',
 '3/29/20',
 '3/30/20',
 '3/31/20',
 '04/01/2020',
 '04/02/2020',
 '0

In [4]:
#converting the all the date columns into a common format dd/mm/yyyy

date_list = [_ for _ in df.columns if _ not in ['Province/State','Country/Region','Lat','Long']]
date_column_names = []
for dates in date_list:
    try:
        if datetime.strptime(dates, '%m/%d/%y'):
            dt = datetime.strptime(dates, '%m/%d/%y')
            dts = dt.strftime('%m/%d/%Y')
            date_column_names.append(dts)
    except:
        date_column_names.append(dates)
column_names = dict(zip(date_list,date_column_names))
column_names['Province/State']='State'
column_names['Country/Region']='Country'
column_names['Lat']='Lat'
column_names['Long']='Long'

df = df.select([F.col(c).alias(column_names.get(c, c)) for c in df.columns])
df.columns

['State',
 'Country',
 'Lat',
 'Long',
 '01/22/2020',
 '01/23/2020',
 '01/24/2020',
 '01/25/2020',
 '01/26/2020',
 '01/27/2020',
 '01/28/2020',
 '01/29/2020',
 '01/30/2020',
 '01/31/2020',
 '02/01/2020',
 '02/02/2020',
 '02/03/2020',
 '02/04/2020',
 '02/05/2020',
 '02/06/2020',
 '02/07/2020',
 '02/08/2020',
 '02/09/2020',
 '02/10/2020',
 '02/11/2020',
 '02/12/2020',
 '02/13/2020',
 '02/14/2020',
 '02/15/2020',
 '02/16/2020',
 '02/17/2020',
 '02/18/2020',
 '02/19/2020',
 '02/20/2020',
 '02/21/2020',
 '02/22/2020',
 '02/23/2020',
 '02/24/2020',
 '02/25/2020',
 '02/26/2020',
 '02/27/2020',
 '02/28/2020',
 '02/29/2020',
 '03/01/2020',
 '03/02/2020',
 '03/03/2020',
 '03/04/2020',
 '03/05/2020',
 '03/06/2020',
 '03/07/2020',
 '03/08/2020',
 '03/09/2020',
 '03/10/2020',
 '03/11/2020',
 '03/12/2020',
 '03/13/2020',
 '03/14/2020',
 '03/15/2020',
 '03/16/2020',
 '03/17/2020',
 '03/18/2020',
 '03/19/2020',
 '03/20/2020',
 '03/21/2020',
 '03/22/2020',
 '03/23/2020',
 '03/24/2020',
 '03/25/2020',
 

In [5]:
df=df.withColumn('States_New',when(df.State.isNull(),df.Country).otherwise(df.State))
df=df.dropDuplicates(['States_New'])

In [6]:
#typecasting the date columns into integer type
date_list = [_ for _ in df.columns if _ not in ['State','Country','Lat','Long','States_New']]
for column in date_list:    
    df = df.withColumn(column,df[column].cast('Integer'))
    
df.printSchema()

root
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Lat: string (nullable = true)
 |-- Long: string (nullable = true)
 |-- 01/22/2020: integer (nullable = true)
 |-- 01/23/2020: integer (nullable = true)
 |-- 01/24/2020: integer (nullable = true)
 |-- 01/25/2020: integer (nullable = true)
 |-- 01/26/2020: integer (nullable = true)
 |-- 01/27/2020: integer (nullable = true)
 |-- 01/28/2020: integer (nullable = true)
 |-- 01/29/2020: integer (nullable = true)
 |-- 01/30/2020: integer (nullable = true)
 |-- 01/31/2020: integer (nullable = true)
 |-- 02/01/2020: integer (nullable = true)
 |-- 02/02/2020: integer (nullable = true)
 |-- 02/03/2020: integer (nullable = true)
 |-- 02/04/2020: integer (nullable = true)
 |-- 02/05/2020: integer (nullable = true)
 |-- 02/06/2020: integer (nullable = true)
 |-- 02/07/2020: integer (nullable = true)
 |-- 02/08/2020: integer (nullable = true)
 |-- 02/09/2020: integer (nullable = true)
 |-- 02/10/2020: integer (nulla

In [7]:
#Calculating daily increase in the number of cases and appending them in the same column and transposing the dataset
df_pd = df.toPandas()
df_pd0 = df_pd.iloc[:,-1]
df_pd1 = df_pd.iloc[:,:5]
df_pd2 = df_pd.iloc[:,4:-1].diff(axis=1)
df_pd2=df_pd2.drop(columns=['01/22/2020'])
df_concat = pd.concat([df_pd0,df_pd1,df_pd2],axis =1, sort=False)
psdf = df_concat.transpose()
psdf=psdf.reset_index()
header_row = 0
psdf.columns = psdf.iloc[header_row]
psdf = psdf.drop(header_row)
psdf=psdf.drop([1,2,3,4])
psdf=psdf.reset_index()
psdf.rename(columns={'index': 'Date_index'}, inplace=True)
psdf.rename(columns = {'States_New':'Dates'}, inplace = True)
psdf['Date_index'] = psdf.index
for i in psdf.columns[2:]:
    psdf=psdf.astype({i: 'int64'})
psdf=psdf.astype({'Dates': 'string'})
display(psdf)

  df[column_name] = series
  """


Unnamed: 0,Date_index,Dates,Afghanistan,Albania,Alberta,Algeria,Andorra,Angola,Anguilla,Anhui,...,Wallis and Futuna,West Bank and Gaza,Western Australia,Xinjiang,Yemen,Yukon,Yunnan,Zambia,Zhejiang,Zimbabwe
0,0,01/22/2020,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,10,0
1,1,01/23/2020,0,0,0,0,0,0,0,8,...,0,0,0,2,0,0,1,0,17,0
2,2,01/24/2020,0,0,0,0,0,0,0,6,...,0,0,0,0,0,0,3,0,16,0
3,3,01/25/2020,0,0,0,0,0,0,0,24,...,0,0,0,1,0,0,6,0,19,0
4,4,01/26/2020,0,0,0,0,0,0,0,21,...,0,0,0,1,0,0,5,0,42,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,662,11/14/2021,42,475,0,97,0,14,0,0,...,0,0,0,0,0,0,5,12,0,35
663,663,11/15/2021,58,194,1068,134,88,0,0,0,...,0,522,1,0,18,97,6,13,1,10
664,664,11/16/2021,39,587,321,141,22,27,38,0,...,0,236,3,0,0,0,4,12,0,67
665,665,11/17/2021,90,616,412,135,43,28,0,0,...,0,87,0,0,11,44,2,12,0,52


In [8]:
df_sp= spark.createDataFrame(psdf)
df_sp.printSchema()

root
 |-- Date_index: long (nullable = true)
 |-- Dates: string (nullable = true)
 |-- Afghanistan: long (nullable = true)
 |-- Albania: long (nullable = true)
 |-- Alberta: long (nullable = true)
 |-- Algeria: long (nullable = true)
 |-- Andorra: long (nullable = true)
 |-- Angola: long (nullable = true)
 |-- Anguilla: long (nullable = true)
 |-- Anhui: long (nullable = true)
 |-- Antigua and Barbuda: long (nullable = true)
 |-- Argentina: long (nullable = true)
 |-- Armenia: long (nullable = true)
 |-- Aruba: long (nullable = true)
 |-- Australian Capital Territory: long (nullable = true)
 |-- Austria: long (nullable = true)
 |-- Azerbaijan: long (nullable = true)
 |-- Bahamas: long (nullable = true)
 |-- Bahrain: long (nullable = true)
 |-- Bangladesh: long (nullable = true)
 |-- Barbados: long (nullable = true)
 |-- Beijing: long (nullable = true)
 |-- Belarus: long (nullable = true)
 |-- Belgium: long (nullable = true)
 |-- Belize: long (nullable = true)
 |-- Benin: long (nullable

# Calculating Daily Trendline Coefficients
The authors have used pyspark MLlib for calculating the linaer regression for all the states.

1. Conversion of Date_index into dense vectors with the help of udf implementation in spark.
2. Calculating the trendline coefficient for each state column as target column and and Date_index as predictor column using Pyspark Linear regression library.
3. Selecting the top 100 states and saving the results into csv file.

In [9]:
#Creation of UDF to convert column into dense vector
to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())
data = df_sp.withColumn("Feature", F.array("Date_index")).withColumn("Feature", to_vector("Feature"))
data.select(['Feature','Date_index']).show()

+-------+----------+
|Feature|Date_index|
+-------+----------+
|  [0.0]|         0|
|  [1.0]|         1|
|  [2.0]|         2|
|  [3.0]|         3|
|  [4.0]|         4|
|  [5.0]|         5|
|  [6.0]|         6|
|  [7.0]|         7|
|  [8.0]|         8|
|  [9.0]|         9|
| [10.0]|        10|
| [11.0]|        11|
| [12.0]|        12|
| [13.0]|        13|
| [14.0]|        14|
| [15.0]|        15|
| [16.0]|        16|
| [17.0]|        17|
| [18.0]|        18|
| [19.0]|        19|
+-------+----------+
only showing top 20 rows



In [10]:
#function for calulation of linear regression coefficients using Spark ML-Lib
def calculate_trendline_coefficient(data,target_column):
    
    """
    Description: This function takes a pyspark dataframe object and calculates the linear regression coefficient
    on the target column. 

    Args:
        data (pyspark dataframe object): The dataset which needs to be predictor('feature') and target column.
        target_column (string): The target column for the linear regression coefficient needs to be calculated.

    Returns:
        Vector: Returns a dense vector.
    """
    lin_reg = LinearRegression(featuresCol='Feature',labelCol=target_column, maxIter=10, regParam=0.3, elasticNetParam=0.8)
    lr_model = lin_reg.fit(data)
    return lr_model.coefficients
        
    

In [11]:
#calulation of trendline coefficient for each state
item_names = df.select(F.col('States_New')).collect()
state_list=df.select(['States_New']).toPandas()['States_New'].to_list()
coefficient_list = []
count =1
for state in state_list:
    print(count)
    coefficient_list.append([state, calculate_trendline_coefficient(data,state)])
    count+=1
coefficient_list 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


[['Chad', DenseVector([-0.0005])],
 ['Manitoba', DenseVector([0.2258])],
 ['Paraguay', DenseVector([1.591])],
 ['Russia', DenseVector([37.7305])],
 ['Anguilla', DenseVector([0.0118])],
 ['Yemen', DenseVector([0.0378])],
 ['Guangdong', DenseVector([-0.0116])],
 ['Senegal', DenseVector([0.2374])],
 ['Cabo Verde', DenseVector([0.111])],
 ['Sweden', DenseVector([2.4228])],
 ['Kiribati', DenseVector([0.0])],
 ['Hunan', DenseVector([-0.0104])],
 ['Shanxi', DenseVector([-0.001])],
 ['Guyana', DenseVector([0.2312])],
 ['Burma', DenseVector([3.4261])],
 ['Tibet', DenseVector([0.0])],
 ['Eritrea', DenseVector([0.0292])],
 ['Philippines', DenseVector([17.456])],
 ['Djibouti', DenseVector([0.0])],
 ['Tonga', DenseVector([0.0])],
 ['Malaysia', DenseVector([21.4787])],
 ['Singapore', DenseVector([1.7529])],
 ['Hubei', DenseVector([-0.8607])],
 ['Fiji', DenseVector([0.4709])],
 ['Turkey', DenseVector([44.5766])],
 ['Yukon', DenseVector([0.0118])],
 ['Tianjin', DenseVector([0.0])],
 ['Malawi', DenseVe

In [12]:
#Creating a pyspark dataframe which contains the respective coefficients
coeff_list = [x[1][0] for x in coefficient_list ]
country_list = [x[0] for x in coefficient_list]
df_pd = pd.DataFrame(list(zip(country_list, coeff_list)),columns =['States_New', 'Coefficient'])
df_pd = spark.createDataFrame(df_pd)

In [13]:
#Creating a new dataframe by joining and selecting only top 100 states
df_join=df_pd.join(df,['States_New']).orderBy(desc("Coefficient")).limit(100)
display(df_join.toPandas())

  df[column_name] = series


Unnamed: 0,States_New,Coefficient,State,Country,Lat,Long,01/22/2020,01/23/2020,01/24/2020,01/25/2020,...,11/09/2021,11/10/2021,11/11/2021,11/12/2021,11/13/2021,11/14/2021,11/15/2021,11/16/2021,11/17/2021,11/18/2021
0,US,123.238569,,US,40,-100,1,1,2,2,...,46716709,46811405,46866719,47012970,47053803,47083964,47222751,47310337,47421741,47531319
1,India,109.639824,,India,20.593684,78.96288,0,0,0,0,...,34388579,34401670,34414186,34426036,34437307,34447536,34456401,34466598,34478517,34489623
2,United Kingdom,52.895127,,United Kingdom,55.3781,-3.436,0,0,0,0,...,9366676,9406001,9448402,9487302,9524971,9561099,9600369,9637190,9675058,9721916
3,Brazil,48.584339,,Brazil,-14.235,-51.9253,0,0,0,0,...,21897025,21909298,21924598,21939196,21953838,21957967,21960766,21965684,21977661,21989962
4,Turkey,44.576636,,Turkey,38.9637,35.2433,0,0,0,0,...,8290135,8317394,8342292,8365929,8388512,8410136,8433988,8459089,8482956,8505190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Laos,0.671415,,Laos,19.85627,102.495496,0,0,0,0,...,48892,50031,50977,52175,53207,54192,55018,56324,57397,58798
96,North Macedonia,0.666628,,North Macedonia,41.6086,21.7453,0,0,0,0,...,206910,207566,207998,208467,208992,209277,209449,209971,210447,210938
97,Dominican Republic,0.658873,,Dominican Republic,18.7357,-70.1627,0,0,0,0,...,390058,391104,392223,393383,394608,395856,397016,398018,398880,399773
98,Montenegro,0.646582,,Montenegro,42.708678,19.37439,0,0,0,0,...,149729,150313,150793,151272,151730,152087,152411,152931,153334,153743


In [14]:
#Saving the results into a csv file
df_join.toPandas().to_csv("covid_trendline_coefficients.csv")

  df[column_name] = series


# Claculating Continent for Each State

1. The Authors have created a function which takes the values of latitude and longitude and returns the name continent using geopy and pycountry_convert library.
2. Using this function in udf to calculate the continent against each state in the pyspark dtaframe and appending the values in column 'Continent'.


In [3]:
#Function to calculate the Continent which takes latitude and longitude as input
def coordinates_to_continent(Latitude,Longitude):
    """
Description: This function takes a pyspark dataframe object and calculates the linear regression coefficient
on the target column.

Args:
Latitude (string): Latitude value of state or country
Longitude (string): Longitude value of state or country

Returns:
String: Name of the Continent['Asia','Europe','Africa','Americe','Oceania','Atarctica'].
"""
    try:
        geolocator = Nominatim(user_agent="abc")  
        location = geolocator.reverse(Latitude+","+Longitude)
        country_code = location.raw.get('address').get('country_code')
        country_continent_code = pc.country_alpha2_to_continent_code(country_code.upper())
        print(country_continent_code)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
        if country_continent_name=='North America' or country_continent_name=='South America':
            return 'America'
        return country_continent_name
    except:
        return "Antarctica"

In [16]:
coordinates_to_continent("33.93911","67.709953")

AS


'Asia'

In [17]:
#creating udf to calculate the continent for each state
continent = udf(coordinates_to_continent, StringType())
df_join = df_join.withColumn('Continent', continent(df_join.Lat,df_join.Long))
df_join.select(['States_New','Continent']).show()

+--------------+---------+
|    States_New|Continent|
+--------------+---------+
|            US|  America|
|         India|     Asia|
|United Kingdom|   Europe|
|        Brazil|  America|
|        Turkey|     Asia|
|        Russia|   Europe|
|          Iran|     Asia|
|     Indonesia|     Asia|
|       Germany|   Europe|
|      Malaysia|     Asia|
|      Thailand|     Asia|
|        France|   Europe|
|   Philippines|     Asia|
|     Argentina|  America|
|       Ukraine|   Europe|
|      Colombia|  America|
|       Vietnam|     Asia|
|        Mexico|  America|
|         Japan|     Asia|
|         Spain|   Europe|
+--------------+---------+
only showing top 20 rows



# Calculation of weekly results

The authors have calculated mean, stansrad deviation, minimum and maximum cases based on daily increase per week for each continent in the following steps.

1. Calculation of daily increase in cases with the help of pandas.
2. Creating a dictionry which contains the dates for each week.
3. Calculating the mean, standard deviation, minimum and maximum value for each week and appending them in new column using pandas.
4. Grouping the dataset continent wise, and aggregating the dataset weekly to calculate weekly mean, standard_deviation, minimum and maximum values each week.
5. Saving the results to csv file.

In [18]:
#Calculating daily increase in the number of cases and appending them in the same column
df_join_pd = df_join.toPandas()
df_join_pd0 = df_join_pd.iloc[:,-1]
df_join_pd1 = df_join_pd.iloc[:,:7]
df_join_pd2 = df_join_pd.iloc[:,6:-1].diff(axis=1)
df_join_pd2=df_join_pd2.drop(columns=['01/22/2020'])
df_join_concat = pd.concat([df_join_pd0,df_join_pd1,df_join_pd2],axis =1, sort=False)
for i in df_join_concat.columns[7:]:
    df_join_concat=df_join_concat.astype({i: 'int64'})
display(df_join_concat)

  """


Unnamed: 0,Continent,States_New,Coefficient,State,Country,Lat,Long,01/22/2020,01/23/2020,01/24/2020,...,11/09/2021,11/10/2021,11/11/2021,11/12/2021,11/13/2021,11/14/2021,11/15/2021,11/16/2021,11/17/2021,11/18/2021
0,America,US,123.238569,,US,40,-100,1,0,1,...,80039,94696,55314,146251,40833,30161,138787,87586,111404,109578
1,Asia,India,109.639824,,India,20.593684,78.96288,0,0,0,...,11466,13091,12516,11850,11271,10229,8865,10197,11919,11106
2,Europe,United Kingdom,52.895127,,United Kingdom,55.3781,-3.436,0,0,0,...,32785,39325,42401,38900,37669,36128,39270,36821,37868,46858
3,America,Brazil,48.584339,,Brazil,-14.235,-51.9253,0,0,0,...,10948,12273,15300,14598,14642,4129,2799,4918,11977,12301
4,Asia,Turkey,44.576636,,Turkey,38.9637,35.2433,0,0,0,...,28662,27259,24898,23637,22583,21624,23852,25101,23867,22234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Asia,Laos,0.671415,,Laos,19.85627,102.495496,0,0,0,...,1050,1139,946,1198,1032,985,826,1306,1073,1401
96,Europe,North Macedonia,0.666628,,North Macedonia,41.6086,21.7453,0,0,0,...,677,656,432,469,525,285,172,522,476,491
97,America,Dominican Republic,0.658873,,Dominican Republic,18.7357,-70.1627,0,0,0,...,968,1046,1119,1160,1225,1248,1160,1002,862,893
98,Europe,Montenegro,0.646582,,Montenegro,42.708678,19.37439,0,0,0,...,732,584,480,479,458,357,324,520,403,409


In [21]:
#creating dictionary which contains dates as column names for each week
#Note: starting day of the week has been calulated by considering the first date of dataset as first day of week.
date_list = [_ for _ in df_join_concat.columns if _ not in ['Continent','Country','Coefficient','State','Lat','Long','States_New']]
dates_by_week ={}
for index,dates in enumerate(date_list):
    try:
        if index%7==0:
            week_date = "week_"+str(index//7+1)
            if week_date in dates_by_week.keys():
                dates_by_week[week_date].append(dates)
            else:
                dates_by_week[week_date] = [dates]
        else:
            dates_by_week[week_date].append(dates)
            
    except:
        pass

for k,v in list(dates_by_week.items()):
    if len(v)<7:
        dates_by_week.pop(k,None)
print(dates_by_week)


{'week_1': ['01/22/2020', '01/23/2020', '01/24/2020', '01/25/2020', '01/26/2020', '01/27/2020', '01/28/2020'], 'week_2': ['01/29/2020', '01/30/2020', '01/31/2020', '02/01/2020', '02/02/2020', '02/03/2020', '02/04/2020'], 'week_3': ['02/05/2020', '02/06/2020', '02/07/2020', '02/08/2020', '02/09/2020', '02/10/2020', '02/11/2020'], 'week_4': ['02/12/2020', '02/13/2020', '02/14/2020', '02/15/2020', '02/16/2020', '02/17/2020', '02/18/2020'], 'week_5': ['02/19/2020', '02/20/2020', '02/21/2020', '02/22/2020', '02/23/2020', '02/24/2020', '02/25/2020'], 'week_6': ['02/26/2020', '02/27/2020', '02/28/2020', '02/29/2020', '03/01/2020', '03/02/2020', '03/03/2020'], 'week_7': ['03/04/2020', '03/05/2020', '03/06/2020', '03/07/2020', '03/08/2020', '03/09/2020', '03/10/2020'], 'week_8': ['03/11/2020', '03/12/2020', '03/13/2020', '03/14/2020', '03/15/2020', '03/16/2020', '03/17/2020'], 'week_9': ['03/18/2020', '03/19/2020', '03/20/2020', '03/21/2020', '03/22/2020', '03/23/2020', '03/24/2020'], 'week_10'

In [22]:
#calculating the mean, standard deviation, minimum and maximum value for each week and appending them in new column
for k,v in dates_by_week.items():
    try:
        df_join_concat[k+"_mean"]= df_join_concat[[v[0],v[1],v[2],v[3],v[4],v[5],v[6]]].mean(axis=1)
        df_join_concat[k+"_std"]= df_join_concat[[v[0],v[1],v[2],v[3],v[4],v[5],v[6]]].std(axis=1,ddof=0)
        df_join_concat[k+"_min"]= df_join_concat[[v[0],v[1],v[2],v[3],v[4],v[5],v[6]]].min(axis=1)
        df_join_concat[k+"_max"]= df_join_concat[[v[0],v[1],v[2],v[3],v[4],v[5],v[6]]].max(axis=1)
    except:
        pass
display(df_join_concat)

Unnamed: 0,Continent,States_New,Coefficient,State,Country,Lat,Long,01/22/2020,01/23/2020,01/24/2020,...,week_93_min,week_93_max,week_94_mean,week_94_std,week_94_min,week_94_max,week_95_mean,week_95_std,week_95_min,week_95_max
0,America,US,123.238569,,US,40,-100,1,0,1,...,30979,115597,74701.000000,27659.157275,32852,112322,84804.000000,42400.228564,30161,146251
1,Asia,India,109.639824,,India,20.593684,78.96288,0,0,0,...,10423,16156,11491.285714,933.347847,10126,12885,11145.571429,1368.816784,8865,13091
2,Europe,United Kingdom,52.895127,,United Kingdom,55.3781,-3.436,0,0,0,...,33546,43922,33688.428571,3583.558880,29843,40803,38644.857143,1913.875159,36128,42401
3,America,Brazil,48.584339,,Brazil,-14.235,-51.9253,0,0,0,...,3838,17184,10843.000000,3327.724491,5638,14661,9808.428571,5180.170512,2799,15300
4,Asia,Turkey,44.576636,,Turkey,38.9637,35.2433,0,0,0,...,23096,29796,28386.142857,888.967046,27304,29764,24136.285714,1703.635081,21624,27259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Asia,Laos,0.671415,,Laos,19.85627,102.495496,0,0,0,...,447,873,1009.000000,113.092882,786,1170,1061.714286,151.158114,826,1306
96,Europe,North Macedonia,0.666628,,North Macedonia,41.6086,21.7453,0,0,0,...,153,593,516.000000,159.709558,156,677,437.285714,149.908271,172,656
97,America,Dominican Republic,0.658873,,Dominican Republic,18.7357,-70.1627,0,0,0,...,693,1102,963.000000,67.150152,891,1104,1137.142857,82.767193,1002,1248
98,Europe,Montenegro,0.646582,,Montenegro,42.708678,19.37439,0,0,0,...,378,622,630.714286,108.615593,418,738,457.428571,83.494837,324,584


In [23]:
#converting the dataframe back to pysark dataframe
df_pd = spark.createDataFrame(df_join_concat)
df_pd.columns

['Continent',
 'States_New',
 'Coefficient',
 'State',
 'Country',
 'Lat',
 'Long',
 '01/22/2020',
 '01/23/2020',
 '01/24/2020',
 '01/25/2020',
 '01/26/2020',
 '01/27/2020',
 '01/28/2020',
 '01/29/2020',
 '01/30/2020',
 '01/31/2020',
 '02/01/2020',
 '02/02/2020',
 '02/03/2020',
 '02/04/2020',
 '02/05/2020',
 '02/06/2020',
 '02/07/2020',
 '02/08/2020',
 '02/09/2020',
 '02/10/2020',
 '02/11/2020',
 '02/12/2020',
 '02/13/2020',
 '02/14/2020',
 '02/15/2020',
 '02/16/2020',
 '02/17/2020',
 '02/18/2020',
 '02/19/2020',
 '02/20/2020',
 '02/21/2020',
 '02/22/2020',
 '02/23/2020',
 '02/24/2020',
 '02/25/2020',
 '02/26/2020',
 '02/27/2020',
 '02/28/2020',
 '02/29/2020',
 '03/01/2020',
 '03/02/2020',
 '03/03/2020',
 '03/04/2020',
 '03/05/2020',
 '03/06/2020',
 '03/07/2020',
 '03/08/2020',
 '03/09/2020',
 '03/10/2020',
 '03/11/2020',
 '03/12/2020',
 '03/13/2020',
 '03/14/2020',
 '03/15/2020',
 '03/16/2020',
 '03/17/2020',
 '03/18/2020',
 '03/19/2020',
 '03/20/2020',
 '03/21/2020',
 '03/22/2020',
 

In [24]:
# creating dictionary which will be used for aggregation while apply groupby for the dataframe

mean_col_dict = {col+"_mean":'mean' for col in dates_by_week.keys()}
print(mean_col_dict)

std_col_dict = {col+"_std":'stddev' for col in dates_by_week.keys()}
print(std_col_dict)

min_col_dict = {col+"_min":'min' for col in dates_by_week.keys()}
print(min_col_dict)

max_col_dict = {col+"_max":'max' for col in dates_by_week.keys()}
print(max_col_dict)

{'week_1_mean': 'mean', 'week_2_mean': 'mean', 'week_3_mean': 'mean', 'week_4_mean': 'mean', 'week_5_mean': 'mean', 'week_6_mean': 'mean', 'week_7_mean': 'mean', 'week_8_mean': 'mean', 'week_9_mean': 'mean', 'week_10_mean': 'mean', 'week_11_mean': 'mean', 'week_12_mean': 'mean', 'week_13_mean': 'mean', 'week_14_mean': 'mean', 'week_15_mean': 'mean', 'week_16_mean': 'mean', 'week_17_mean': 'mean', 'week_18_mean': 'mean', 'week_19_mean': 'mean', 'week_20_mean': 'mean', 'week_21_mean': 'mean', 'week_22_mean': 'mean', 'week_23_mean': 'mean', 'week_24_mean': 'mean', 'week_25_mean': 'mean', 'week_26_mean': 'mean', 'week_27_mean': 'mean', 'week_28_mean': 'mean', 'week_29_mean': 'mean', 'week_30_mean': 'mean', 'week_31_mean': 'mean', 'week_32_mean': 'mean', 'week_33_mean': 'mean', 'week_34_mean': 'mean', 'week_35_mean': 'mean', 'week_36_mean': 'mean', 'week_37_mean': 'mean', 'week_38_mean': 'mean', 'week_39_mean': 'mean', 'week_40_mean': 'mean', 'week_41_mean': 'mean', 'week_42_mean': 'mean', 

In [25]:
#Grouping the dataset continent wise, and aggregating the dataset weekly to calculate weekly mean, standard_deviation, minimum
# and maximum values each week and saving the results to csv file.
df_pd_mean = df_pd.groupby('Continent').agg(mean_col_dict).toPandas()
df_pd_mean.to_csv("continent_mean.csv")

df_pd_std = df_pd.groupby('Continent').agg(std_col_dict).toPandas()
df_pd_std.to_csv("continent_stddev.csv")

df_pd_min = df_pd.groupby('Continent').agg(min_col_dict).toPandas()
df_pd_min.to_csv("continent_min.csv")

df_pd_max = df_pd.groupby('Continent').agg(max_col_dict).toPandas()
df_pd_max.to_csv("continent_max.csv")
print("--- %s seconds ---" % (time.time() - start_time))

--- 5577.013657331467 seconds ---
