# Process COVID19 data, and make machine learning predictions over it

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from glob import glob


In [38]:
DEBUG=True

In [40]:
# get the last version of the daily reports
all_report_files=glob('csse_covid_19_data\csse_covid_19_daily_reports\*.csv')
all_filenames=[]
for file in all_report_files:
    filename= file.split('\\')[2]
    all_filenames.append(filename)
all_filenames.sort(reverse=True)
last_file_path='csse_covid_19_data/csse_covid_19_daily_reports/'+all_filenames[0]

In [43]:
data = pd.read_csv(last_file_path, parse_dates=['Last Update'])
data[:10]

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
0,Hubei,China,2020-03-13 11:09:03,67786,3062,51553,30.9756,112.2707
1,Guangdong,China,2020-03-13 11:09:03,1356,8,1296,23.3417,113.4244
2,Henan,China,2020-03-11 08:13:09,1273,22,1249,33.882,113.614
3,Zhejiang,China,2020-03-12 01:33:02,1215,1,1197,29.1832,120.0934
4,Hunan,China,2020-03-13 11:09:03,1018,4,1005,27.6104,111.7088
5,Anhui,China,2020-03-11 02:18:14,990,6,984,31.8257,117.2264
6,Jiangxi,China,2020-03-12 02:13:04,935,1,934,27.614,115.7221
7,Shandong,China,2020-03-13 11:09:03,760,7,739,36.3427,118.1498
8,Jiangsu,China,2020-03-13 11:09:03,631,0,630,32.9711,119.455
9,Chongqing,China,2020-03-13 23:13:12,576,6,566,30.0572,107.874


In [42]:
if DEBUG:
    rows, features=data.shape
    print(f'Quantity of registers: {rows}')
    print(f'Quantity of features: {features}')


Quantity of registers: 231
Quantity of features: 8


### Start preprocessing and cleaning the data

In [71]:
data_transforms=ColumnTransformer([
    ('fill_states_nulls'),
    ('fill_countries_nulls')
])

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
201    False
202    False
203    False
204    False
205    False
206    False
207    False
208    False
209    False
210    False
211    False
212    False
213    False
214    False
215    False
216    False
217    False
218    False
219    False
220    False
221    False
222    False
223    False
224    False
225    False
226    False
227    False
228    False
229    False
230    False
Name: Country/Region, Length: 231, dtype: bool

In [61]:
for i in data.index:
    if data.loc[i,'Last Update'].startswith('##'):
        print(f'Removing instances {i}')
        data.drop(index=i,inplace=True)
print('Last update column cleaned')

Removing instances 16
Removing instances 26
Removing instances 87
Last update column cleaned


In [62]:
data['Confirmed']=data['Confirmed'].fillna(0)
data['Deaths']=data['Deaths'].fillna(0)
data['Recovered']=data['Recovered'].fillna(0)

In [63]:
data.dtypes

Date               object
Province/State     object
Country            object
Last Update        object
Confirmed         float64
Deaths            float64
Recovered         float64
dtype: object

In [64]:
data['Last Update']=pd.to_datetime(data['Last Update'])
for i in data.index:
    data.at[i,'Day']=data.loc[i,'Last Update'].dayofyear
    

In [65]:
X = data[['Province/State', 'Country', 'Day']]
y = data['Confirmed']

In [68]:
column_transform= ColumnTransformer([
    ('Missing providences', SimpleImputer(strategy='constant', fill_value='Country'), ['Province/State']),
    ('Missing country', SimpleImputer(strategy='constant', fill_value='Place'), ['Country'])
], remainder='passthrough')

In [69]:
X = column_transform.fit_transform(X)

In [73]:
column_transform=ColumnTransformer([('Onehot', OneHotEncoder(dtype='int'), [0,1])], remainder='passthrough')

In [75]:
X = column_transform.fit_transform(X)

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=2, random_state=8)

In [79]:
X_train=np.nan_to_num(X_train)

In [81]:
regression = linear_model.LinearRegression()
regression.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [82]:
regression.score(X_test, y_test)

0.0

In [83]:
data_to_predict = ['Hubei','Mainland China',100]
data_to_predict = column_transform.transform([data_to_predict])
contaminados=regression.predict(data_to_predict)

In [84]:
print(f'habran estos contaminados: {contaminados}')

habran estos contaminados: [21950.68911718]
