In [1]:
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn import linear_model
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, PolynomialFeatures



In [60]:
file_location='2019ncov_data.csv'

data = pd.read_csv(file_location, index_col=0, parse_dates=['Last Update'])
number_of_rows, features = data.shape
print(f'The number of file rows are: {number_of_rows}')
print(f'The number of features are: {features}')

new_index={i for i in range(number_of_rows)}
data['new_index']=new_index
data.set_index('new_index', inplace= True)
data.iloc[:5]

The number of file rows are: 1719
The number of features are: 7


Unnamed: 0_level_0,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,01/22/2020 12:00:00,Anhui,China,01/22/2020 12:00:00,1.0,0.0,0.0
1,01/22/2020 12:00:00,Beijing,China,01/22/2020 12:00:00,14.0,0.0,0.0
2,01/22/2020 12:00:00,Chongqing,China,01/22/2020 12:00:00,6.0,0.0,0.0
3,01/22/2020 12:00:00,Fujian,China,01/22/2020 12:00:00,1.0,0.0,0.0
4,01/22/2020 12:00:00,Gansu,China,01/22/2020 12:00:00,0.0,0.0,0.0


### Data preprocessing and mining

In [61]:
for i in data.index:
    if data.loc[i,'Last Update'].startswith('##'):
        print(f'Removing instances {i}')
        data.drop(index=i,inplace=True)
print('Last update column cleaned')

Removing instances 16
Removing instances 26
Removing instances 87
Last update column cleaned


In [62]:
data['Confirmed']=data['Confirmed'].fillna(0)
data['Deaths']=data['Deaths'].fillna(0)
data['Recovered']=data['Recovered'].fillna(0)

In [63]:
data.dtypes

Date               object
Province/State     object
Country            object
Last Update        object
Confirmed         float64
Deaths            float64
Recovered         float64
dtype: object

In [64]:
data['Last Update']=pd.to_datetime(data['Last Update'])
for i in data.index:
    data.at[i,'Day']=data.loc[i,'Last Update'].dayofyear
    

In [65]:
X = data[['Province/State', 'Country', 'Day']]
y = data['Confirmed']

In [68]:
column_transform= ColumnTransformer([
    ('Missing providences', SimpleImputer(strategy='constant', fill_value='Country'), ['Province/State']),
    ('Missing country', SimpleImputer(strategy='constant', fill_value='Place'), ['Country'])
], remainder='passthrough')

In [69]:
X = column_transform.fit_transform(X)

In [73]:
column_transform=ColumnTransformer([('Onehot', OneHotEncoder(dtype='int'), [0,1])], remainder='passthrough')

In [75]:
X = column_transform.fit_transform(X)

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=2, random_state=8)

In [79]:
X_train=np.nan_to_num(X_train)

In [81]:
regression = linear_model.LinearRegression()
regression.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [82]:
regression.score(X_test, y_test)

0.0

In [83]:
data_to_predict = ['Hubei','Mainland China',100]
data_to_predict = column_transform.transform([data_to_predict])
contaminados=regression.predict(data_to_predict)

In [84]:
print(f'habran estos contaminados: {contaminados}')

habran estos contaminados: [21950.68911718]
