# COVID-19 Database

## Source: Government of Mexico

### Columns:

__SEXO__: 1 - Mujer, 2 - Hombre, 99 - N/A

__ENTIDAD_RES__: Entity code; decoded through the catalogue in `entidades.csv` (joined on `CLAVE_ENTIDAD`)

__FECHA_SINTOMAS__: DATE

__FECHA_DEF__: DATE (`9999-99-99` marks rows that are not treated as deceased)

__EDAD__: Numerical

__DIABETES__: 1 - SI, 2 - NO, ELSE: N/A

__HIPERTENSION__: 1 - SI, 2 - NO, ELSE: N/A

__CARDIOVASCULAR__: 1 - SI, 2 - NO, ELSE: N/A

__OBESIDAD__: 1 - SI, 2 - NO, ELSE: N/A

__TABAQUISMO__: 1 - SI, 2 - NO, ELSE: N/A

__RESULTADO__: 1 - POSITIVO COVID, 2 - NO POSITIVO COVID, ELSE: PENDIENTE

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
# Apply matplotlib's 'ggplot' style sheet to the figures drawn below.
plt.style.use('ggplot')


In [None]:
class DataShell:
    """Wrap a pandas DataFrame together with feature/target split helpers.

    Attributes:
        data: The working DataFrame; methods read from and replace this frame.
        main: A copy of ``data`` taken at construction time.
        description: Text of ``DataFrame.info()`` for the frame at
            construction time.
        X, y: Feature frame and single-column target frame, filled by
            ``set_output`` (empty lists until then).
        X_train, X_test, y_train, y_test: Filled by ``generate_split``
            (empty lists until then).
    """

    def __init__(self, path=None, data_frame=None, cols=None):
        """Build the shell from a CSV file, an existing frame, or nothing.

        Args:
            path: CSV file to read when no ``data_frame`` is given.
            data_frame: Existing DataFrame to wrap. Takes precedence over
                ``path``; it is stored as-is, not copied.
            cols: Optional column subset forwarded to ``read_csv`` as
                ``usecols``.
        """
        # Local import: only needed here to capture the info() report.
        import io

        if data_frame is not None:
            # An explicit frame wins, so the CSV is not parsed needlessly.
            self.data = data_frame
        elif path is not None:
            self.data = pd.read_csv(path, usecols=cols)
        else:
            self.data = pd.DataFrame()

        # DataFrame.info() prints and returns None; capture its text so that
        # ``description`` actually holds the summary.
        report = io.StringIO()
        self.data.info(buf=report)
        self.description = report.getvalue()
        self.main = self.data.copy()

        self.y = []
        self.X = []

        self.X_train = []
        self.X_test = []
        self.y_train = []
        self.y_test = []

    def get_data(self):
        """Return the current working DataFrame."""
        return self.data

    def set_output(self, column_name=None):
        """Split the working frame into target ``y`` and features ``X``.

        Args:
            column_name: Label of the target column.

        Raises:
            KeyError: If ``column_name`` is not a column of the working frame.
        """
        if column_name not in self.data.columns:
            raise KeyError(f"target column {column_name!r} not found in data")

        self.y = self.data[[column_name]]
        # drop() defaults to the row axis; the target must be dropped as a column.
        self.X = self.data.drop(columns=[column_name])

    def generate_split(self, *arrays, **options):
        """Run ``train_test_split`` and store the four resulting pieces.

        Args:
            *arrays: The two arrays to split (features, target). When omitted,
                the stored ``X`` and ``y`` are used.
            **options: Keyword options forwarded to ``train_test_split``.
        """
        if not arrays:
            arrays = (self.X, self.y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            *arrays, **options)

    def set_main_df(self, dataframe):
        """Replace the working frame (``data``) with a copy of ``dataframe``.

        Note that despite the name, the ``main`` snapshot is left untouched.
        """
        self.data = dataframe.copy()

    def subset_data(self, column=None, condition=None, equal=True):
        """Return the rows whose ``column`` equals (or differs from) ``condition``.

        Args:
            column: Column label to compare.
            condition: Value compared against each entry of ``column``.
            equal: Truthy keeps matching rows; falsy keeps non-matching rows.
        """
        if equal:
            return self.data[self.data[column] == condition]
        return self.data[self.data[column] != condition]
    
    
        
    



    
    
            
        
        

In [None]:
class Covid19DataShell(DataShell):
    """DataShell with helpers for the COVID-19 case file used in this notebook.

    Attributes (all ``None`` until the corresponding method has run):
        positive: Rows selected by ``positive_cases``.
        negative: Reserved; no method of this class fills it.
        deceased: Rows selected by ``set_deceased_cases``.
        countries: Catalogue frame loaded by ``set_country_info``.
        active, passed: Curves built by ``SIR_curves``.
    """

    def __init__(self, path=None, data_frame=None, cols=None):
        """Initialise the base shell and the derived-frame placeholders."""
        super().__init__(path=path, data_frame=data_frame, cols=cols)
        self.positive = None
        self.negative = None
        self.deceased = None
        self.countries = None
        self.active = None
        self.passed = None

    @staticmethod
    def _as_list(labels):
        """Return ``labels`` as a list, wrapping a single label."""
        if pd.api.types.is_list_like(labels):
            return list(labels)
        return [labels]

    def positive_cases(self, column=None, condition=None, inplace=True):
        """Select the positive cases and optionally make them the working frame.

        Args:
            column: Column to filter on. When ``None`` the filter is
                ``RESULTADO == 1`` and ``condition`` is ignored.
            condition: Value ``column`` must equal.
            inplace: When truthy, the selection also replaces ``self.data``.

        Returns:
            The selected rows (also stored in ``self.positive``).
        """
        if column is None:
            column, condition = 'RESULTADO', 1
        self.positive = self.subset_data(column=column, condition=condition)

        if inplace:
            self.data = self.positive

        return self.positive

    def set_country_info(self, path=None):
        """Read the catalogue CSV at ``path``, store it in ``countries`` and return it."""
        self.countries = pd.read_csv(path)
        return self.countries

    def join_main_dataset(self, data_set, l_column, r_column):
        """Left-join ``data_set`` onto the working frame.

        Args:
            data_set: Frame to join.
            l_column: Key column(s) of the working frame.
            r_column: Key column(s) of ``data_set``.

        Returns:
            The merged frame without the right-hand key column(s). The working
            frame itself is not modified.
        """
        merged = pd.merge(self.data, data_set, how='left',
                          left_on=l_column, right_on=r_column)
        # When both sides share a key name, merge keeps a single column;
        # dropping it would lose the join key, so only drop the extras.
        left_keys = self._as_list(l_column)
        redundant = [key for key in self._as_list(r_column)
                     if key not in left_keys]
        return merged.drop(columns=redundant)

    @staticmethod
    def set_as_date(data_frame, in_col=None, out_col=None):
        """Return a copy of ``data_frame`` with ``in_col`` parsed as datetimes.

        Args:
            data_frame: Source frame; it is not modified.
            in_col: Column holding the values to parse.
            out_col: Column that receives the parsed values. Defaults to
                ``in_col``, which is then overwritten in its original position.
        """
        if out_col is None:
            out_col = in_col

        converted = data_frame.copy()
        # Write to the requested output column (previously a different
        # ``out_col`` was ignored and the result was always named 'DATE').
        converted[out_col] = pd.to_datetime(data_frame[in_col])
        return converted

    def set_deceased_cases(self, column='FECHA_DEF', condition='9999-99-99'):
        """Select rows whose ``column`` differs from ``condition`` and parse the dates.

        Returns:
            The selection with ``column`` converted to datetimes (also stored
            in ``self.deceased``).
        """
        selection = self.subset_data(column=column, condition=condition,
                                     equal=False)
        self.deceased = self.set_as_date(selection, in_col=column)
        return self.deceased

    def get_deceased_cases(self, column='FECHA_DEF', condition='9999-99-99'):
        """Return the deceased selection, computing it on first use.

        ``column`` and ``condition`` are only used when the selection has not
        been computed yet; they are forwarded to ``set_deceased_cases``.
        """
        if self.deceased is None:
            self.set_deceased_cases(column=column, condition=condition)

        return self.deceased

    def time_curve(self, indeces=('FECHA_SINTOMAS', 'ENTIDAD_RES'),
                   value='ACTIVE', columns=None, data='POSITIVE'):
        """Build cumulative case counts over time.

        Rows are counted per combination of the grouping keys, laid out with
        the first key as the index and the remaining keys as columns, and
        then accumulated down the index.

        Args:
            indeces: Grouping keys; the first becomes the index of the result.
                NOTE(review): the default second key used to be 'ESTADO_RES',
                which is not among the columns this notebook loads; it now
                matches the 'ENTIDAD_RES' grouping used in the manual curve
                cell - confirm.
            value: Label of the top column level that holds the counts.
            columns: Optional extra grouping key(s) appended to ``indeces``.
            data: 'POSITIVE' or 'DECEASED', choosing the source selection.
                A missing selection is computed on demand.

        Returns:
            DataFrame of cumulative counts.

        Raises:
            ValueError: If ``data`` is neither 'POSITIVE' nor 'DECEASED'.
            KeyError: If a grouping key is not a column of the source frame.
        """
        if data == 'POSITIVE':
            if self.positive is None:
                # Do not touch the working frame as a side effect.
                self.positive_cases(inplace=False)
            cases = self.positive
        elif data == 'DECEASED':
            cases = self.get_deceased_cases()
        else:
            raise ValueError(
                f"data must be 'POSITIVE' or 'DECEASED', got {data!r}")

        keys = self._as_list(indeces)
        if columns is not None:
            keys.extend(key for key in self._as_list(columns)
                        if key not in keys)

        missing = [key for key in keys if key not in cases.columns]
        if missing:
            raise KeyError(f"columns not found in {data} cases: {missing}")

        counts = cases.groupby(keys).size().to_frame(name=value)
        if len(keys) > 1:
            # Zero-fill absent combinations *before* accumulating so the
            # curves carry their running total across dates without new rows.
            counts = counts.unstack(level=list(range(1, len(keys))),
                                    fill_value=0)
        return counts.cumsum()

    def SIR_curves(self, gender=None, comorbidity=None):
        """Build the positive and deceased time curves.

        Args:
            gender: When not ``None``, the curves are additionally split by
                the 'SEXO' column.
            comorbidity: Optional column label (or list of labels) to split
                the curves by as well.

        Returns:
            Tuple ``(active, passed)``; both are also stored on the instance.
        """
        extra = []
        if gender is not None:
            extra.append('SEXO')
        if comorbidity is not None:
            extra.extend(self._as_list(comorbidity))
        columns = extra or None

        self.active = self.time_curve(columns=columns, data='POSITIVE')
        # Label the deceased counts as such instead of reusing 'ACTIVE'.
        self.passed = self.time_curve(value='DECEASED', columns=columns,
                                      data='DECEASED')

        return self.active, self.passed

        

In [None]:
# Columns of the raw case file that the rest of the notebook works with.
cols = ['SEXO', 'ENTIDAD_RES', 'FECHA_SINTOMAS', 'FECHA_DEF', 'EDAD',
        'DIABETES', 'HIPERTENSION', 'CARDIOVASCULAR', 'OBESIDAD',
        'TABAQUISMO', 'RESULTADO']
# Parse the CSV once and hand the frame to the shell; passing the path as
# well would make the constructor read the same file a second time.
data = pd.read_csv('200811COVID19MEXICO.csv', usecols=cols)
covid_data = Covid19DataShell(data_frame=data)
# Keep the RESULTADO == 1 rows; with the default inplace=True they also
# become the shell's working frame.
covid_data.positive_cases()


In [None]:
# Load the catalogue file; the shell keeps the same frame as covid_data.countries.
countries_df = covid_data.set_country_info(path='entidades.csv')
countries_df.head()

In [None]:
# Left-join the catalogue onto the working frame, matching ENTIDAD_RES (cases)
# with CLAVE_ENTIDAD (catalogue). The merged frame is only shown as the cell
# output; it is not stored back on covid_data.
covid_data.join_main_dataset(
    data_set=countries_df,
    l_column='ENTIDAD_RES',
    r_column='CLAVE_ENTIDAD',
)

In [None]:
# Parse FECHA_SINTOMAS on the working frame and inspect the resulting dtypes.
date_df = Covid19DataShell.set_as_date(covid_data.get_data(),
                                       in_col='FECHA_SINTOMAS')
date_df.info()
# Select the deceased rows; the shell also stores them as covid_data.deceased.
deceased = covid_data.set_deceased_cases()

# Same conversion on the snapshot taken when the shell was constructed.
test = Covid19DataShell.set_as_date(covid_data.main, in_col='FECHA_SINTOMAS')
test.info()

In [None]:

# Print the column summary of the shell's current working frame.
covid_data.get_data().info()

In [None]:
# assign() returns a new frame, so `new` is expected to be a different object
# from the shell's working frame (the last print shows False in that case).
new = covid_data.get_data().assign(FECHA_SINTOMAS = pd.to_datetime(covid_data.get_data().FECHA_SINTOMAS))
print(id(covid_data.get_data()))
# NOTE(review): this is the id of a bound-method object, not of a frame -
# confirm what was meant to be compared here.
print(id(covid_data.subset_data))
print(new is covid_data.get_data())

# info() prints its report and returns None, so wrapping it in display()
# only added a stray "None" to the output.
covid_data.get_data().info()

In [None]:
# Print the column summary of the deceased selection (computed on first use).
covid_data.get_deceased_cases().info()

In [None]:
# Count deceased rows per (FECHA_SINTOMAS, ENTIDAD_RES) pair.
infected = pd.DataFrame(
    covid_data.get_deceased_cases().groupby(['FECHA_SINTOMAS', 'ENTIDAD_RES']).size(),
    columns=['DECEASED'])
display(infected.head())
infected = infected.reset_index()
# One column per ENTIDAD_RES value. Missing (date, entity) pairs are
# zero-filled *before* cumsum so every curve carries its running total;
# previously fillna(0) ran after cumsum and its result was discarded.
infected = infected.pivot(index='FECHA_SINTOMAS', columns='ENTIDAD_RES',
                          values='DECEASED').fillna(0).cumsum()
# The frame is already cumulative; applying cumsum() again to the preview
# accumulated it twice.
display(infected.head())

infected.plot(figsize=(16, 10))
# NOTE(review): looks like a leftover debug print - confirm whether an axis
# label was intended instead.
print('FECHA_SINTOMAS')

In [None]:
# Compute the POSITIVE and DECEASED time curves; SIR_curves stores them on
# covid_data as .active and .passed and returns them as a pair.
covid_data.SIR_curves()

In [None]:
# NOTE(review): neither Covid19DataShell nor DataShell defines a
# get_data_curves method in this notebook, so this call raises AttributeError.
# Possibly a leftover from an earlier name of SIR_curves - confirm and update.
covid_data.get_data_curves()
