In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Data

To begin, there are two things we *have* to do, and three things we *should* do. 

We *have to*:

1. read/load data
2. ensure the data contains only numbers

If we don't do these things, we cannot train a model using any statistical software

We *should*:

3. identify target column
4. normalize / whiten the data
5. reduce the dimensionality

We can technically train a model without 3 & 4, but it wouldn't likely be useful.

Let's start with a generic function that calls some specialized functions. When the generic function is done, we can move on from data.

In [2]:
def prep_data(dataset, source='ddk_covid', do_the_minimum=True, target_column='case_status'):
    """
    Generic function calling all subroutines needed to load and prepare data
    
    Args:
        source (str): string corresponding to data source ID or path to source
    """
    
    raw_data = read_data(source)
    numeric = numberify(raw_data)
    
    if do_the_minimum:
        return numeric
    else:
        y, X = split_xy(data, target_column)
        X = normalize(X)
        # X = reduce_data(X, method)
        return X, y
    

In [3]:
def read_data(data_source):
    """
    Reads data from specified source
    """
    
    print("Reading {}...".format(data_source))
    
    try:
        df = {
            "ddk_covid": pd.read_csv(
                "https://bitbucket.org/DrDeadKnee/canada_covid_mldata/raw/886c3e736190327a3f6909d091a133c2c9756d1b/data/prepared/augmented_cases.csv"
            )
        }[data_source]
    except KeyError:
        ext = data_source.split(".")[-1]
        df = {
            "csv": pd.read_csv,
            "parquet": pd.read_parquet,
            "excel": pd.read_excel
        }[ext](data_source)
        
    print("Data read with {} rows and {} columns".format(df.shape[0], df.shape[1]))
    return df
    
df = read_data("ddk_covid")
df

Reading ddk_covid...
Data read with 408362 rows and 27 columns


Unnamed: 0,case_status,age_group,gender,longitude,latitude,exposure,days_lapsed,province,hr_uid,synth_smoking,...,p_rural,p_immigrant,p_aboriginal,p_lowinc_haskid,p_food_insecure,p_diabetes,p_copd,p_high_bp,p_mood_disord,synth_food
0,0,64,0,-79.813571,43.761613,Outbreak,135,0,3553.0,1,...,1.8,51.5,0.7,18.9,7.9,9.7,3.2,18.5,7.5,0
1,0,44,1,-79.813571,43.761613,Close Contact,150,0,3553.0,1,...,1.8,51.5,0.7,18.9,7.9,9.7,3.2,18.5,7.5,0
2,0,54,0,-79.813571,43.761613,Close Contact,131,0,3553.0,1,...,1.8,51.5,0.7,18.9,7.9,9.7,3.2,18.5,7.5,0
3,0,54,1,-79.813571,43.761613,Outbreak,116,0,3553.0,1,...,1.8,51.5,0.7,18.9,7.9,9.7,3.2,18.5,7.5,0
4,0,44,0,-79.890508,43.512097,Close Contact,164,0,3536.0,1,...,4.4,29.6,1.0,9.2,3.5,4.5,2.7,17.8,7.2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408357,0,64,1,-111.190357,50.372746,Not Reported,317,1,4831.0,0,...,25.8,12.7,7.0,9.9,7.5,6.4,5.3,16.9,10.6,0
408358,0,64,1,-111.190357,50.372746,Not Reported,412,1,4831.0,0,...,25.8,12.7,7.0,9.9,7.5,6.4,5.3,16.9,10.6,0
408359,0,15,0,-115.190802,56.806199,Not Reported,283,1,4835.0,0,...,44.8,9.5,17.1,7.5,6.7,6.5,6.1,16.6,8.3,0
408360,0,34,0,-111.775044,52.617887,Not Reported,339,1,4833.0,0,...,39.8,8.7,7.7,8.3,7.5,6.1,4.0,15.8,7.1,0


In [4]:
df.dtypes

case_status          int64
age_group            int64
gender               int64
longitude          float64
latitude           float64
exposure            object
days_lapsed          int64
province             int64
hr_uid             float64
synth_smoking        int64
psmokes            float64
mean_doc_age       float64
perc_doc_cndn      float64
perc_doc_fmale     float64
spec_per_100k      float64
synth_doc_data       int64
docs_per_100k      float64
p_rural            float64
p_immigrant        float64
p_aboriginal       float64
p_lowinc_haskid    float64
p_food_insecure    float64
p_diabetes         float64
p_copd             float64
p_high_bp          float64
p_mood_disord      float64
synth_food           int64
dtype: object

In [5]:
df.exposure.value_counts()

Not Reported      228627
Close Contact     123667
Outbreak           50831
Travel-Related      5237
Name: exposure, dtype: int64

In [6]:
df.isna().sum()

case_status        0
age_group          0
gender             0
longitude          0
latitude           0
exposure           0
days_lapsed        0
province           0
hr_uid             0
synth_smoking      0
psmokes            0
mean_doc_age       0
perc_doc_cndn      0
perc_doc_fmale     0
spec_per_100k      0
synth_doc_data     0
docs_per_100k      0
p_rural            0
p_immigrant        0
p_aboriginal       0
p_lowinc_haskid    0
p_food_insecure    0
p_diabetes         0
p_copd             0
p_high_bp          0
p_mood_disord      0
synth_food         0
dtype: int64

In [7]:
def numberify(data):
    """
    Removes any object-encoded columns and one-hot encodes them
    """
    
    data = data.copy()
    types = data.dtypes
    
    strings = []
    for i in range(len(types.index)):
        if types[i] == "object":
            strings.append(types.index[i])
            
    print("One-hot encoding object columns: {}".format(", ".join(strings)))
    for i in strings:
        ints = pd.get_dummies(data[i])
        del data[i]
        data = data.merge(ints, left_index=True, right_index=True)
        
    return data

df2 = numberify(df)
df2.dtypes

One-hot encoding object columns: exposure


case_status          int64
age_group            int64
gender               int64
longitude          float64
latitude           float64
days_lapsed          int64
province             int64
hr_uid             float64
synth_smoking        int64
psmokes            float64
mean_doc_age       float64
perc_doc_cndn      float64
perc_doc_fmale     float64
spec_per_100k      float64
synth_doc_data       int64
docs_per_100k      float64
p_rural            float64
p_immigrant        float64
p_aboriginal       float64
p_lowinc_haskid    float64
p_food_insecure    float64
p_diabetes         float64
p_copd             float64
p_high_bp          float64
p_mood_disord      float64
synth_food           int64
Close Contact        uint8
Not Reported         uint8
Outbreak             uint8
Travel-Related       uint8
dtype: object

In [17]:
def split_train_test(data, trainp=0.8):
    """
    Randomly splits data into two sets
    """    
    train = data.sample(frac=0.8)
    test = data[~data.index.isin(train.index)]
    return train.reset_index(), test.reset_index()

train, test = split_train_test(df2)

In [21]:
def split_xy(data, target):
    """
    Splits the data from the target column
    """
    data = data.copy()
    y = data[target].tolist()
    del data[target]
    
    return data, y

X, y = split_xy(train, "case_status")
testX, testy = split_xy(train, "case_status")

In [19]:
def normalize(X):
    """
    Centers features and renormalizes by standard deviation
    """
    
    X = X.copy()

    for i in X:
        X[i] = (X[i] - X[i].mean()) / X[i].std()
    
    return X

# Simple

Lets try some basic models first. No regularization, feature reduction, hyperparameter search. 

If you're not familiar with Linear Regression, go read a book:

Linear Regression:
$$ m(x) = \beta_0 + \beta \cdot X $$

Logistic regressing is very similar, but outputs values between 0 and 1
Logistic Regression:
$$ m(x) = \frac{1}{1 + e^{-(\beta_0 + \beta \cdot X)}} $$

It can be shown that in some contexts, these values correspond to probabilities.

In [25]:
from sklearn.linear_model import LogisticRegression

m_x = LogisticRegression(penalty="none")
m_x.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(penalty='none')

In [28]:
m_x.score(X, y)

0.9864764761700695

In [29]:
m_x.score(testX, testy)

0.9864764761700695

In [None]:
m_x.score