In [1]:
import simulate_data
import remove_data
import matrix_completion
import trunc_nnm
import errors

from sklearn.preprocessing import normalize
import numpy as np
import numpy.linalg as la
import pandas as pd

In [2]:
# Generate the synthetic dataset from the module's default parameters and
# leave out the categorical "Group" feature for now.
sim_df = simulate_data.sim_data(**simulate_data.sim_params).drop(columns="Group")

# Normalize each feature (axis=0). The per-feature norms are returned as well
# so they could be used for re-scaling later.
sim_norm, norms = normalize(sim_df, axis=0, return_norm=True)
pd.DataFrame(sim_norm).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.154555,0.06426,0.070888,0.119523,0.0,0.0,0.101535,0.0,-0.02579,0.073764,0.06336
1,0.076481,0.115322,0.070888,0.119523,0.0,0.136083,0.101535,0.0,0.108082,0.058559,0.114662
2,0.046207,0.039233,0.070888,0.119523,0.0,0.136083,0.101535,0.0,0.0,0.009263,0.037587
3,0.070108,0.125841,0.070888,0.119523,0.0,0.136083,0.101535,0.0,0.161677,0.0,0.123666
4,0.043021,0.076243,0.141776,0.0,0.0,0.0,0.101535,0.404643,0.0,0.066675,0.079655


In [3]:
# Settings handed to trunc_nnm.truncated_NNM
parameters = dict(
    eps_outer=1e-6,
    eps_inner=1e-6,
    beta=1,
    max_iter_outer=1000,
    max_iter_inner=1000,
)

### How do error rates vary by rate of missingness?

In [11]:
# Sweep the percentage of entries removed at random (0, 5, ..., 95) and record
# the Frobenius-norm reconstruction error of SVT at each level.
results_lst = []

for missingness_pct in np.arange(0, 100, 5):
    sim_obs = remove_data.missing_at_random(sim_norm, perc_remove=missingness_pct, rand_seed=23)

    sim_recovered_svt = matrix_completion.svt(sim_obs, tau=5)
    svt_error = la.norm(sim_norm - sim_recovered_svt, ord='fro')

    # Truncated NNM is disabled in this sweep. Record NaN instead of 0 so the
    # NNM column reads as "not computed" rather than as a perfect reconstruction
    # when the results are tabulated and plotted.
    # To re-enable:
    #     rank = la.matrix_rank(sim_norm)
    #     sim_recovered_nnm = trunc_nnm.truncated_NNM(rank, parameters, sim_obs)
    #     nnm_error = la.norm(sim_norm - sim_recovered_nnm, ord='fro')
    nnm_error = np.nan

    results_lst.append([missingness_pct, svt_error, nnm_error])

KeyboardInterrupt: 

In [None]:
# Tabulate the sweep results, indexed by the missingness percentage.
results_df = pd.DataFrame(results_lst, columns=["missingness", "SVT", "NNM"])
results_df = results_df.set_index("missingness")

# Plot the SVT and NNM error columns against missingness (called with save=True).
errors.line_plot(
    results_df.index,
    results_df["SVT"],
    results_df["NNM"],
    save=True,
    title="Error Rate vs Missingness",
    xlabel="Percent of data missing",
    ylabel="Reconstruction error (Frobenius norm)",
)

Error rates increased sharply from no missingness to about 10% missingness, then increased at a lesser rate.

### How does missingness affect error rates by rank?

In [None]:
# Missingness levels (percent of entries removed) evaluated for every rank.
missingness_levels = np.arange(0, 100, 5)
missingness_rank_df = pd.DataFrame(missingness_levels.tolist(), columns=["missingness"])

for rank in range(1, 11):
    # Create the rank-r matrix once per rank so that every missingness level
    # for this rank is scored against the same ground truth. The matrix is
    # used as generated (no normalization).
    # NOTE(review): relies on missing_at_random returning a new array rather
    # than modifying sim_arr in place -- the error computation below already
    # depends on that.
    sim_arr = simulate_data.create_rank_r_matrix(rank, 100, 10)

    error_lst = []
    for missingness in missingness_levels:
        # remove data at random
        sim_obs = remove_data.missing_at_random(sim_arr, perc_remove=missingness, rand_seed=23)

        sim_recovered_svt = matrix_completion.svt(sim_obs, tau=5)
        svt_error = la.norm(sim_arr - sim_recovered_svt, ord='fro')

        error_lst.append(svt_error)

    # One column of errors per rank, aligned with the "missingness" column.
    missingness_rank_df[f"rank_{rank}"] = error_lst

In [None]:
# One error curve per rank, plotted against missingness. The trailing
# semicolon keeps the cell from echoing the plot object's text repr.
missingness_rank_df.set_index("missingness").plot(title="Reconstruction Error by Rank and Missingness");

##### Here we hold the missingness rate at 40% and change $\tau$ for SVT

In [None]:
# For each integer tau in 0..9, run SVT on a 40%-missing version of sim_norm
# (fixed seed) and record the Frobenius norm of the difference between the
# original and recovered matrices, both rounded to 4 decimals.
tau_lst = []

for tau in np.arange(10):
    sim_obs = remove_data.missing_at_random(sim_norm, perc_remove=40, rand_seed=23)
    sim_recovered_svt = matrix_completion.svt(sim_obs, tau=tau)

    residual = np.round(sim_norm, 4) - np.round(sim_recovered_svt, 4)
    svt_error = la.norm(residual, ord='fro')

    tau_lst.append([tau, svt_error])

In [None]:
# Tabulate error by tau and plot it.
tau_df = pd.DataFrame(tau_lst, columns=["tau", "error"])
tau_df = tau_df.set_index("tau")
tau_df.plot(title="Error rate depending on tau")

SVT seems to want $\tau$ to be somewhere midway between 0 and the rank of the original matrix. Should test on other data.

### How do error rates vary by rank of the matrix?

In [None]:
# NNM parameters (same values as the NNM parameters cell near the top of the notebook)
parameters = dict(
    eps_outer=1e-6,
    eps_inner=1e-6,
    beta=1,
    max_iter_outer=1000,
    max_iter_inner=1000,
)


Here we vary rank for a synthetic dataset with 10% missing values

In [None]:
# Compare SVT and truncated NNM on synthetic rank-r data with 10% of entries removed.
results_lst = []
n = 200
p = 10

# NOTE(review): only rank 2 is evaluated here; extend this list to sweep several ranks.
for rank in [2]:

    # Produce r-rank data. It is kept under its own name so that `sim_norm`
    # (the normalized simulated data) is not overwritten for other cells.
    rank_r_arr = simulate_data.create_rank_r_matrix(rank, n, p)
    # Set to only 10% missing
    sim_obs = remove_data.missing_at_random(rank_r_arr, perc_remove=10, rand_seed=23)

    sim_recovered_svt = matrix_completion.svt(sim_obs, tau=5)
    svt_error = la.norm(rank_r_arr - sim_recovered_svt, ord='fro')

    sim_recovered_nnm = trunc_nnm.truncated_NNM(rank, parameters, sim_obs)
    nnm_error = la.norm(rank_r_arr - sim_recovered_nnm, ord='fro')

    results_lst.append([rank, svt_error, nnm_error])

In [None]:
# Tabulate the per-rank errors, indexed by rank.
results_df = pd.DataFrame(results_lst, columns=["rank", "SVT", "NNM"])
results_df = results_df.set_index("rank")

# Plot the SVT and NNM error columns against rank (called with save=True).
errors.line_plot(
    results_df.index,
    results_df["SVT"],
    results_df["NNM"],
    save=True,
    title="Error Rate vs Matrix Rank",
    xlabel="Rank of the original matrix",
    ylabel="Reconstruction error (Frobenius norm)",
)

The reconstruction error increases fairly linearly with rank.

### How do error rates vary by the type of missingness?

In [None]:
# Re-derive the feature-normalized simulated data directly from `sim_df`, so
# this cell does not depend on whatever `sim_norm` holds when it is run.
# Then remove the last column.
original_normalized_arr = normalize(sim_df, axis=0)[:, :10]
original_normalized_df = pd.DataFrame(original_normalized_arr, columns=sim_df.columns[:10])

In [None]:
def conditionally_remove_data(df, perc_missing):
    """Apply conditional missingness to every feature of `df`, one feature at a time.

    Continuous features go through `remove_data.missing_conditional_continuous`
    and discrete features through `remove_data.missing_conditional_discrete`.
    Each call receives the result of the previous call, and each feature is
    processed exactly once.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns named in the lists below.
    perc_missing : number
        Forwarded unchanged to every removal call (as `percent_missing` for
        continuous features, as the third positional argument for discrete ones).

    Returns
    -------
    The object returned by the final removal call.
    """
    # (feature whose entries are removed, feature the removal is conditioned on)
    continuous_pairs = [
        ("Income", "Income"),                                          # conditional on itself
        ("Age", "Age"),                                                # conditional on itself
        ("NChild", "Continuous_EvenLikelihood_0"),
        ("Continuous_LowLikelihood_0", "Continuous_HighLikelihood_0"),
        ("Continuous_EvenLikelihood_0", "Continuous_EvenLikelihood_0"),  # conditional on itself
        ("Continuous_HighLikelihood_0", "Continuous_HighLikelihood_0"),  # conditional on itself
    ]

    # Each discrete feature appears once; a second pass over
    # 'Discrete_LowLikelihood_0' would run the removal on that column twice.
    discrete_features = [
        "InCensus",
        "Discrete_LowLikelihood_0",
        "Discrete_EvenLikelihood_0",
        "Discrete_HighLikelihood_0",
    ]

    missing_df = df
    for target, condition in continuous_pairs:
        missing_df = remove_data.missing_conditional_continuous(missing_df,
                                                                target,
                                                                condition,
                                                                percent_missing=perc_missing)

    for feature in discrete_features:
        missing_df = remove_data.missing_conditional_discrete(missing_df,
                                                              feature, perc_missing)

    return missing_df

In [None]:
# Try conditionally_remove_data once, with perc_missing=30
sim_conditional_missing = conditionally_remove_data(
    df=original_normalized_df,
    perc_missing=30,
)

In [None]:
# Inspect the 10 rows with the lowest Continuous_HighLikelihood_0 to confirm that
# Continuous_LowLikelihood_0 is missing for them
(
    sim_conditional_missing[['Continuous_HighLikelihood_0', 'Continuous_LowLikelihood_0']]
    .sort_values("Continuous_HighLikelihood_0")
    .head(10)
)

In [None]:
# Overall fraction of missing entries in the conditionally-missing data
conditional_frame = pd.DataFrame(sim_conditional_missing)
conditional_total_missingness = conditional_frame.isnull().sum().sum() / conditional_frame.size
print(conditional_total_missingness)

# Fraction of missing entries per feature
sim_conditional_missing.isnull().mean()

In [None]:
# Randomly-missing counterpart: remove the same overall share of entries
# (fraction converted to a percentage) as in the conditional data
sim_random_missing = remove_data.missing_at_random(
    original_normalized_arr,
    perc_remove=conditional_total_missingness*100,
    rand_seed=23,
)

In [None]:
# Overall fraction of missing entries in the randomly-missing data
random_frame = pd.DataFrame(sim_random_missing)
print(random_frame.isnull().sum().sum() / random_frame.size)

# Fraction of missing entries per feature
pd.DataFrame(sim_random_missing, columns=original_normalized_df.columns).isnull().mean()

In [None]:
# For each requested level, build conditionally-missing data, build
# randomly-missing data with the same overall missing fraction, and record the
# SVT reconstruction error for both (matrices rounded to 4 decimals).
# Truncated NNM is not run in this sweep.
results_lst = []

for missingness_pct in range(0, 100, 5):
    sim_conditional_missing = conditionally_remove_data(original_normalized_df, perc_missing=missingness_pct)

    # overall fraction of missing entries actually produced
    conditional_frame = pd.DataFrame(sim_conditional_missing)
    conditional_total_missingness = conditional_frame.isnull().sum().sum() / conditional_frame.size

    # randomly-missing data with a matching overall missingness
    sim_random_missing = remove_data.missing_at_random(
        original_normalized_arr,
        perc_remove=conditional_total_missingness*100,
        rand_seed=23,
    )

    # error on the randomly-missing data
    sim_recovered_random_svt = matrix_completion.svt(sim_random_missing, tau=5)
    random_residual = np.round(original_normalized_arr, 4) - np.round(sim_recovered_random_svt, 4)
    random_svt_error = la.norm(random_residual, ord='fro')

    # error on the conditionally-missing data
    sim_recovered_conditional_svt = matrix_completion.svt(sim_conditional_missing, tau=5)
    conditional_residual = np.round(original_normalized_arr, 4) - np.round(sim_recovered_conditional_svt, 4)
    conditional_svt_error = la.norm(conditional_residual, ord='fro')

    results_lst.append([conditional_total_missingness, random_svt_error, conditional_svt_error])

In [None]:
# Errors for both missingness types, indexed by the overall missing fraction
svt_errors = pd.DataFrame(results_lst, columns=['missingness','random','conditional'])
svt_errors = svt_errors.set_index('missingness')

In [None]:
# Display the table of SVT errors (random vs conditional), indexed by missingness
svt_errors

In [None]:
# The index holds missing fractions; multiply by 100 to plot percentages.
missing_percent = svt_errors.index*100

errors.line_plot(
    missing_percent,
    svt_errors["random"],
    svt_errors["conditional"],
    save=True,
    title="Error Rate vs Type of Missingness",
    xlabel="Percent of data missing",
    ylabel="Reconstruction error (Frobenius norm)",
)

The error rate for the randomly missing data is strictly greater than the error rate for conditionally missing data. This is consistent across many randomizations. Though this finding may be particular to the data and implementation of removing values, we have a theory that helps explain this phenomenon. When entries are missing based on features within the data, the missingness has structure. In other words, being missing may indicate something about the entry (high or low value) or about another feature that is related to it. Introducing this structure into the data may help the SVT algorithm slightly by indicating subtle information about the missing entries. In contrast, entries that are missing at random do not signal anything about the missing value.

With both randomly missing data and conditionally missing data, the error rates increase quickly going from no missing data to about 10% missing. Then, the error rate increases relatively linearly.

##### How were the missing data produced?

Both the randomly missing data and the conditionally missing data were constructed from the same synthetic dataset. The randomly missing dataset was constructed by selecting random elements to drop from the data. Therefore, in expectation, each feature should have the same amount of missingness. 

The conditional data was generated by one of two processes depending on whether the feature is continuous or binary. For continuous features, values were removed probabilistically based on values within the feature or values from another feature. For instance, Income values could be more likely missing if income is a low value. Alternatively, Income values could be removed based on whether age is a high or low value. For binary features, we randomly assigned different likelihoods of missingness to values of 0 and 1. Thus, values of 1 may be removed at a higher rate than values of 0.