## Problem set 5

**Problem 0** (-2 points for every missing green OK sign. If you don't run the cell below, that's -14 points.)

Make sure you are in the DATA1030 environment.

In [1]:
from __future__ import print_function
from distutils.version import LooseVersion as Version
import sys

# ANSI escape sequences: a status tag on a green (OK) or red (FAIL) background
OK = "\x1b[42m[ OK ]\x1b[0m"
FAIL = '\x1b[41m[FAIL]\x1b[0m'

# importlib drives the dependency check further down in this cell; if it cannot
# be imported, report the interpreter version instead of failing outright.
try:
    import importlib
except ImportError:
    print(FAIL, "Python version 3.7 is required, but %s is installed." % sys.version)

def import_version(pkg, min_ver, fail_msg=""):
    """Import ``pkg`` and print whether its version matches ``min_ver``.

    Parameters
    ----------
    pkg : str
        Importable module name, e.g. ``'numpy'``.
    min_ver : str
        Version the environment must provide. Despite the parameter name the
        check is an exact match (``==``), not a lower bound.
    fail_msg : str, optional
        Extra text appended to the message printed when ``pkg`` cannot be
        imported.

    Returns
    -------
    module or None
        The imported module, or ``None`` when the import failed.

    Notes
    -----
    Only ``ImportError`` is handled; a module without a version attribute
    raises ``AttributeError`` to the caller.
    """
    mod = None
    try:
        mod = importlib.import_module(pkg)
        # PIL publishes its version as VERSION; everything else uses __version__
        ver = mod.VERSION if pkg in {'PIL'} else mod.__version__
        # Report the package by the `pkg` argument rather than by whatever a
        # caller's loop variable happens to be called.
        if Version(ver) == min_ver:
            print(OK, "%s version %s is installed."
                  % (pkg, min_ver))
        else:
            print(FAIL, "%s version %s is required, but %s installed."
                  % (pkg, min_ver, ver))
    except ImportError:
        print(FAIL, '%s not installed. %s' % (pkg, fail_msg))
    return mod


# first check the python version
# sys.version_info is a comparable tuple, so the free-form sys.version banner
# does not have to be parsed; the banner is only used for the printed message.
if sys.version_info >= (3, 7):
    print(OK, "Python version is %s" % sys.version)
else:
    print(FAIL, "Python version 3.7 is required,"
                " but %s is installed." % sys.version)


print()
# package name -> exact version the course environment is expected to provide
requirements = {'numpy': "1.18.5", 'matplotlib': "3.2.2",'sklearn': "0.23.1", 
                'pandas': "1.0.5",'xgboost': "1.1.1", 'shap': "0.35.0"}

# now the dependencies
for lib, required_version in requirements.items():
    import_version(lib, required_version)

[42m[ OK ][0m Python version is 3.7.6 | packaged by conda-forge | (default, Jun  1 2020, 18:33:30) 
[Clang 9.0.1 ]

[42m[ OK ][0m numpy version 1.18.5 is installed.
[42m[ OK ][0m matplotlib version 3.2.2 is installed.
[42m[ OK ][0m sklearn version 0.23.1 is installed.
[42m[ OK ][0m pandas version 1.0.5 is installed.
[42m[ OK ][0m xgboost version 1.1.1 is installed.
[42m[ OK ][0m shap version 0.35.0 is installed.


**Problem 1** (14 points)

You will implement a simple version of Little's test. The backbone of the function is provided below. Please work through and test each step as indicated in the comments and run the function on a couple of dummy datasets located in the `/data` folder.

**NOTE1:** Step 1 is at the end of the cell outside of the `simple_Little_test` function. All other steps are inside the function.

**NOTE2:** DO NOT USE THIS FUNCTION IN ANY SERIOUS PROJECT! This exercise is limited to the case when only one feature has missing values. If your dataset has missing values in multiple features, the function won't give accurate results. Please read [Little's paper](https://www.tandfonline.com/doi/abs/10.1080/01621459.1988.10478722) if you want to implement the general approach. 

In [2]:
# first I generate the datasets they will test and impose MCAR and MAR on one of the features
from sklearn.datasets import make_classification
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# All amputation draws below use numpy's global RNG, so the order of the
# np.random.choice calls must not change if the generated files are to stay the same.
np.random.seed(42)


def _save_dataset(features, labels, csv_path):
    """Write a feature matrix plus labels to ``csv_path`` and print the first rows.

    Columns are named 'feature 1' ... 'feature k' in matrix order, followed by
    a 'label' column. Returns the DataFrame that was written.
    """
    names = ['feature %d' % (k + 1) for k in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=names)
    frame['label'] = labels
    frame.to_csv(csv_path, index=False)
    print(frame.head())
    return frame


# MAR
n_samples = 300
X, y = make_classification(n_samples=n_samples,n_features=2, n_redundant=0, n_informative=2,
                           random_state=3, n_clusters_per_class=1,class_sep=0.5,weights=[0.7,0.3])

# rows with a larger feature 1 get a larger weight, i.e. are more likely to lose feature 2
f1_scaled = (MinMaxScaler().fit_transform(2**X[:, 0].reshape(-1, 1))[:,0])**2
indcs = np.random.choice(np.arange(n_samples),size=int(n_samples*0.4),replace=False,p=f1_scaled/np.sum(f1_scaled))
X_ampute = np.copy(X)
X_ampute[indcs,1] = np.nan 

df = _save_dataset(X_ampute, y, 'data/data1.csv')

# MCAR
n_samples = 200
X, y = make_classification(n_samples=n_samples,n_features=4, n_redundant=0, n_informative=2,
                           random_state=10, n_clusters_per_class=1,class_sep=0.5)

# a uniformly drawn third of the rows lose feature 3
indcs = np.random.choice(np.arange(n_samples),size=int(n_samples/3),replace=False)
X_ampute = np.copy(X)
X_ampute[indcs,2] = np.nan

df = _save_dataset(X_ampute, y, 'data/data2.csv')

# another MAR 
n_samples = 200
X, y = make_classification(n_samples=n_samples,n_features=6, n_redundant=0, n_informative=3,
                           random_state=3, n_clusters_per_class=1,class_sep=0.5,weights=[0.7,0.3])

# here the weight grows with |feature 1|; the selected rows lose feature 5
f1_scaled = (MinMaxScaler().fit_transform(2**np.abs(X[:, 0]).reshape(-1, 1))[:,0])**2
indcs = np.random.choice(np.arange(n_samples),size=int(n_samples*0.4),replace=False,p=f1_scaled/np.sum(f1_scaled))
X_ampute = np.copy(X)
X_ampute[indcs,4] = np.nan 

df = _save_dataset(X_ampute, y, 'data/data3.csv')

# last dataset with two features containing missing values
n_samples = 1000
X, y = make_classification(n_samples=n_samples,n_features=10, n_redundant=0, n_informative=6,
                           random_state=10, n_clusters_per_class=1,class_sep=0.5)

# two separate uniform draws: one for feature 2, one for feature 7
indcs = np.random.choice(np.arange(n_samples),size=int(n_samples/3),replace=False)
indcs2 = np.random.choice(np.arange(n_samples),size=int(n_samples/3),replace=False)
X_ampute = np.copy(X)
X_ampute[indcs,1] = np.nan
X_ampute[indcs2,6] = np.nan

df = _save_dataset(X_ampute, y, 'data/data4.csv')


   feature 1  feature 2  label
0   1.308443        NaN      1
1   0.579018   0.445624      0
2   0.659703        NaN      0
3   1.195591        NaN      0
4   0.530409        NaN      0
   feature 1  feature 2  feature 3  feature 4  label
0  -0.053636   0.631832        NaN   0.830836      0
1   0.707561  -1.265775   1.184195   0.268585      0
2  -0.402403  -2.313096        NaN   0.051042      0
3   1.348075  -1.366191  -0.172210   0.278011      0
4   1.611499  -0.082858        NaN   0.216504      1
   feature 1  feature 2  feature 3  feature 4  feature 5  feature 6  label
0  -1.475149   0.529030   0.557537   3.819145        NaN  -0.959875      0
1   0.072969  -0.397754  -0.980947   0.311793   0.646965   1.014624      0
2   0.809812   1.036716   1.329798  -0.705942        NaN  -0.170632      1
3  -0.067652   1.507020  -1.895661   0.977815   0.683140   0.760448      0
4  -0.561700   0.589722   1.529195   1.182259        NaN   1.290034      0
   feature 1  feature 2  feature 3  feature 4 

In [3]:
# feel free to import other packages if necessary
import numpy as np
import pandas as pd
# remove this line:
from sklearn.feature_selection import f_classif

def simple_Little_test(df, p_crit = 0.05):
    """Simplified Little's test for a DataFrame with missing values in one column.

    The missingness indicator of that column is treated as a target variable
    and tested against every other column with ``f_classif``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to test. It is not modified.
    p_crit : float, optional
        Critical p-value, 0.05 by default.

    Returns
    -------
    bool
        True if the missingness pattern correlates with any of the other
        columns (indicates missing at random). False if there is no
        correlation (it could be MCAR or MNAR), and also False when no column
        contains missing values at all.

    Raises
    ------
    ValueError
        If more than one column contains missing values.
    """
    # step 2: identify which column(s) have the missing values
    cols_with_nan = df.columns[df.isna().any()].tolist()
    if len(cols_with_nan) > 1:
        raise ValueError('Multiple columns contain missing values, this function should not be used.')
    if not cols_with_nan:
        # nothing is missing, so there is no missingness pattern to correlate
        return False
    nan_col = cols_with_nan[0]

    # step 3: True where the column is NaN, False otherwise; kept as a separate
    # Series so the caller's DataFrame does not gain a 'mask' column
    mask = df[nan_col].isna()

    # step 4: every column except the one with missing values
    cols = [col for col in df.columns if col != nan_col]

    # step 5: the mask is the target variable; collect one p-value per column
    _, p_val = f_classif(df[cols], mask)

    # step 6: MAR is present if any p-value is smaller than p_crit
    return bool(np.any(p_val < p_crit))

# step 1: 2 points
# read in the four datasets in the `/data` folder and apply the function on each of them
# print out what the function returns
# initially it will just return False four times

for i in range(1,5):
    df = pd.read_csv('data/data'+str(i)+'.csv')
    try:
        print(simple_Little_test(df))
    except ValueError as err:
        # data4 has two columns with missing values, which the function
        # rejects by design; show the message instead of aborting the cell
        print('ValueError:', err)


True
False
True


ValueError: Multiple columns contain missing values, this function should not be used.

**Problem 2** (16 points)

Consider the [hand postures dataset](https://archive.ics.uci.edu/ml/datasets/Motion+Capture+Hand+Postures). You saw in the previous problem sets that it contains a large number of missing values. Describe why you would or wouldn't use the techniques in the cells below to handle the missing values in this dataset. Feel free to use a mix of code and text answers to support your argument. The csv file is located in the data folder.

**Drop the columns with missing values:**


**Drop the rows with missing values:**


**Mean or median imputation:**


**Multivariate imputation:**

**Grading suggestion:** 4 points per technique

None of these techniques should be used with the hand postures dataset.

Drop cols: one would need to drop all but the X1,Y1,Z1,X2,Y2,Z2 columns.

Drop rows: one would need to drop 99.96% of the rows

Mean and median imputation: always a bad idea

Multivariate imputation: doesn't work here based on the description. "The 11 markers not part of the rigid pattern were unlabeled; their positions were not explicitly tracked. Consequently, there is no a priori correspondence between the markers of two given records." So marker *i* for one user might be a different marker for a different user.
    

In [4]:
import pandas as pd
# Read the postures file and discard the row labelled 0
# NOTE(review): presumably a placeholder first record; confirm against the dataset description
df = pd.read_csv('data/Postures.csv').drop([0])
# '?' entries become NaN so that pandas counts them as missing
df = df.replace('?', np.nan)
print('data dimensions:', df.shape)

# share of missing entries in each column, restricted to columns that have any
perc_missing_per_ftr = df.isnull().sum(axis=0) / df.shape[0]
ftrs_with_missing = perc_missing_per_ftr[perc_missing_per_ftr > 0]
print('fraction of missing values in features:')
print(ftrs_with_missing)
print('data types of the features with missing values:')
print(df[ftrs_with_missing.index].dtypes)

# share of rows that have at least one missing entry
rows_with_missing = df.isnull().sum(axis=1) != 0
frac_missing = rows_with_missing.sum() / df.shape[0]
print('fraction of points with missing values:', frac_missing)

data dimensions: (78095, 38)
fraction of missing values in features:
X3     0.008835
Y3     0.008835
Z3     0.008835
X4     0.039951
Y4     0.039951
Z4     0.039951
X5     0.166758
Y5     0.166758
Z5     0.166758
X6     0.330981
Y6     0.330981
Z6     0.330981
X7     0.501338
Y7     0.501338
Z7     0.501338
X8     0.608643
Y8     0.608643
Z8     0.608643
X9     0.693105
Y9     0.693105
Z9     0.693105
X10    0.811102
Y10    0.811102
Z10    0.811102
X11    0.999603
Y11    0.999603
Z11    0.999603
dtype: float64
data types of the features with missing values:
X3     object
Y3     object
Z3     object
X4     object
Y4     object
Z4     object
X5     object
Y5     object
Z5     object
X6     object
Y6     object
Z6     object
X7     object
Y7     object
Z7     object
X8     object
Y8     object
Z8     object
X9     object
Y9     object
Z9     object
X10    object
Y10    object
Z10    object
X11    object
Y11    object
Z11    object
dtype: object
fraction of points with missing values: 0.99