In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [2]:
# Import data: read the SAS transport file, discard the listed columns,
# and replace every missing value with 0 in one chained expression.
df = (
    pd.read_sas('./data/LLCP2017.XPT')
    .drop(columns=['IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'SEQNO', 'DLYOTHER'])
    .fillna(0)
)

## Chronic Health Issues Columns

In [3]:
# Names of the columns used as prediction targets further down.
CHRONIC_ILL_cols = [
    'CVDINFR4',
    'CVDCRHD4',
    'CVDSTRK3',
    'ASTHMA3',
    'CHCSCNCR',
    'CHCOCNCR',
    'CHCCOPD1',
    'HAVARTH3',
    'ADDEPEV2',
    'CHCKIDNY',
]

In [4]:
## KEEP ONLY THE ROWS WHERE EVERY TARGET COLUMN HOLDS A YES/NO CODE (1 OR 2)
has_yes_no_code = df[CHRONIC_ILL_cols].isin([1, 2])
# A row qualifies only when all of its target columns carry one of the two codes.
TRUE_INDEX = has_yes_no_code.all(axis=1)
df = df.loc[TRUE_INDEX]

In [5]:
# Binary target matrix (NumPy array): code 2 becomes 0, every other value becomes 1.
df_y = np.where(df[CHRONIC_ILL_cols] != 2, 1, 0)
# Feature frame: all remaining columns once the targets are removed.
df_x = df.drop(columns=CHRONIC_ILL_cols)

(433867, 10)
(433867, 342)


In [6]:
# Remove degenerate feature columns before splitting.
#
# Two groups are identified by position:
#   * bad_index_positions1 - columns whose mean is exactly 0
#   * bad_index_positions2 - columns whose sample standard deviation is
#     exactly 0 (constant columns)
# Previously only the first group was dropped; the second list and
# `remove_cols` were computed but never used, so constant columns survived
# and turned into NaN (division by a zero std) during standardization.
col_means = np.mean(df_x, axis=0)
col_stds = np.std(df_x, axis=0, ddof=1)

bad_index_positions1 = np.flatnonzero(np.asarray(col_means) == 0).tolist()
bad_index_positions2 = np.flatnonzero(np.asarray(col_stds) == 0).tolist()

# Union of both groups, resolved to column labels in one indexing step
# (replaces the quadratic `i in list` scan over every column).
bad_positions = sorted(set(bad_index_positions1) | set(bad_index_positions2))
remove_cols = list(df_x.columns[bad_positions])
df_x = df_x.drop(columns=remove_cols)

In [7]:
# Train-Test Split: 33% of the rows are held out for testing; the fixed
# random_state makes the shuffle repeatable.
X_train, x_test, Y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.33,
    random_state=42,
)

In [8]:
# Column-wise mean and sample standard deviation (ddof=1) of the training split.
Mu_train, Sigma_train = X_train.mean(axis=0), X_train.std(axis=0, ddof=1)

# The same two statistics, computed separately on the test split.
Mu_test, Sigma_test = x_test.mean(axis=0), x_test.std(axis=0, ddof=1)

In [9]:
# Standardize BOTH splits with the training-set statistics.
# Scaling the test split with its own mean/std would place it in a different
# feature space from the one the downstream model is fitted on, and would
# use information from the held-out rows.
#
# A zero training std is mapped to NaN so that such a column becomes NaN in
# both splits (instead of NaN in one and +/-inf in the other); the
# NaN-column cleanup that follows then removes it from both consistently.
sigma_train_safe = Sigma_train.replace(0, np.nan)

X_train_stdz = (X_train - Mu_train) / sigma_train_safe
X_test_stdz = (x_test - Mu_train) / sigma_train_safe

## DROP ANY COLUMNS WITH NAN VALUES

In [10]:
# Positions of the feature COLUMNS that contain at least one NaN.
# `.isnull().any()` reduces over rows, so despite the historical names these
# are column positions, not row positions.
TRAIN_NAN_ROWS = X_train_stdz.isnull().any().values.nonzero()[0]
TEST_NAN_ROWS = X_test_stdz.isnull().any().values.nonzero()[0]

# The label arrays (Y_train / y_test) are deliberately left untouched:
# only feature columns are removed, so every sample keeps all of its targets.
# Using feature-column positions to delete label columns would remove the
# wrong targets or index past the end of the label array.

# Drop the same set of columns from both splits (the union of the columns
# that are NaN in either one) so train and test keep identical features.
nan_feature_cols = X_train_stdz.columns[TRAIN_NAN_ROWS].union(
    X_test_stdz.columns[TEST_NAN_ROWS]
)
X_train_stdz = X_train_stdz.drop(columns=nan_feature_cols)
X_test_stdz = X_test_stdz.drop(columns=nan_feature_cols)

# Sanity check: the fitted transform must see the same features in both splits.
assert X_train_stdz.columns.equals(X_test_stdz.columns), "train/test feature columns diverged"

# PCA

In [11]:
## USE THE PCA MODULE TO SIMPLY APPLY PCA AND TAKE CARE OF THE COMPLEX VALUE ISSUE
# Fit on the standardized training features only; 0.95 is passed as n_components.
sk_pca = PCA(n_components=0.95).fit(X_train_stdz)

# Project both splits with the transform fitted on the training data.
train_pca = sk_pca.transform(X_train_stdz)
test_pca = sk_pca.transform(X_test_stdz)

In [14]:
#np.save('./data/TRAIN_PCA.npy', train_pca)
#np.save('./data/TRAIN_Y.npy', Y_train)
#np.save('./data/TRAIN_FULL.npy', X_train_stdz)

In [13]:
#np.save('./data/TEST_PCA.npy', test_pca)
#np.save('./data/TEST_Y.npy', y_test)
#np.save('./data/TEST_FULL.npy', X_test_stdz)