In [1]:
#import libraries
import numpy as np

In [39]:
def load_data_column(path,column_list=(0), has_header=True):
    """Load numeric CSV with missing values -> np.nan using only numpy.
    path: path to CSV file
    column_list: list of column indices to load (default: 0)
    has_header: whether the CSV file has a header row (default: True)
    """
    skip = 1 if has_header else 0
    data = np.genfromtxt(
        path,
        delimiter=",",
        skip_header=skip,
        usecols=(column_list),  # adjust based on relevant columns]       # force float to accommodate np.nan
        missing_values=0,       # treat empty strings as missing
        filling_values=0,   # replace missing with np.nan
        autostrip=True,
        invalid_raise=False
    )
    return data


In [24]:
def load_columns_name(path):
    """Load column names from CSV header."""
    with open(path, 'r') as f:
        header = f.readline().strip()
    return header.split(',')

In [36]:
#deprecated
def get_column_index(path, column_name):
    """Get the index of a column given its name."""
    columns = load_columns_name(path)
    try:
        return columns.index(column_name)
    except ValueError:
        raise ValueError(f"Column '{column_name}' not found in {path}")

In [35]:
def get_column_index_from_list(col_list,col_names):
    """Get the index of a column given its name from a list of column names.
    col_list: list of column names to find
    col_names: list of all column names
    """
    try:
        return [col_names.index(name) for name in col_list]
    except ValueError as e:
        raise ValueError(f"One of the columns '{col_list}' not found in the provided list.") from e


In [45]:
def expand_column(col,col_name):
    '''
    Expand a 1D column vector into a N array where N is the number of unique values in col.
    Each column in the output array is a binary indicator (0 or 1) of whether the corresponding
    entry in col matches the unique value for that column.
    col: 1D numpy array of categorical values
    '''
    unique_values = np.unique(col)
    print(f"Unique values in column '{col_name}': {unique_values}")
    expanded = np.zeros((col.size, unique_values.size), dtype=int)
    for i, val in enumerate(unique_values):
        expanded[:, i] = (col == val).astype(int)
    col_names = [f"{col_name}_{val}" for val in unique_values]
    return expanded, col_names

In [33]:
col=np.array([1,2,1,3,2,1,""])
expanded_col, col_names = expand_column(col, "category")
print(col)
print(expanded_col)
print(col_names)

['1' '2' '1' '3' '2' '1' '']
[[0 1 0 0]
 [0 0 1 0]
 [0 1 0 0]
 [0 0 0 1]
 [0 0 1 0]
 [0 1 0 0]
 [1 0 0 0]]
['category_', 'category_1', 'category_2', 'category_3']


In [None]:
#import data

x_test = load_data('../data/dataset/x_test.csv')
x_train = load_data('../data/dataset/x_train.csv')
y_train = load_data('../data/dataset/y_train.csv')

In [22]:
print(x_train)

[53. 33. 20. ... 39. 33. 32.]


In [25]:
x_col_name = load_columns_name('../data/dataset/x_test.csv')

In [26]:
print(x_col_name)

['Id', '_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE', 'SEQNO', '_PSU', 'CTELENUM', 'PVTRESD1', 'COLGHOUS', 'STATERES', 'CELLFON3', 'LADULT', 'NUMADULT', 'NUMMEN', 'NUMWOMEN', 'CTELNUM1', 'CELLFON2', 'CADULT', 'PVTRESD2', 'CCLGHOUS', 'CSTATE', 'LANDLINE', 'HHADULT', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH', 'HLTHPLN1', 'PERSDOC2', 'MEDCOST', 'CHECKUP1', 'BPHIGH4', 'BPMEDS', 'BLOODCHO', 'CHOLCHK', 'TOLDHI2', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW', 'CHCSCNCR', 'CHCOCNCR', 'CHCCOPD1', 'HAVARTH3', 'ADDEPEV2', 'CHCKIDNY', 'DIABETE3', 'DIABAGE2', 'SEX', 'MARITAL', 'EDUCA', 'RENTHOM1', 'NUMHHOL2', 'NUMPHON2', 'CPDEMO1', 'VETERAN3', 'EMPLOY1', 'CHILDREN', 'INCOME2', 'INTERNET', 'WEIGHT2', 'HEIGHT3', 'PREGNANT', 'QLACTLM2', 'USEEQUIP', 'BLIND', 'DECIDE', 'DIFFWALK', 'DIFFDRES', 'DIFFALON', 'SMOKE100', 'SMOKDAY2', 'STOPSMK2', 'LASTSMK2', 'USENOW3', 'ALCDAY5', 'AVEDRNK2', 'DRNK3GE5', 'MAXDRNKS', 'FRUITJU1', 'FRUIT1', 'FVBEANS', 'FVGREEN', 'FVORANG', 'VEGETAB1', 'EXERANY2', 'EXR

In [48]:
col_list = ['_PACAT1','_LMTWRK1']
col_indices = get_column_index_from_list(col_list, x_col_name)
print(col_indices)
#load x_train with only the columns in col_indices
x_train_subset = load_data_column('../data/dataset/x_train.csv', column_list=col_indices)
print(x_train_subset)
#expand the colomns in x train_subset
expanded_cols = []
expanded_col_names = []
for i, col_name in enumerate(col_list):
    expanded_col, col_names = expand_column(x_train_subset[:, i], col_name)
    expanded_cols.append(expanded_col)
    expanded_col_names.extend(col_names)
x_train_expanded = np.hstack(expanded_cols)
#add the columns name back to x_train_expanded
print(expanded_col_names)
print(x_train_expanded)
x_train_expanded_dataset = np.vstack((expanded_col_names, x_train_expanded))
#save x_train_expanded to csv
np.savetxt("x_train_expanded.csv", x_train_expanded_dataset, fmt="%s", delimiter=",")
print("x_train_expanded saved to x_train_expanded.csv")


[306, 315]
[[2. 3.]
 [9. 3.]
 [3. 2.]
 ...
 [3. 3.]
 [3. 3.]
 [9. 2.]]
Unique values in column '_PACAT1': [1. 2. 3. 4. 9.]
Unique values in column '_LMTWRK1': [0. 1. 2. 3. 9.]
['_PACAT1_1.0', '_PACAT1_2.0', '_PACAT1_3.0', '_PACAT1_4.0', '_PACAT1_9.0', '_LMTWRK1_0.0', '_LMTWRK1_1.0', '_LMTWRK1_2.0', '_LMTWRK1_3.0', '_LMTWRK1_9.0']
[[0 1 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 [0 0 1 ... 1 0 0]
 ...
 [0 0 1 ... 0 1 0]
 [0 0 1 ... 0 1 0]
 [0 0 0 ... 1 0 0]]
x_train_expanded saved to x_train_expanded.csv


In [47]:
def build_k_indices(y, k_fold, seed):
    """build k indices for k-fold.

    Args:
        y:      shape=(N,)
        k_fold: K in K-fold, i.e. the fold num
        seed:   the random seed

    Returns:
        A 2D array of shape=(k_fold, N/k_fold) that indicates the data indices for each fold

    >>> build_k_indices(np.array([1., 2., 3., 4.]), 2, 1)
    array([[3, 2],
           [0, 1]])
    """
    num_row = y.shape[0]
    interval = int(num_row / k_fold)
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    k_indices = [indices[k * interval : (k + 1) * interval] for k in range(k_fold)]
    return np.array(object=k_indices, dtype=int)

In [49]:
#load x_train and y_train
x_train = load_data_column('x_train_expanded.csv', column_list=None)
y_train = load_data_column('../data/dataset/y_train.csv', column_list=None)

In [50]:
#split the dataset in 10 folds and define x_train as 9 out of 10 folds and x_test as the remaining fold, same for y_train and y_test
k_fold = 10
seed = 1
k_indices = build_k_indices(y_train, k_fold, seed)
print(k_indices)
for k in range(k_fold):
    x_test = x_train[k_indices[k]]
    y_test = y_train[k_indices[k]]
    x_train_fold = np.delete(x_train, k_indices[k], axis=0)
    y_train_fold = np.delete(y_train, k_indices[k], axis=0)
    print(f"Fold {k+1}")
    print(f"x_test shape: {x_test.shape}, y_test shape: {y_test.shape}")
    print(f"x_train shape: {x_train_fold.shape}, y_train shape: {y_train_fold.shape}")
#print shape of x_train and y_train and x_test and y_test
print(f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}, y_test shape: {y_test.shape}")

[[  5034 271803 120350 ... 235713 288612  41205]
 [115126 279242  14551 ...  91185 220011 312741]
 [162468 179536  84131 ... 234610 232051 217560]
 ...
 [146820 310758  72754 ... 220201  86341  37943]
 [ 37130 253130  23474 ... 325326  53369 222516]
 [268584 140632 188469 ... 293372 229520  21440]]
Fold 1
x_test shape: (32813, 10), y_test shape: (32813, 2)
x_train shape: (295322, 10), y_train shape: (295322, 2)
Fold 2
x_test shape: (32813, 10), y_test shape: (32813, 2)
x_train shape: (295322, 10), y_train shape: (295322, 2)
Fold 3
x_test shape: (32813, 10), y_test shape: (32813, 2)
x_train shape: (295322, 10), y_train shape: (295322, 2)
Fold 4
x_test shape: (32813, 10), y_test shape: (32813, 2)
x_train shape: (295322, 10), y_train shape: (295322, 2)
Fold 5
x_test shape: (32813, 10), y_test shape: (32813, 2)
x_train shape: (295322, 10), y_train shape: (295322, 2)
Fold 6
x_test shape: (32813, 10), y_test shape: (32813, 2)
x_train shape: (295322, 10), y_train shape: (295322, 2)
Fold 7
x_t

In [51]:
def calculate_loss(y, tx, w):
    """compute the cost by negative log likelihood.

    Args:
        y:  shape=(N, 1)
        tx: shape=(N, D)
        w:  shape=(D, 1)

    Returns:
        a non-negative loss (scalar)
    """

    # compute the loss: negative log likelihood
    y_hat = sigmoid(tx @ w)
    loss = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
    return float(loss)


def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """Logistic regression using gradient descent.

    Args:
        y: numpy array of shape (N, 1)
        tx: numpy array of shape (N, D)
        initial_w: numpy array of shape (D, 1)
        max_iters: scalar
        gamma: scalar

    Returns:
        losses: list of loss values
        ws: list of weights
    """
    ws = [initial_w]
    w = initial_w
    losses = [calculate_loss(y, tx, w)]
    for n_iter in range(max_iters):
        gradient = calculate_gradient(y, tx, w)
        w = w - gamma * gradient
        loss = calculate_loss(y, tx, w)
        ws.append(w)
        losses.append(loss)

    return ws[-1], np.asarray(losses[-1])


def sigmoid(t):
    """apply sigmoid function on t.

    Args:
        t: scalar or numpy array

    Returns:
        scalar or numpy array
    """

    return 1 / (1 + np.exp(-t))


def calculate_gradient(y, tx, w):
    """compute the gradient of loss.

    Args:
        y:  shape=(N, 1)
        tx: shape=(N, D)
        w:  shape=(D, 1)

    Returns:
        a vector of shape (D, 1)
    """

    y_hat = sigmoid(tx @ w)
    gradient = tx.T @ (y_hat - y) / y.shape[0]
    return gradient


def penalized_logistic_regression(y, tx, w, lambda_):
    """return the loss and gradient.

    Args:
        y:  shape=(N, 1)
        tx: shape=(N, D)
        w:  shape=(D, 1)
        lambda_: scalar

    Returns:
        loss: scalar number
        gradient: shape=(D, 1)
    """
    gradient = calculate_gradient(y, tx, w) + lambda_ * 2 * w
    loss = calculate_loss(y, tx, w)

    return float(loss), gradient

In [None]:
#run penalized logistic regression on x_train and y_train
initial_w = np.zeros((x_train.shape[1], 1))
max_iters = 100
gamma = 0.01
lambda_ = 0.1
loss,w = penalized_logistic_regression(y_train, x_train, initial_w, lambda_)
print(f"Final loss: {loss}")
#cross validate with x-test and y-test
y_pred = sigmoid(x_test @ w) >= 0.5
print(f"Predictions: {y_pred.flatten()}")
print(f"True labels: {y_test.flatten()}")
#compute accuracy
accuracy = np.mean(y_pred.flatten() == y_test.flatten())
print(f"Accuracy: {accuracy * 100:.2f}%")


Final loss: [[-4.86567855e+04  3.97526933e-01]
 [-2.65778995e+04  2.21741052e-01]
 [-2.66269771e+04  2.21107166e-01]
 [-4.15035332e+04  3.15589620e-01]
 [-2.07013047e+04  1.67431088e-01]
 [-9.33524484e+02  7.30339647e-03]
 [-1.56235550e+04  1.07167782e-01]
 [-3.34106005e+04  2.49287641e-01]
 [-1.08371796e+05  9.18975422e-01]
 [-5.72702441e+03  4.06616179e-02]]


ValueError: matmul: Input operand 1 does not have enough dimensions (has 0, gufunc core with signature (n?,k),(k,m?)->(n?,m?) requires 1)