In [1]:
import sys
import os

# Add the directory that contains implementations.py
sys.path.append(os.path.abspath(r"../"))

from implementations import *
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from log_helpers import *
# Paths to the X and y data.
# Built with os.path.join so they resolve on every operating system:
# a hard-coded backslash is only treated as a path separator on Windows.
X_path = os.path.join('..', 'data', 'x_train.csv')
y_path = os.path.join('..', 'data', 'y_train.csv')

# Load the data (frac=0.01 asks the loader for 1% of the rows)
X, y = load_csv_data(X_path, y_path, frac=0.01)

# Turn the loaded tables into the model inputs used by the cells below
tx, y = clean_and_standardize(X, y)

  col_means = np.nanmean(xtest, axis=0)


In [6]:

# Set initial parameters
initial_w = np.zeros((tx.shape[1], 1))  # One weight per column of tx, all starting at 0
max_iters = 100  # Number of iterations
gamma = 0.1  # Learning rate
lambda_ = 1  # Regularization parameter (lambda)

# Weights before training
print(initial_w[:4])

# Train the model using regularized logistic regression.
# A copy of initial_w is handed over: the previously recorded output of this
# cell showed initial_w itself holding non-zero values after the call, which
# indicates that the optimizer updates the array it receives in place.
final_w, final_loss = reg_logistic_regression(
    y, tx, lambda_, initial_w.copy(), max_iters, gamma
)

# Weights after training (the learned weights, not the starting point)
print(final_w[:4])

[[0.]
 [0.]
 [0.]
 [0.]]
[[-1.74964637e+00]
 [ 2.76557646e-03]
 [-4.33843228e-04]
 [ 6.77779976e-03]]


In [11]:
def split_data(x, y, ratio, seed=1):
    """
    Randomly partition paired samples into a training and a testing part.

    The global NumPy random generator is re-seeded with ``seed`` and a random
    permutation of the ``len(y)`` sample indices is drawn. The first
    ``floor(ratio * len(y))`` permuted indices select the training samples,
    the remaining ones select the testing samples. The same indices are used
    for ``x`` and ``y`` so that samples and labels stay paired.

    Args:
        x: numpy array whose first axis indexes the samples.
        y: numpy array whose first axis indexes the samples; its length
            defines the number of samples N.
        ratio: scalar in [0,1], share of the samples assigned to training.
        seed: integer passed to np.random.seed.

    Returns:
        x_tr: numpy array containing the training data.
        x_te: numpy array containing the testing data.
        y_tr: numpy array containing the training labels.
        y_te: numpy array containing the testing labels.

    >>> split_data(np.arange(13), np.arange(13), 0.8, 1)
    (array([ 2,  3,  4, 10,  1,  6,  0,  7, 12,  9]), array([ 8, 11,  5]),
     array([ 2,  3,  4, 10,  1,  6,  0,  7, 12,  9]), array([ 8, 11,  5]))
    """
    # Seed first, then draw: this order makes the split reproducible
    np.random.seed(seed)
    n_samples = len(y)
    order = np.random.permutation(n_samples)

    # Number of training samples, rounded down when ratio * N is fractional
    n_train = int(np.floor(ratio * n_samples))

    # Leading part of the permutation -> training, trailing part -> testing
    train_idx, test_idx = order[:n_train], order[n_train:]

    return x[train_idx], x[test_idx], y[train_idx], y[test_idx]

In [None]:
def standardize(x):
    """
    Standardize a data set column-wise.

    Each column is shifted by its mean and divided by its standard deviation
    (both computed along axis 0). Columns whose standard deviation is exactly
    zero are divided by 1 instead, so constant columns do not trigger a
    division by zero.

    Args:
        x: data set with samples along axis 0 and features along axis 1.

    Returns:
        The standardized data, the per-column means, and the per-column
        standard deviations that were used as divisors (zeros replaced by 1).
    """
    center = np.mean(x, axis=0)
    scale = np.std(x, axis=0)

    # A constant column has zero spread: divide it by 1 rather than by 0
    scale[scale == 0] = 1

    return (x - center) / scale, center, scale


def clean_and_standardize(X, y):
    """
    Standardize the features, fill in missing values and build the model inputs.

    Steps:
      1. Standardize X column-wise with `standardize`.
      2. Replace every NaN by the mean of the non-NaN values of its
         (standardized) column. Columns without a single valid value are
         filled with 0.
      3. Prepend a column of ones so the model can learn a bias term.
      4. Convert y to a numpy column vector.

    Args:
        X: 2-D feature table with one row per sample (this notebook passes
            the pandas DataFrame returned by `load_csv_data`).
        y: labels with one entry per sample; a pandas DataFrame/Series or a
            numpy array.

    Returns:
        tx: numpy array of shape (N, D + 1): a leading column of ones followed
            by the D standardized, NaN-free feature columns.
        y: numpy array of shape (N, 1).
    """
    # Standardize the features (the returned mean/std are not needed here)
    X_std, _, _ = standardize(X)

    # Independent numpy copy, so the caller's data is never modified
    features = np.copy(X_std)

    # Locate the missing entries
    nan_mask = np.isnan(features)

    # Per-column mean over the valid entries only, computed as sum / count.
    # Dividing by max(count, 1) yields 0 for columns that are entirely NaN
    # and avoids the "Mean of empty slice" RuntimeWarning that np.nanmean
    # emits for such columns.
    n_valid = (~nan_mask).sum(axis=0)
    col_sums = np.where(nan_mask, 0.0, features).sum(axis=0)
    col_means = col_sums / np.maximum(n_valid, 1)

    # Replace each NaN by the mean of its column (col_means broadcasts over rows)
    features = np.where(nan_mask, col_means, features)

    # np.asarray accepts DataFrames, Series and plain arrays alike
    y = np.asarray(y)

    # Add a column of ones to include the bias
    tx = np.c_[np.ones((y.shape[0], 1)), features]

    # Reshape y to ensure it is a column vector
    y = y.reshape(-1, 1)

    return tx, y

In [None]:
import pandas as pd

def load_csv_data(X_path, y_path, frac=1):
    """
    Loads X (features) and y (labels) from two CSV files,
    converts -1, 1 targets to 0, 1 targets, and returns only a fraction of the data.

    Both files must contain an 'Id' column; rows are paired by merging on it,
    so features and labels stay aligned. The labels are read from the
    '_MICHD' column of the merged table.

    Parameters:
        X_path (str): Path to the CSV file containing the X data (features).
        y_path (str): Path to the CSV file containing the y data (labels).
        frac (float): Fraction of the rows to return. A value strictly between
            0 and 1 draws a reproducible random sample (random_state=42);
            any other value, including the default 1, returns all rows.

    Returns:
        X_df (pd.DataFrame): Every merged column except '_MICHD'. Note that
            the 'Id' column is still part of this frame.
        y_df (pd.DataFrame): Single-column frame '_MICHD' with -1 mapped to 0.
            It shares its index with X_df.
    """
    # Load the X (features) data from the CSV file
    X_df = pd.read_csv(X_path)

    # Load the y (target) data from the other CSV file
    y_df = pd.read_csv(y_path)

    # Merge both DataFrames on the 'Id' column to ensure correct alignment
    data = pd.merge(X_df, y_df, on='Id')

    # Select only a fraction of the data if needed
    if 0 < frac < 1:
        data = data.sample(frac=frac, random_state=42)  # Fixed seed for reproducibility

    # Separate y (labels) from X (features).
    # The explicit copy makes y_df an independent frame: assigning into a
    # column subset of `data` directly is what raises SettingWithCopyWarning.
    y_df = data[['_MICHD']].copy()
    X_df = data.drop(columns=['_MICHD'])

    # Convert -1, 1 targets to 0, 1 targets
    y_df['_MICHD'] = y_df['_MICHD'].replace({-1: 0, 1: 1})

    return X_df, y_df