Step 2: Base functions

In [None]:
import numpy as np

#Note: some functions in this section accept more parameters than they need; the unneeded ones default to 0.
#This lets the cross-validation routine call every training function with the same set of keyword arguments.

def mean_square_error_gd(y,tx,initial_w,max_iters,gamma,lambda_=0):
  '''Minimizes the mean-square-error loss ||y - tx @ w||^2 / (2N) via gradient descent.
   y is an (N,) array representing the vector of outputs.
   tx is an (N,M) array representing the matrix of inputs.
   initial_w is an (M,) array representing the initial guess for the minimum.
   max_iters is an integer representing the number of iterations.
   gamma is a real number representing the step-size.
   lambda_ is unused; it is accepted only so that every training function can be called with the same arguments.
   N represents the amount of data.
   M represents the length of minimized vector.

   Returns the pair (w, loss): the last iterate and the loss evaluated at it.
   initial_w itself is never modified.

  '''
  N=y.shape[0]
  # Float copy: protects the caller's array and lets an integer-valued
  # initial guess survive the in-place update below.
  w=np.array(initial_w,dtype=float)
  for _ in range(max_iters):
    e=y-tx @ w
    grad=-(tx.T @ e)/N
    w-=gamma*grad
  # The loss is reported at the final iterate, hence the residual is recomputed.
  e=y-tx @ w
  loss=np.linalg.norm(e)**2/(2*N)
  return w,loss

def mean_square_error_sgd(y,tx,initial_w,max_iters,gamma,lambda_=0):
  '''Minimizes the mean-square-error loss ||y - tx @ w||^2 / (2N) via stochastic gradient descent.
   Each step uses the gradient of a single sample drawn uniformly at random
   (with replacement) through np.random.choice, so the result depends on the
   global NumPy random state.
   y is an (N,) array representing the vector of outputs.
   tx is an (N,M) array representing the matrix of inputs.
   initial_w is an (M,) array representing the initial guess for the minimum.
   max_iters is an integer representing the number of iterations.
   gamma is a real number representing the step-size.
   lambda_ is unused; it is accepted only so that every training function can be called with the same arguments.
   N represents the amount of data.
   M represents the length of minimized vector.

   Returns the pair (w, loss): the last iterate and the full-data loss evaluated at it.

  '''
  N=y.shape[0]
  # Float copy: protects the caller's array and lets an integer-valued
  # initial guess survive the in-place update below.
  w=np.array(initial_w,dtype=float)
  # All sample indices are drawn up front, one per iteration.
  coefs=np.random.choice(N,max_iters)
  for j in coefs:
    x_j=tx[j,:]
    # Only the residual of the sampled row is needed; the full residual
    # vector would cost O(N*M) per step for no benefit.
    err_j=y[j]-x_j @ w
    stoch_grad=-(x_j*err_j)
    w-=gamma*stoch_grad
  e=y-tx @ w
  loss=np.linalg.norm(e)**2/(2*N)
  return w,loss

def least_squares(y,tx,initial_w=0,max_iters=0,gamma=0,lambda_=0):
  '''Computes the minimizer and the minimum of the mean-square-error loss ||y - tx @ w||^2 / (2N).
   y is an (N,) array representing the vector of outputs.
   tx is an (N,M) array representing the matrix of inputs.
   initial_w, max_iters, gamma and lambda_ are unused; they are accepted only
   so that every training function can be called with the same arguments.
   N represents the amount of data.
   M represents the length of minimized vector.

   Returns the pair (w, loss). When tx does not have full column rank the
   minimum-norm least-squares solution is returned.

  '''
  N=y.shape[0]
  # lstsq solves the least-squares problem directly on tx: it does not square
  # the condition number like forming tx.T @ tx does, and it still returns a
  # solution when the normal equations are singular.
  w=np.linalg.lstsq(tx,y,rcond=None)[0]
  e=y-tx @ w
  loss=np.linalg.norm(e)**2/(2*N)
  return w,loss

def ridge_regression(y,tx,lambda_,initial_w=0,max_iters=0,gamma=0):
  '''Solves the L2-regularized least-squares problem through its normal equations.
   The linear system (tx.T @ tx + 2*N*lambda_*I) w = tx.T @ y is solved for w.
   y is an (N,) array representing the vector of outputs.
   tx is an (N,M) array representing the matrix of inputs.
   lambda_ is a real number representing the regularization coefficient.
   initial_w, max_iters and gamma are unused placeholders.
   N represents the amount of data.
   M represents the length of minimized vector.

   Returns the pair (w, loss), where loss is ||y - tx @ w||^2 / (2N),
   i.e. the data-fit term without the regularization penalty.

  '''
  n_samples,n_features=y.shape[0],tx.shape[1]
  gram=tx.T @ tx
  penalty=2*n_samples*lambda_*np.eye(n_features)
  w=np.linalg.solve(gram+penalty,tx.T @ y)
  residual=y-tx @ w
  loss=np.linalg.norm(residual)**2/(2*n_samples)
  return w,loss

def sigmoid(x):
  '''Computes the sigmoid function 1/(1+exp(-x)) elementwise.
   x is a real number or an array of real numbers.

   The evaluation only ever exponentiates non-positive numbers, so it does not
   overflow for inputs of large magnitude.
   Returns a scalar for scalar input and an array of the same shape otherwise.

  '''
  x=np.asarray(x,dtype=float)
  # exp(-|x|) lies in (0,1], so neither branch below can overflow.
  decay=np.exp(-np.abs(x))
  # For x>=0: 1/(1+exp(-x)); for x<0 the equivalent form exp(x)/(1+exp(x)).
  values=np.where(x>=0,1/(1+decay),decay/(1+decay))
  # Indexing with () turns a 0-d result back into a scalar and leaves arrays unchanged.
  return values[()]

def logistic_regression(y, tx, initial_w, max_iters, gamma, lambda_=0):
  '''Minimizes the logistic (negative log-likelihood) loss via gradient descent.
   The loss is the mean over the samples of log(1+exp(x_j @ w)) - y_j * (x_j @ w).
   y is an (N,) array representing the vector of outputs.
   tx is an (N,M) array representing the matrix of inputs.
   initial_w is an (M,) array representing the initial guess for the minimum.
   max_iters is an integer representing the number of iterations.
   gamma is a real number representing the step-size.
   lambda_ is unused; it is accepted only so that every training function can be called with the same arguments.
   N represents the amount of data.
   M represents the length of minimized vector.

   Returns the pair (w, loss): the last iterate and the loss evaluated at it.

  '''
  N=y.shape[0]
  # Float copy: protects the caller's array and lets an integer-valued
  # initial guess survive the in-place update below.
  w=np.array(initial_w,dtype=float)
  for _ in range(max_iters):
    # Vectorized form of the per-sample sum of (sigmoid(x_j @ w)-y_j)*x_j.
    grad=tx.T @ (sigmoid(tx @ w)-y)/N
    w-=gamma*grad
  z=tx @ w
  # logaddexp(0,z) equals log(1+exp(z)) but does not overflow for large z.
  loss=np.mean(np.logaddexp(0,z)-y*z)
  return w,loss

def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
  '''Minimizes the L2-regularized logistic loss via gradient descent.
   Each step follows the gradient of the mean logistic loss plus the gradient
   2*lambda_*w of the penalty lambda_*||w||^2.
   y is an (N,) array representing the vector of outputs.
   tx is an (N,M) array representing the matrix of inputs.
   lambda_ is a real number representing the regularization coefficient.
   initial_w is an (M,) array representing the initial guess for the minimum.
   max_iters is an integer representing the number of iterations.
   gamma is a real number representing the step-size.
   N represents the amount of data.
   M represents the length of minimized vector.

   Returns the pair (w, loss), where loss is the mean logistic loss at the last
   iterate WITHOUT the regularization penalty.

  '''
  N=y.shape[0]
  # Float copy: protects the caller's array and lets an integer-valued
  # initial guess survive the in-place update below.
  w=np.array(initial_w,dtype=float)
  for _ in range(max_iters):
    # Vectorized form of the per-sample sum of (sigmoid(x_j @ w)-y_j)*x_j.
    grad=tx.T @ (sigmoid(tx @ w)-y)/N
    grad+=2*lambda_*w
    w-=gamma*grad
  z=tx @ w
  # logaddexp(0,z) equals log(1+exp(z)) but does not overflow for large z.
  loss=np.mean(np.logaddexp(0,z)-y*z)
  return w,loss

Cross-validation function

In [None]:
def square_label(tx,w):
  '''Predicts the outputs of the linear (least-squares) model.
  tx is an (N,M) array representing the matrix of inputs.
  w is an (M,) array representing the fitted weights.

  Returns the (N,) array tx @ w.

  '''
  predictions=np.matmul(tx,w)
  return predictions

def logistic_label(tx,w):
  '''Predicts the {0,1} labels of the logistic model.
  tx is an (N,M) array representing the matrix of inputs.
  w is an (M,) array representing the fitted weights of the logistic model.

  Returns the (N,) array of sigmoid(tx @ w) rounded to the nearest integer with np.round.

  '''
  scores=tx @ w
  probabilities=sigmoid(scores)
  return np.round(probabilities)

def train_score_square(y_test, y_ML):
  '''Computes the L2 and L1 scores of the quadratic model.
  y_test is an (N,) array-like of real numbers representing the true outputs.
  y_ML is an (N,) array-like of real numbers representing the predicted outputs.

  Returns the pair (mean squared error, mean absolute error).

  '''
  # Converting up front lets plain lists (and Boolean data) be scored as well.
  residual=np.asarray(y_test,dtype=float)-np.asarray(y_ML,dtype=float)
  return np.mean(residual**2),np.mean(np.abs(residual))

def train_score_logistic(y_test, y_ML):
  '''Computes the misclassification rate and the F1 score of the logistic model.
  y_test is an (N,) array-like with entries in {0,1} representing the true labels.
  y_ML is an (N,) array-like with entries in {0,1} representing the predicted labels.

  Returns the pair (error rate, F1 score), where the error rate is the mean of
  |y_test - y_ML|. The F1 score is defined as 0 when it would otherwise be
  0/0 (no true positives, or no positive label on either side).

  '''
  y_test=np.asarray(y_test,dtype=float)
  y_ML=np.asarray(y_ML,dtype=float)
  true_pos=np.count_nonzero(np.logical_and(y_ML==1,y_test==1))
  actual_pos=np.count_nonzero(y_test==1)
  predicted_pos=np.count_nonzero(y_ML==1)
  # With precision=TP/predicted_pos and recall=TP/actual_pos, the harmonic
  # mean 2PR/(P+R) simplifies to 2*TP/(actual_pos+predicted_pos); this form
  # needs a single guard against division by zero.
  denominator=actual_pos+predicted_pos
  f1=2*true_pos/denominator if denominator else 0.0
  return np.mean(np.abs(y_test-y_ML)),f1

def cross_val(mach_learn,data_repr,score_func,tx,y,folds,max_iters=0,gamma=0,lambda_=0):
  '''Cross-validation of the model. Returns two different model scores.
  mach_learn - training function; called with the keyword arguments y, tx,
               initial_w, max_iters, gamma, lambda_ and expected to return a
               tuple whose first entry is the weight vector.
  data_repr - function (tx, w) -> predicted outputs.
  score_func - function (true outputs, predicted outputs) -> pair of scores.
  tx - (N,M) array representing the matrix of N input vectors.
  y - (N,) array representing the vector of N outputs.
  folds - integer representing the number of folds, with 2 <= folds <= N.
  max_iters - integer representing the training time on each fold.
  gamma is a real number representing the step-size.
  lambda_ is a real number representing the regularization coefficient.

  The samples are shuffled with np.random.permutation, so the split depends on
  the global NumPy random state. Every fold holds N//folds samples; the
  N % folds samples left over are used for training only. Training always
  starts from the zero vector.

  Returns the means over the folds of the two scores given by score_func.
  Raises ValueError if folds is smaller than 2 or larger than N.

  '''
  N,M=tx.shape
  # folds<2 leaves no training data and folds>N gives empty test folds; both
  # used to fail later with an unrelated error (or silently produce nan).
  if folds<2 or folds>N:
    raise ValueError('folds must satisfy 2 <= folds <= N (got folds={}, N={})'.format(folds,N))
  fold_size=N//folds
  sf=np.random.permutation(N)
  scores=np.ones(folds)
  alt_scores=np.ones(folds)
  for k in range(folds):
    start,stop=k*fold_size,(k+1)*fold_size
    test_idx=sf[start:stop]
    train_idx=np.concatenate((sf[:start],sf[stop:]))
    # Indexing with an index array already yields fresh copies of the data.
    w=mach_learn(y=y[train_idx],tx=tx[train_idx],initial_w=np.zeros(M),max_iters=max_iters,gamma=gamma,lambda_=lambda_)[0]
    pred_labels=data_repr(tx[test_idx],w)
    scores[k],alt_scores[k]=score_func(y[test_idx],pred_labels)
  return np.mean(scores),np.mean(alt_scores)

Loading data

In [None]:
from helpers import load_csv_data

#Load the data through helpers.load_csv_data from the directory '/content/'.
#Only the first three of the five returned values are kept.
#NOTE(review): the two discarded values presumably include the sample ids - confirm against helpers.load_csv_data.
x_train, x_test, y_train,_,_= load_csv_data('/content/')

Removing the medically insignificant data (e.g., telephone number)

In [None]:
#Keep untouched copies of the full feature matrices.
x_train1=x_train.copy()
x_test1=x_test.copy()

#Drop the first 24 columns, except for columns 15, 16 and 17, which are kept in front of the remaining ones.
#NOTE(review): the original comment named only the "Adult" and "Gender" columns, yet three columns are kept - confirm which columns these indices are.
x_train=np.concatenate((x_train1[:,15:18],x_train1[:,24:]),axis=1)
x_test=np.concatenate((x_test1[:,15:18],x_test1[:,24:]),axis=1)

Data cleaning

In [None]:
#Here, we compute the col_list of columns in which less than 20% of the x_train entries are missing.
#For every such column, sortd holds the sorted known values of the x_train column.
#sortd is used to remove outliers - everything under the 1-st and above the 99-th percentile is set to nan.
#After removing the outliers, we check if the column is constant, in which case we don't include it into col_list.
#Otherwise, we substitute the nans by the mean of the known values.
#Finally, we subtract the columnwise mean and divide by the columnwise std to normalize x_train and x_test.
#All statistics (percentiles, imputation mean, scaling mean and std) are computed on x_train only and then
#applied to x_test as well: the model is trained on features expressed in the training scale, so the test
#features must be mapped with exactly the same transformation. This also avoids a division by zero for
#test columns that are constant and an IndexError for test columns that are entirely nan.
col_list=[]
for i in range(x_train.shape[1]):
  n=np.count_nonzero(np.isnan(x_train[:,i]))/x_train.shape[0]
  if n>=0.2:
    continue #too many missing values in this column
  sortd=np.sort(x_train[~np.isnan(x_train[:,i]),i])
  upper=sortd[int(0.99*len(sortd))]
  lower=sortd[int(0.01*len(sortd))]
  #Comparisons with nan are False, so entries that are already missing are left as they are.
  for data in (x_train,x_test):
    data[(data[:,i]>upper)|(data[:,i]<lower),i]=np.nan
  if np.nanstd(x_train[:,i])==0:
    continue #constant column: carries no information
  train_mean=np.nanmean(x_train[:,i])
  x_train[:,i]=np.nan_to_num(x_train[:,i],nan=train_mean)
  x_test[:,i]=np.nan_to_num(x_test[:,i],nan=train_mean)
  col_list.append(i)
x_train_mod=x_train[:,col_list]
x_test_mod=x_test[:,col_list]
# Feature Scaling (manually, if required)
mean_x=np.mean(x_train_mod,axis=0)
std_x=np.std(x_train_mod,axis=0)
#The test statistics are still computed so that they can be inspected, but they are not used for the scaling.
mean_test_x=np.mean(x_test_mod,axis=0)
std_test_x=np.std(x_test_mod,axis=0)
x_train_scaled=(x_train_mod-mean_x)/std_x
#A leading column of ones is added as the offset (bias) feature.
tx_train=np.c_[np.ones((x_train_scaled.shape[0],1)),x_train_scaled]
x_test_scaled=(x_test_mod-mean_x)/std_x
tx_test=np.c_[np.ones((x_test_scaled.shape[0],1)),x_test_scaled]

NameError: name 'gamma' is not defined

Testing different parameters: step-size gamma on a fine grid (0.01 to 0.02)

In [None]:
#Fine sweep of the step-size gamma for the regularized logistic regression
#(3 folds, max_iters=100, lambda_=0.02): cross-validation and F1 scores are collected for every gamma.
cross_score=[]
F1_score=[]
gamma=[0.01,0.011,0.012,0.013,0.014,0.015,0.016,0.017,0.018,0.019,0.02]
for step_size in gamma:
  #Re-seeding makes cross_val shuffle the data identically for every gamma.
  np.random.seed(0)
  #(1+y_train)/2 is used to rescale {-1,1} to {0,1}
  c,f1=cross_val(reg_logistic_regression,logistic_label,train_score_logistic,tx_train,(1+y_train)/2,3,100,step_size,0.02)
  #c is the mean misclassification rate, so 1-c is the score
  cross_score.append(1-c)
  #f1 is the mean F1-score over the folds
  F1_score.append(f1)
print(cross_score, '\n', F1_score)

0.5948136142625607
0.5702746365105008
0.5611745513866232
0.5814332247557004
0 0.9128521448685285
0.568259385665529
0.5573248407643312
0.6143790849673203
0.5711920529801324
1 0.9128490973144954
0.594896331738437
0.5607142857142857
0.5721153846153846
0.5763888888888888
2 0.9128094791120647
0.5525040387722132
0.5788561525129983
0.5694682675814752
0.6111111111111112
3 0.9127972888959321
0.5182829888712241
0.5730337078651685
0.6087751371115173
0.6037414965986394
4 0.9127393853693025
0.5722222222222222
0.5825688073394495
0.5547226386806596
0.5339673913043478
5 0.9127180524910706
0.517931609674729
0.528526148969889
0.5321027287319422
0.5276845637583892
6 0.9124925334926188
0.45659377070907886
0.45
0.4368421052631579


KeyboardInterrupt: 

Testing different parameters: step-size gamma on a coarse grid (0.001 to 1)

In [None]:
#Coarse sweep of the step-size gamma over three orders of magnitude for the regularized logistic regression
#(3 folds, max_iters=100, lambda_=0.02): cross-validation and F1 scores are collected for every gamma.
cross_score=[]
F1_score=[]
gamma=[0.001,0.002,0.005,0.01,0.02,0.05,0.1,0.2,0.5,1]
for step_size in gamma:
  #Re-seeding makes cross_val shuffle the data identically for every gamma.
  np.random.seed(0)
  #(1+y_train)/2 is used to rescale {-1,1} to {0,1}
  c,f1=cross_val(reg_logistic_regression,logistic_label,train_score_logistic,tx_train,(1+y_train)/2,3,100,step_size,0.02)
  #c is the mean misclassification rate, so 1-c is the score
  cross_score.append(1-c)
  #f1 is the mean F1-score over the folds
  F1_score.append(f1)
print(cross_score, '\n', F1_score)

Testing different parameters: regularization coefficient lambda_ (0 to 1)

In [None]:
#Sweep of the regularization coefficient lambda_ for the regularized logistic regression
#(3 folds, max_iters=100, gamma=0.013): cross-validation and F1 scores are collected for every lambda_.
cross_score=[]
F1_score=[]
lambda_=[0,0.001,0.002,0.005,0.01,0.02,0.05,0.1,0.2,0.5,1]
for reg_coef in lambda_:
  #Re-seeding makes cross_val shuffle the data identically for every lambda_.
  np.random.seed(0)
  #(1+y_train)/2 is used to rescale {-1,1} to {0,1}
  c,f1=cross_val(reg_logistic_regression,logistic_label,train_score_logistic,tx_train,(1+y_train)/2,3,100,0.013,reg_coef)
  #c is the mean misclassification rate, so 1-c is the score
  cross_score.append(1-c)
  #f1 is the mean F1-score over the folds
  F1_score.append(f1)
print(cross_score, '\n', F1_score)

Plotting the results

In [None]:
#Here, we plot the cross-validation score and the F1-score together in adjacent subplots for the experiments above.
#The scores below are hard-coded - presumably copied from the printed output of the three sweeps; re-run the sweeps to confirm.
#Entry k of every list belongs to experiment k: 0 - fine gamma sweep, 1 - coarse gamma sweep, 2 - lambda_ sweep.
import matplotlib.pyplot as plt
#Cross-validation scores, one inner list per experiment.
cross_score_list=[[0.843634003181627, 0.8508261868626841, 0.8573509602784228, 0.8631290875069332, 0.8681605685482151, 0.8728568206891087, 0.8771325129367881, 0.8808566012665557, 0.8842149853413545, 0.8870095753564092, 0.889599980495773],
                  [0.7494072543534044, 0.7620484314334998, 0.7977228815057263, 0.843634003181627, 0.889599980495773, 0.9124839242504587, 0.914044262404993, 0.9142911127770972, 0.9143124455253037, 0.9120725069636185],
                  [0.8641286791371817, 0.8640799185698526, 0.8640250629316072, 0.8638757336941615, 0.8636166931802252, 0.8631290875069332, 0.861620557455186, 0.8593836664289589, 0.8542881871430574, 0.8390291771044756, 0.8174282457776396]]
#F1 scores, one inner list per experiment.
F1_score_list=[[0.39346228813457146, 0.39566160835252323, 0.39792074757988366, 0.39802134182007315, 0.39745122215939044, 0.39702857839874933, 0.39578429371577006, 0.39352459256389194, 0.3907926662959345, 0.3874109250728939, 0.3840502865308115],
               [0.35098294372024075, 0.35762450313346944, 0.3749460774465487, 0.39346228813457146, 0.3840502865308115, 0.2682332940878564, 0.21131586043088726, 0.19528480572876492, 0.19208448488324978, 0.18962007390983215],
               [0.39805835873418943, 0.39805405144621225, 0.39799031636393406, 0.3979562412143844, 0.3979542384452552, 0.39802134182007315, 0.39798104474211254, 0.3982652682394514, 0.3972691766847049, 0.3921244685464566, 0.38365032295726875]]
#Values of the varied parameter (the x-axis of every plot).
varying_list=[[0.01,0.011,0.012,0.013,0.014,0.015,0.016,0.017,0.018,0.019,0.02],
              [0.001,0.002,0.005,0.01,0.02,0.05,0.1,0.2,0.5,1],
              [0,0.001,0.002,0.005,0.01,0.02,0.05,0.1,0.2,0.5,1]]
#Plot titles; the first two are identical because both experiments vary gamma (over different ranges).
title_list=['max_iters=100, lambda_=0.02, gamma varying', 'max_iters=100, lambda_=0.02, gamma varying', 'max_iters=100, gamma=0.013, lambda_ varying']
#Name of the varied parameter, used as the x-axis label.
x_label_list=['gamma','gamma','lambda_']
#One figure per experiment: cross-validation score in the upper subplot, F1 score in the lower one,
#both against the varied parameter on a logarithmic x-axis.
for values,cv_scores,f1_scores,title,x_label in zip(varying_list,cross_score_list,F1_score_list,title_list,x_label_list):
  fig,(ax1,ax2)=plt.subplots(2,1)
  ax1.plot(values,cv_scores)
  ax1.set_title(title)
  ax1.set_ylabel('Cross-validation score')
  ax2.plot(values,f1_scores)
  ax2.set_ylabel('F1 score')
  for ax in (ax1,ax2):
    ax.set_xlabel(x_label)
    ax.set_xscale('log')
  plt.show()


Creating CSV submission

In [None]:
from helpers import create_csv_submission

np.random.seed(0)
#Train on the whole training set with lambda_=0.02, max_iters=100 and gamma=0.013, starting from the zero vector.
labels01=(1+y_train)/2 #(1+y_train)/2 is used to rescale {-1,1} to {0,1}
w_start=np.zeros(tx_train.shape[1])
w_pred=reg_logistic_regression(labels01,tx_train,0.02,w_start,100,0.013)[0]
y_pred=logistic_label(tx_test,w_pred)
#NOTE(review): the ids are hard-coded as the range 328135..437513 - presumably the ids of the test samples; confirm against the ids returned by load_csv_data.
submission_ids=np.arange(328135,437514)
create_csv_submission(submission_ids,2*y_pred-1,'submission.csv') #2*y_pred - 1 is used to rescale {0,1} to {-1,1}