In [1]:
import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt
from matplotlib import style
style.use('dark_background')

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics

import util

In [2]:
# Feature table: one row per sample, including the 'apn', 'group' and 'file' columns
df = pd.read_csv('features/feature_train.csv')
# Per-file table; its 'file' and 'group' columns drive the train/validation split later on
train_df = pd.read_csv('resources/File_train.csv')

# Target column, and every column that is left once the non-feature columns are removed
label_col = 'apn'
feature_col = df.drop(columns=['apn', 'group', 'file']).columns

# 1. L1 regularization
    For Logistic Regression
    - The baseline model shows a 3% difference between training and validation accuracy
    - With L1 regularization, the difference is reduced to 2%, but overall accuracy decreases by 6%
    For Linear SVC
    - The baseline model performs worse than Logistic Regression
    - With L1 regularization, the performance improves
    - Convergence issues are severe, so the findings above might not be accurate

In [None]:
# Cross-validated accuracy WITHOUT normalization for each candidate estimator.
# max_iter is written as an int: recent scikit-learn validates the parameter and
# rejects floats such as 1e6.
logreg_1 = LogisticRegression(solver='lbfgs', max_iter=1_000_000)  # baseline
logreg_2 = LogisticRegression(C=0.01, penalty="l1", dual=False, solver='saga', max_iter=1_000_000)
logreg_3 = LogisticRegression(C=1, penalty="l1", dual=False, solver='saga', max_iter=1_000_000)
lsvc_baseline = LinearSVC(max_iter=1_000_000)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter=1_000_000)

# (label printed in the report, estimator) — evaluated in this order
experiments = [
    ('Baseline performance', logreg_1),
    ('With L1 (C=0.01)', logreg_2),
    ('With L1 (C=1)', logreg_3),
    ('Baseline for Linear SVC', lsvc_baseline),
    ('Linear SVC (C=0.01)', lsvc),
]

for label, estimator in experiments:
    # The helper lives in the `util` module (the notebook only does `import util`),
    # so it must be called through the module; the bare name raises NameError.
    # NOTE(review): the executed normalized cell unpacks two return values from this
    # helper; slicing with [:2] works whether it returns (train, val) or
    # (train, val, extra) — confirm against util.model_evaluation_CV.
    acc_train, acc_val = util.model_evaluation_CV(estimator, df, train_df, feature_col)[:2]
    print(f'{label}: {acc_train:.3f} for training, {acc_val:.3f} for validation')


In [1]:
# NOTE(review): scratch cell. LogisticRegression is already imported in the first
# cell, so this import is redundant; consider removing the whole cell.
from sklearn.linear_model import LogisticRegression
# Default-configured estimator; nothing in this cell fits it.
logreg = LogisticRegression()

In [None]:
# NOTE(review): predict() is called without a feature matrix, and `logreg` is not
# fitted in any visible cell, so this call is expected to raise. Looks like leftover
# scratch work — confirm and remove.
logreg.predict()

# 2. With normalization
    Normalization mitigates the convergence issue
    For different models and hyperparameters, validation accuracy is ~0.82

In [3]:
# Cross-validated accuracy WITH normalization for each candidate estimator.
# max_iter is written as an int: recent scikit-learn validates the parameter and
# rejects floats such as 1e6.
logreg_1 = LogisticRegression(solver='lbfgs', max_iter=1_000_000)  # baseline
logreg_2 = LogisticRegression(C=0.01, penalty="l1", dual=False, solver='saga', max_iter=1_000_000)
logreg_3 = LogisticRegression(C=1, penalty="l1", dual=False, solver='saga', max_iter=1_000_000)
lsvc_baseline = LinearSVC(max_iter=1_000_000)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter=1_000_000)

# (label printed in the report, estimator) — evaluated in this order
experiments = [
    ('Baseline performance', logreg_1),
    ('With L1 (C=0.01)', logreg_2),
    ('With L1 (C=1)', logreg_3),
    ('Baseline for Linear SVC', lsvc_baseline),
    ('Linear SVC (C=0.01)', lsvc),
]

for label, estimator in experiments:
    # normalize=True is forwarded to the helper; how the scaling is applied is
    # defined in util.model_evaluation_CV, not in this notebook.
    acc_train, acc_val = util.model_evaluation_CV(estimator, df, train_df, feature_col, normalize=True)
    print(f'{label}: {acc_train:.3f} for training, {acc_val:.3f} for validation')


Baseline performance: 0.858 for training, 0.827 for validation
With L1 (C=0.01): 0.843 for training, 0.818 for validation
With L1 (C=1): 0.858 for training, 0.827 for validation
Baseline for Linear SVC: 0.858 for training, 0.827 for validation
Linear SVC (C=0.01): 0.853 for training, 0.828 for validation


# 3. Feature selection with L1 regularization
# Test L1 regularization (feature selection) with different models and hyperparameters
X, y = df[feature_col], df[label_col]
print(X.shape)

# Hold out 20% of the files, stratified on group, then map the chosen files back to rows.
# Assumes each file is listed once in train_df — TODO confirm; otherwise a file could
# land on both sides of the split.
file_train, file_val = train_test_split(
    train_df['file'], test_size=0.2, stratify=train_df['group'], random_state=123
)
in_train = df['file'].isin(file_train)
in_val = df['file'].isin(file_val)
X_train, X_val = df.loc[in_train, feature_col], df.loc[in_val, feature_col]
y_train, y_val = df.loc[in_train, label_col], df.loc[in_val, label_col]

# max_iter is written as an int: recent scikit-learn validates the parameter and
# rejects floats such as 1e4.
# NOTE(review): these names are also assigned by earlier cells; they are kept so that
# anything reading them after this cell sees the same objects as before.
logreg_1 = LogisticRegression(C=0.01, penalty="l1", dual=False, solver='saga', max_iter=10_000)
logreg_2 = LogisticRegression(C=1, penalty="l1", dual=False, solver='saga', max_iter=10_000)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter=10_000)

for estimator in (logreg_1, logreg_2, lsvc):
    estimator.fit(X_train, y_train)
    print(estimator.score(X_train, y_train))
    print(estimator.score(X_val, y_val))

    # prefit=True: reuse the estimator fitted above instead of refitting inside the selector
    model = SelectFromModel(estimator, prefit=True)
    X_new = model.transform(X)
    print(X_new.shape)
    # get_support() is a boolean mask of kept features; invert it to list the dropped ones
    print(X.columns[~model.get_support()])