What's done in this notebook:

- Encoding target label with LabelEncoder to be 0, 1, 2
- Making dummies out of "Group" feature
- Scaling numerical features with StandardScaler
- Ran RandomForest, KNN and CatBoost (CatBoost still needs debugging, and should be trained on a GPU because the grid is too big for my laptop)

- Then decided to reduce dimensionality, because too many features have a low impact on the target (refer to the correlation map)
- Ran a PCA analysis to determine how many features to keep after dimensionality reduction.
After getting this number (60), reduced the dimensionality of the data with PCA to 60 features


Questions to ask: What should we do with the target? Does it make sense to keep it encoded as 0, 1, 2, or should we turn it into a dummy (one-hot) vector? Which metric should we use as the target metric when training? Can we use the scorer given in the Hackathon description?

In [1]:
#TODO debug scoring metric to correspond with hackathon task

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import cost_function
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer


test_data_no_target = pd.read_csv('data/test_data_no_target.csv', sep=';', decimal=',')
training_data = pd.read_csv('data/training_data.csv', sep=';', decimal=',')


In [2]:
def xy_split(data):
    """Separate a labelled frame into model inputs and the target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both a ``'Class'`` and a ``'Perform'`` column.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``data`` without ``'Class'`` and ``'Perform'``,
        ``y`` is the ``'Class'`` column.
    """
    target = data['Class']
    features = data.drop(columns=['Class', 'Perform'])
    return features, target

def encode(data):
    """Label-encode the ``'Class'`` column on a copy of ``data``.

    A fresh ``LabelEncoder`` is fitted on the ``'Class'`` values, which are
    then replaced by the resulting integer codes. The input frame itself is
    left untouched.

    Returns
    -------
    dict
        ``{'enc': <fitted LabelEncoder>, 'data': <encoded copy of data>}``.
    """
    label_encoder = LabelEncoder()
    encoded = data.copy()
    encoded['Class'] = label_encoder.fit_transform(encoded['Class'])
    return {'enc': label_encoder, 'data': encoded}

def decode(encoder, data):
    """Map encoded labels back to their original values.

    Parameters
    ----------
    encoder : object
        Fitted encoder exposing ``inverse_transform``.
    data : array-like with ``ravel``
        Encoded labels; flattened before decoding.

    Returns
    -------
    list
        Decoded labels as a plain Python list.
    """
    flat_codes = data.ravel()
    original_labels = encoder.inverse_transform(flat_codes)
    return original_labels.tolist()

def dummies(data):
    """Replace the ``'Group'`` column with one indicator column per value.

    Works on a copy, so the input frame is not modified. The indicator
    columns produced by ``pd.get_dummies`` are appended after the remaining
    original columns.
    """
    frame = data.copy()
    indicators = pd.get_dummies(frame['Group'])
    remaining = frame.drop(columns=['Group'])
    return pd.concat((remaining, indicators), axis=1)

def submit(path, predictions):
    """Write predictions to ``path``, one value per line.

    Parameters
    ----------
    path : str
        Destination file; created or overwritten.
    predictions : array-like
        Predicted labels. NumPy arrays, pandas Series and plain Python
        sequences (such as the list returned by ``decode``) are accepted.

    Returns
    -------
    bool
        Always ``True`` once the file has been written.
    """
    # Going through np.asarray lets plain lists through; calling .tolist()
    # directly on the argument raises AttributeError for them.
    values = np.asarray(predictions).tolist()
    with open(path, 'w') as f:
        for prediction in values:
            f.write(f"{prediction}\n")  # one prediction per line

    print("Submission file created:", path)
    return True


def knn_imputer(data, n_neighbors=2):
    """Fit a ``KNNImputer`` on ``data`` and return it with the imputed values.

    Parameters
    ----------
    data : array-like
        Values to impute; passed straight to ``KNNImputer.fit_transform``.
    n_neighbors : int, default 2
        Number of neighbours used by the imputer. The default keeps the
        previously hard-coded value.

    Returns
    -------
    tuple
        ``(imputer, imputed)``: the fitted imputer and the output of
        ``fit_transform``.
    """
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed = imputer.fit_transform(data)
    return (imputer, imputed)

def target_eval(preds, test):
    """Score predictions with ``cost_function.evaluate_error``.

    Both arguments are forwarded unchanged, in the order ``(preds, test)``,
    and the value returned by ``evaluate_error`` is passed back to the caller.
    """
    error = cost_function.evaluate_error(preds, test)
    return error


def pca_analysis(X_train, threshold=0.90):
    """Plot cumulative explained variance against the number of PCA components.

    Parameters
    ----------
    X_train : array-like
        Data the full PCA (all components) is fitted on.
    threshold : float, default 0.90
        Explained-variance level marked with a dashed horizontal line.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Only the fitted variance ratios are needed, so the projected data is
    # not computed.
    pca_model = PCA()
    pca_model.fit(X_train)

    # Cumulative variance explained by the first k components.
    cumulative_variance = np.cumsum(pca_model.explained_variance_ratio_)
    # Entry i covers i + 1 components, so plot against 1..n rather than the
    # implicit 0-based index.
    component_counts = np.arange(1, len(cumulative_variance) + 1)

    plt.figure(figsize=(8, 4))
    plt.plot(component_counts, cumulative_variance, marker='o')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('Cumulative Explained Variance by PCA Components')
    plt.grid(True)
    # Mark the requested cut-off and label it just below the line.
    plt.axhline(y=threshold, color='r', linestyle='--')
    plt.text(0.5, threshold - 0.05, f'{threshold:.0%} cut-off threshold', color='red', fontsize=12)
    plt.show()

def pca(data, n_components=2):
    """Fit PCA on ``data`` and return the fitted model with the projection.

    Parameters
    ----------
    data : array-like
        Data to fit and project.
    n_components : int, default 2
        Number of components to keep. The default keeps the previously
        hard-coded value.

    Returns
    -------
    tuple
        ``(fitted PCA, projected data)``.
    """
    # A distinct local name avoids shadowing this function inside its body.
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(data)
    return reducer, projected


In [3]:
# Candidate preprocessing settings: the imputer to use and, for the KNN
# imputer, the n_neighbors values to try.
preprocess_params = dict(
    imputer=["SimpleImputer", "KNNImputer"],
    n_neighbors=[4, 6, 8, 10],
)


In [4]:
# Encode the target, one-hot encode "Group", then split into features/target.
encode_dct = encode(training_data)
encoder, data = encode_dct['enc'], encode_dct['data']

data = dummies(data)
X, y = xy_split(data)

In [5]:
X

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,G10,G11,G2,G3,G4,G5,G6,G7,G8,G9
0,0.136495,-0.028429,-0.037772,-0.232459,-0.016222,-0.187506,-0.322545,-0.043743,0.125389,-0.014757,...,0,0,0,0,0,0,0,0,0,1
1,-0.714522,-0.042137,-0.052968,-0.796862,-0.018394,0.070102,-0.076321,-0.063864,-1.045521,-0.037353,...,0,0,0,0,0,1,0,0,0,0
2,0.104791,-0.038188,-0.053191,0.620233,0.148587,0.489875,0.319274,-0.060246,0.053174,-0.025008,...,1,0,0,0,0,0,0,0,0,0
3,-0.532847,-0.006582,-0.023377,1.306702,-0.068909,0.048024,-0.119481,-0.021057,-1.012916,-0.011783,...,0,0,1,0,0,0,0,0,0,0
4,-0.200815,-0.016334,-0.036754,-0.886675,0.484495,-1.148744,0.152517,-0.043580,-0.935537,-0.023262,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-0.946477,-0.039767,-0.051916,,-9.099326,1.470616,1.470374,-0.045827,-1.323460,-0.030871,...,0,0,0,0,0,0,0,0,0,0
7996,-0.521183,-0.038272,-0.047929,-0.820632,-0.179414,-0.774646,0.413706,-0.051830,-0.378937,-0.034396,...,0,0,0,0,0,0,0,1,0,0
7997,0.323233,-0.031075,-0.045713,-0.105448,-0.103823,0.153551,-0.052157,-0.056057,-0.163512,-0.018169,...,0,0,1,0,0,0,0,0,0,0
7998,1.084416,-0.028497,-0.047528,-0.180118,-0.634162,0.143589,-0.033258,-0.058396,0.088713,-0.021483,...,0,0,0,0,1,0,0,0,0,0


In [39]:
# Bucket every column by its number of distinct non-null values:
#   exactly 2 -> binary, 3..9 -> categorical, anything else -> numerical.
cat_cols, bin_cols, num_cols = [], [], []
for col in X.columns:
    n_distinct = X[col].nunique()
    if n_distinct == 2:
        bin_cols.append(col)
    elif 2 < n_distinct < 10:
        cat_cols.append(col)
    else:
        num_cols.append(col)

In [40]:
# Hold out 20% of the rows for evaluation; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
# Impute, then standardise, the columns listed in num_cols.
# Both transformers are fitted on the training split only, so the same fitted
# objects can be applied to other data with .transform().
scaler = StandardScaler()
# imp_mean = SimpleImputer(strategy='mean')  # disabled mean-imputation alternative
imp_knn = KNNImputer(n_neighbors=2)
# Step 1: fill missing values using the 2 nearest neighbours.
X_train[num_cols] = imp_knn.fit_transform(X_train[num_cols])
# Step 2: standardise the imputed columns.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

In [8]:
# Project the training features onto 2 principal components.
# NOTE(review): `pca` is also the name of a helper function defined earlier in
# this notebook; this assignment rebinds the name to the fitted PCA object.
pca = PCA(n_components=2)
# NOTE(review): X_train is overwritten with the array returned by fit_transform,
# so cells that expect the original DataFrame (e.g. `.values`, column access)
# will not work after this cell has run — confirm the intended execution order.
X_train = pca.fit_transform(X_train)


In [11]:
X_train

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,G10,G11,G2,G3,G4,G5,G6,G7,G8,G9
1467,0.732391,-0.049495,-0.108362,0.298748,-0.264023,0.287878,0.268980,-0.084291,0.710837,0.035030,...,0,0,0,0,0,0,1,0,0,0
5768,-1.210409,-0.178133,-0.111517,-0.984267,0.181695,-1.227170,-0.354102,-0.165851,-1.255166,-0.158515,...,0,0,0,0,0,0,0,1,0,0
5714,1.423492,0.193215,0.101814,1.978976,-0.126609,-0.101578,-0.362348,0.232707,1.894288,0.222632,...,0,0,0,0,0,0,0,0,0,1
1578,-0.526062,-0.132345,-0.132407,-0.257827,0.118047,-0.138916,-0.362079,-0.055179,0.868881,-0.098084,...,0,0,1,0,0,0,0,0,0,0
6958,-0.487524,-0.119949,-0.104892,-0.307236,-0.091554,-0.122655,-0.313843,-0.117239,-0.272218,-0.070974,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,-0.624135,-0.055321,-0.046128,0.266004,-0.182178,-0.393785,-0.685215,-0.011061,-0.546427,0.004759,...,0,0,0,0,0,0,1,0,0,0
5390,-0.419770,-0.068757,-0.124992,-0.869505,-0.114277,-0.375066,-0.636289,-0.129584,-0.866527,-0.050418,...,0,0,0,0,0,1,0,0,0,0
860,0.436979,-0.060034,-0.064724,0.538795,-0.160971,-0.296817,-0.512428,-0.090959,0.326753,-0.021847,...,0,0,0,0,0,1,0,0,0,0
7603,-0.787789,-0.135103,-0.121600,-0.908511,-0.089124,-0.654887,0.099507,-0.148883,-0.881887,-0.138928,...,0,0,0,0,0,0,0,1,0,0


In [None]:
plt.scatter(X_train[:,0], X_train[:,1], c=y_train)

In [None]:
# `plt.plot(x=y=...)` is not valid Python. Plot the running sum of the encoded
# training labels against row position instead.
# NOTE(review): the intended x/y were ambiguous in the original — confirm.
plt.plot(np.cumsum(y_train.to_numpy()));

In [None]:
def rf_classifier(X_train, y_train):
    """Grid-search a 200-tree random forest and return the fitted search.

    The search covers ``min_samples_split`` and ``min_samples_leaf`` with
    5-fold cross-validation, scored by macro F1.

    Returns
    -------
    GridSearchCV
        The fitted search object (``fit`` returns the search itself).
    """
    param_grid = {
        "min_samples_split": [30, 60, 90],
        "min_samples_leaf": [5, 7, 10, 20],
    }
    forest = RandomForestClassifier(n_estimators=200, random_state=42)
    search = GridSearchCV(forest, param_grid, scoring="f1_macro", cv=5)
    return search.fit(X_train, y_train)

def cb_classifier(X_train, y_train):
    """Tune a ``CatBoostClassifier`` with CatBoost's built-in grid search.

    Parameters
    ----------
    X_train, y_train
        Training features and encoded class labels, passed to
        ``model.grid_search``.

    Returns
    -------
    CatBoostClassifier
        The model after ``grid_search(..., refit=True)``.
    """
    model = CatBoostClassifier(
        iterations=1000, depth=6, learning_rate=0.1,
        # An empty string is not a valid CatBoost loss name. The target is
        # label-encoded into several classes, so use the multi-class objective.
        loss_function='MultiClass', verbose=False)

    # NOTE(review): "n_estimators" / "max_depth" below are aliases of the
    # `iterations` / `depth` constructor arguments — confirm the grid values
    # take precedence as intended.
    PARAMS = {
        "n_estimators": [300, 500, 700, 1000],  # [5, 10, 20, 30, 40, 50, 70, 100, 150, 200, 250, 300, 500, 1000],
        "learning_rate": [0.05],  # [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.04, 0.05, 0.1, 0.2, 0.3, 0.5],
        "max_depth": [4],  # np.arange(4, 20, 1),
        "l2_leaf_reg": [10],  # np.arange(0.1, 1, 0.05),
        "subsample": [0.6],  # [3, 5, 7, 10],
    #     "random_strength": [2, 5, 8],  # [1, 2, 5, 10, 20, 50, 100],
        "min_data_in_leaf": [100],  # np.arange(10, 1001, 10),
    #     'task_type': ['GPU'],
    #     'early_stopping_rounds': [10],
    }
    model.grid_search(PARAMS, X_train, y_train, cv=5, plot=True, refit=True)
    return model
    

def knn_classifier(X_train, y_train):
    """Grid-search a k-nearest-neighbours classifier and return the fitted search.

    Tries ``n_neighbors`` from 5 to 14 with the Manhattan metric, using
    10-fold cross-validation scored by macro F1 on all available cores.
    """
    search_space = {
        'n_neighbors': list(range(5, 15)),
        'metric': ['manhattan'],
    }
    search = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=search_space,
        cv=10,
        refit=True,
        scoring="f1_macro",
        n_jobs=-1,
    )
    return search.fit(X_train, y_train)

def log_reg(X_train, y_train):
    """Fit a ``LogisticRegression`` with default settings and return it."""
    classifier = LogisticRegression()
    # fit() returns the estimator itself, so it can be returned directly.
    return classifier.fit(X_train, y_train)
    


In [None]:
model = rf_classifier(X_train, y_train)

In [None]:
model.best_estimator_

In [None]:
# Apply the preprocessing fitted on the training split to the hold-out split.
# `imp_mean` is never created in this notebook (its definition is commented
# out); the imputer actually fitted on X_train is `imp_knn`.
X_test[num_cols] = imp_knn.transform(X_test[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
X_test = pca.transform(X_test)

preds = model.predict(X_test)


In [None]:
print(cost_function.evaluate_error(preds,y_test))

Submit

In [46]:
test = test_data_no_target.copy()

In [16]:
# Prepare the submission frame with the transformers fitted on X_train.
# `imp_mean` is never created in this notebook (its definition is commented
# out); the imputer actually fitted on X_train is `imp_knn`.
test = dummies(test)
test[num_cols] = imp_knn.transform(test[num_cols])
test[num_cols] = scaler.transform(test[num_cols])
# NOTE(review): the PCA projection is disabled here — confirm it matches the
# feature space the model used for prediction was trained on.
# test = pca.transform(test)


In [None]:
# Predict on the prepared submission frame, map the encoded labels back to
# their original values with the fitted encoder, and write one per line.
sub_preds = model.predict(test)
sub_preds = encoder.inverse_transform(sub_preds)
submit('submition_rf_pca.txt', sub_preds)

FNN approach

In [42]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(0)  # ensure reproducibility
np.random.seed(0)  # seed NumPy's global RNG as well
BATCH_SIZE = 32  # NOTE(review): the training cell defines its own `batch_size = 64` — confirm which is intended

In [90]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

# Assuming you have preprocessed X_train and y_train as numpy arrays or torch tensors

# Convert the training split into PyTorch tensors.
# `.values` requires X_train / y_train to still be pandas objects at this point.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)  # integer class indices, as expected by CrossEntropyLoss

# Pair features with labels so they are batched together
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)

# Mini-batch size used by the DataLoader below
batch_size = 64

# Reshuffle the training rows every epoch
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define your neural network model
class MyModel(nn.Module):
    """Fully connected network: three hidden ReLU layers and a linear output.

    ``forward`` returns raw scores of shape ``(batch, num_classes)``; no
    softmax is applied.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # initialisation consumes the RNG identically.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Hidden layers: linear followed by ReLU.
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = self.relu(hidden_layer(x))
        # Output layer: linear only.
        return self.fc4(x)

# Build the network, loss and optimizer from the shape of the training data.
input_size = X_train.shape[1]  # number of feature columns; X_train must be 2-D
hidden_size = 128  # width of every hidden layer
num_classes = len(set(y_train))  # number of distinct labels in y_train
# NOTE: this rebinds the notebook-level name `model` to the neural network.
model = MyModel(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: one pass over train_loader per epoch
num_epochs = 10  # number of passes over the training data
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass: raw class scores and their loss against the labels
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that dividing by the dataset
        # size below gives a per-sample average (assumes the criterion returns a
        # batch mean, its default reduction).
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')


Epoch [1/10], Loss: 1.0165
Epoch [2/10], Loss: 0.9903
Epoch [3/10], Loss: 0.9742
Epoch [4/10], Loss: 0.9591
Epoch [5/10], Loss: 0.9447
Epoch [6/10], Loss: 0.9253
Epoch [7/10], Loss: 0.9062
Epoch [8/10], Loss: 0.8820
Epoch [9/10], Loss: 0.8526
Epoch [10/10], Loss: 0.8210


In [71]:
y_train

1467    2
5768    0
5714    2
1578    2
6958    1
       ..
5226    2
5390    0
860     0
7603    0
7270    2
Name: Class, Length: 6400, dtype: int64

In [47]:
# One-hot encode "Group" in the submission frame.
# NOTE(review): dummies() reads the 'Group' column, so this cell must run on a
# fresh copy of the raw test data — confirm it is not executed twice in a row.
test = dummies(test)
# test[num_cols] = imp_mean.transform(test[num_cols])
# Reuse the imputer fitted on the training split (transform only, no refit).
test[num_cols] = imp_knn.transform(test[num_cols])

# Reuse the scaler fitted on the training split (transform only, no refit).
test[num_cols] = scaler.transform(test[num_cols])

In [48]:
torch.tensor(test.values, dtype=torch.float32)

tensor([[-0.0460, -0.1137, -0.0957,  ...,  0.0000,  0.0000,  0.0000],
        [-0.6301, -0.1416, -0.1121,  ...,  0.0000,  0.0000,  1.0000],
        [-0.9086, -0.1349, -0.0869,  ...,  1.0000,  0.0000,  0.0000],
        ...,
        [ 0.6406, -0.0062, -0.0698,  ...,  0.0000,  0.0000,  0.0000],
        [-1.0658, -0.0960, -0.1180,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.6712, -0.0477, -0.0679,  ...,  0.0000,  1.0000,  0.0000]])

In [49]:
probas = model(torch.tensor(test.values, dtype=torch.float32))


In [50]:
# sub_preds = encoder.inverse_transform(sub_preds)
probas

tensor([[  3.4681,  -3.4596,  -2.8909],
        [  2.4720,  -6.5256,   0.3184],
        [  6.3748, -12.7719,  -0.1659],
        ...,
        [ -3.4082,  -5.5129,   5.1767],
        [ -1.4690,  -1.7659,   0.6182],
        [  1.2344,  -3.2272,  -0.5398]], grad_fn=<AddmmBackward0>)

In [91]:
# Convert the hold-out features into a PyTorch tensor.
# NOTE(review): X_test is used as-is here — confirm it has received the same
# imputation/scaling as X_train and that its row count matches y_test.
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

# Switch the model to evaluation mode
model.eval()

# Predict without tracking gradients; the predicted class is the index of the
# largest score in each row.
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)

# Convert the predicted class indices to a NumPy array
predicted_labels = predicted.numpy()

# Evaluate on both splits. The test-split forward pass below repeats the one
# above on the same tensor.


with torch.no_grad():
    train_outputs = model(X_train_tensor)
    test_outputs = model(X_test_tensor)
    _, train_preds = torch.max(train_outputs, 1)
    _, test_preds = torch.max(test_outputs, 1)
    
# NOTE(review): predictions are passed to target_eval as torch tensors — confirm
# cost_function.evaluate_error accepts them.
print(f"Train error: {target_eval(train_preds, y_train)}")
print(f"Test error: {target_eval(test_preds, y_test)}")


Train error: 0.55734375
Test error: 0.985625


In [94]:
# Baseline: error when predicting encoded class 0 for every hold-out row.
# The length follows y_test instead of a hard-coded 1600.
target_eval([0] * len(y_test), y_test)

1.0875

In [80]:
y_test.shape

(1600,)

In [85]:
X_test_tensor.shape

torch.Size([2000, 127])

In [82]:
test_outputs.shape

torch.Size([2000, 3])

In [39]:
predicted_labels

array([0, 2, 0, ..., 0, 2, 0])

In [40]:
sub_preds = encoder.inverse_transform(predicted_labels)

In [41]:
sub_preds

array([-1,  1, -1, ..., -1,  1, -1])

In [42]:
submit('submition_fnn_4l_128p.txt', sub_preds)

Submission file created: submition_fnn_4l_128p.txt


True

In [52]:
# Normalise across the class dimension. Passing `dim` explicitly avoids the
# implicit-dimension deprecation warning; for 2-D (batch, classes) input this
# is the dimension the implicit choice used.
m = nn.Softmax(dim=1)

In [53]:
q = m(probas)
q

  q = m(probas)


tensor([[9.9730e-01, 9.7753e-04, 1.7264e-03],
        [8.9591e-01, 1.1083e-04, 1.0398e-01],
        [9.9856e-01, 4.8318e-09, 1.4415e-03],
        ...,
        [1.8686e-04, 2.2775e-05, 9.9979e-01],
        [1.0198e-01, 7.5785e-02, 8.2223e-01],
        [8.4662e-01, 9.7741e-03, 1.4361e-01]], grad_fn=<SoftmaxBackward0>)

In [54]:
# Count rows whose probability exceeds 0.7 for column 2 and for column 0;
# only the last expression is displayed by the notebook.
(q[:, 2] > 0.7).sum()
(q[:, 0] > 0.7).sum()

tensor(882)

In [64]:
# Start every row at 0 and only commit to -1 / 1 when the probability in
# column 0 / column 2 exceeds 0.7. The row count follows `q` instead of a
# hard-coded 2000.
res = torch.zeros(q.shape[0], 1)
res[q[:, 0] > 0.7] = -1
res[q[:, 2] > 0.7] = 1


In [60]:
res.shape

torch.Size([2000, 1])

In [65]:
# Flatten to 1-D whatever the number of rows (previously hard-coded to 2000).
res = res.reshape(-1).numpy()

In [66]:
res = res.astype(np.int32)
res

array([-1, -1, -1, ...,  1,  1, -1], dtype=int32)

In [68]:
submit('submition_fnn_4l_128p_0_7_margin_knnimputer.txt', res)

Submission file created: submition_fnn_4l_128p_0_7_margin_knnimputer.txt


True