In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model building
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.neighbors import KNeighborsClassifier

# Some metrics
from sklearn.metrics import log_loss

# Model persistence
from joblib import load, dump

In [2]:
# Loading
training_data = pd.read_csv("data_files/training_data_preprocessed.csv")
test_data = pd.read_csv("data_files/test_data_preprocessed.csv")

training_data.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,cyclindependent kinases cdks regulate variety ...
1,1,CBL,W802*,2,abstract background nonsmall cell lung cancer ...
2,2,CBL,Q249E,2,abstract background nonsmall cell lung cancer ...
3,3,CBL,N454D,3,recent evidence demonstrated acquired uniparen...
4,4,CBL,L399V,4,oncogenic mutations monomeric casitas blineage...


In [3]:
# Text vectorization
vectorizer = TfidfVectorizer().fit(training_data["Text"])

# Saving the vectorizer
dump(vectorizer, "vectorizer.joblib")

X = vectorizer.transform(training_data["Text"])
y = training_data["Class"]

In [4]:
# Splitting in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
X_test.shape[0]

665

## Building a simple KNN Model

In [5]:
neighbors = [10, 30, 50, 70, 90, 120, 150, 200, 350, 400, 500, 1000]
log_losses = []

for neighbor in neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)
    y_pred = knn.predict_proba(X_test)
    log_losses.append(log_loss(y_test, y_pred))
    print(f"Log loss for {neighbor} neighbors: {log_loss(y_test, y_pred)}")

Log loss for 10 neighbors: 3.677958470293884
Log loss for 30 neighbors: 2.1489187639755594
Log loss for 50 neighbors: 1.6089980148144991
Log loss for 70 neighbors: 1.5250435318667854
Log loss for 90 neighbors: 1.532304730673979
Log loss for 120 neighbors: 1.4619322509482782
Log loss for 150 neighbors: 1.4717046690754092
Log loss for 200 neighbors: 1.5312015041174094
Log loss for 350 neighbors: 1.6233686727533079
Log loss for 400 neighbors: 1.6437478879709413
Log loss for 500 neighbors: 1.6718518901158872
Log loss for 1000 neighbors: 1.764285884781914


In [6]:
best_k = neighbors[np.argmin(log_losses)]
best_k

120

In [7]:
# Building a deployable model
modelKnn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)

In [11]:
modelKnn.predict(X_test[0])

array([7])

In [None]:

# Deploying it
dump(modelKnn, "modelo_knn.joblib")

## Building a Deep Learning Model

In [35]:
# Creating the custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [36]:
# Creating the custom Datasets
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

# Creating the data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [37]:
# Creating the simple model with 1 hidden layer
class TextClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.l1 = nn.Linear(input_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        return out

In [42]:
# Setting the training function
def train(model, train_loader, optimizer, criterion):
    model.train()
    train_loss = 0
    total_accuracy = 0
    total_count = 0

    for X, y in train_loader:
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_accuracy += (output.argmax(1) == y).sum().item()
        train_loss += loss.item()
        total_count += len(y)
    return total_accuracy / total_count

In [43]:
# Setting the test function
def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X, y in test_loader:
            output = model(X)
            loss = criterion(output, y)
            test_loss += loss.item()
    return test_loss / len(test_loader)

In [49]:
# Training the model within a loop of epochs
input_dim = X_train.shape[1]
hidden_dim = 64
output_dim = 9

model = TextClassifier(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
criterion = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

epochs = 100
for epoch in range(epochs):
    train_accuracy = train(model, train_loader, optimizer, criterion)
    test_loss = test(model, test_loader, criterion)
    if epoch % 10 == 0:
        print('-'*20)
        print(f"Epoch: {epoch+1} | Train Accuracy: {train_accuracy:.4f} | Test LogLoss: {test_loss:.4f}")
        print('-'*20)

--------------------
Epoch: 1 | Train Accuracy: 0.1973 | Test LogLoss: -0.1562
--------------------
--------------------
Epoch: 11 | Train Accuracy: 0.2944 | Test LogLoss: -41.6861
--------------------
--------------------
Epoch: 21 | Train Accuracy: 0.2944 | Test LogLoss: -53606.0696
--------------------
--------------------
Epoch: 31 | Train Accuracy: 0.2944 | Test LogLoss: -71774523.2381
--------------------
--------------------
Epoch: 41 | Train Accuracy: 0.2944 | Test LogLoss: -96106675053.7143
--------------------


KeyboardInterrupt: 