<a href="https://colab.research.google.com/github/Andrian0s/ML4NLP1-2023-Tutorial-Notebooks/blob/main/04_tutorial_skorch_todos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML4NLP1
## Tutorial 04: sklearn: Pipeline and skorch

# Installing skorch and loading libraries

In [None]:
import subprocess
import sys

# Installation on Google Colab
# Importing google.colab only succeeds inside a Colab runtime; anywhere else
# the ImportError is swallowed on purpose and nothing is installed.
try:
    import google.colab  # noqa: F401  (imported only to detect Colab)
except ImportError:
    pass
else:
    # sys.executable is the interpreter running this kernel, so skorch is
    # installed into the environment the notebook actually imports from.
    # check=True raises CalledProcessError if pip exits with an error instead
    # of letting the later `import skorch` fail without explanation.
    subprocess.run(
        [sys.executable, '-m', 'pip', 'install', 'skorch'],
        check=True,
    )

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetClassifier

In [None]:
# Seed PyTorch's default and CUDA random number generators with 0
# (presumably so that weight initialisation and training are repeatable).
torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [None]:
import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict

## Training a classifier and making predictions

In [None]:
# download dataset
!gdown 1IUyw6n3IrabkhPQH7Jkuw_Vml4UXwiCI # y_train
!gdown 1IVeWdIaO5tXPeMOOsHPjqd1K3Obp4tC3 # y_test
!gdown 1ITAYFokSjAxaIHh6bzv5id-phSqx78q9 # X_train
!gdown 1IZrB2mCAmkly0w5pdWhUlMqJdxBC8OtK # X_test

In [None]:
# Load the train/test inputs and labels from .npy files in the working
# directory (presumably the files fetched by the gdown cell — verify).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code; only use it on files from a trusted source.
X_train = np.load('X_train.npy', allow_pickle=True)
X_test = np.load('X_test.npy', allow_pickle=True)
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')


In [None]:
# Show the dimensions of the training inputs as the cell output.
X_train.shape

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

In [None]:
class CountVectorizerWrapper:
    """Thin adapter around ``CountVectorizer`` that emits float32 features.

    It exposes ``fit`` and ``transform`` so it can be used as a pipeline step;
    the wrapped vectorizer is stored on ``self.countvec``.
    """

    def __init__(self, ngram_range, max_features):
        # Echo the configuration whenever a wrapper is constructed.
        settings = [ngram_range, max_features]
        print('args:', str(settings))
        self.countvec = CountVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
        )

    def fit(self, X, y=None):
        """Fit the wrapped vectorizer on ``X`` and return this wrapper.

        ``y`` is accepted for API compatibility but is not used.
        """
        vectorizer = self.countvec
        vectorizer.fit(X)
        return self

    def transform(self, X, y=None):
        """Vectorize ``X`` and cast the result to ``np.float32``.

        ``y`` is accepted for API compatibility but is not used.
        """
        counts = self.countvec.transform(X)
        return counts.astype(np.float32)

In [None]:
# Preprocessing: fit a LabelEncoder on the training labels.
# `le_fitted` is whatever `fit` returns (scikit-learn estimators return
# themselves from fit, so this is presumably the same object as
# `label_encoder`).
label_encoder = LabelEncoder()
le_fitted = label_encoder.fit(y_train)

In [None]:
# Show the distinct labels the encoder learned from y_train.
le_fitted.classes_

In [None]:
# Number of distinct classes; this should match the number of output units
# of the network defined below (20 by default).
print(len(le_fitted.classes_))

In [None]:
# Replace the raw labels with the encoder's class indices.
# This overwrites y_train / y_test in place, so re-running the cell would
# try to encode already-encoded labels.
y_train = le_fitted.transform(y_train)
y_test = le_fitted.transform(y_test)

In [None]:
# Cast the training targets to int64; the network is trained with
# nn.CrossEntropyLoss, which takes integer (long) class indices as targets.
y_train_int = y_train.astype(np.int64)

In the following, we define a vanilla neural network with two hidden layers, each followed by a nonlinearity function. The output layer should have as many outputs as there are classes; it returns raw scores (logits), because `nn.CrossEntropyLoss` applies the softmax internally.

In [None]:
class ClassifierModule(nn.Module):
    """Feed-forward classifier with two hidden layers that returns raw logits.

    Args:
        num_units: Width of the first hidden layer.
        nonlin: Activation applied after the first hidden layer.
        input_dim: Number of input features. The default of 5000 matches the
            ``max_features=5000`` passed to the vectorizer in this notebook.
        hidden_dim: Width of the second hidden layer.
        num_classes: Number of output units (one logit per class).
    """

    def __init__(
            self,
            num_units=2500,
            nonlin=F.relu,
            input_dim=5000,
            hidden_dim=200,
            num_classes=20,
    ):
        super().__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        # Layers are created in the same order as before so that weight
        # initialisation consumes the random stream identically.
        self.dense0 = nn.Linear(input_dim, num_units)
        self.dense1 = nn.Linear(num_units, hidden_dim)
        self.output = nn.Linear(hidden_dim, num_classes)

    def forward(self, X, **kwargs):
        """Return one logit per class for every row of ``X``.

        No softmax is applied here; the output is meant for a criterion that
        works on raw scores, such as ``nn.CrossEntropyLoss``. Extra keyword
        arguments are accepted and ignored.
        """
        X = self.nonlin(self.dense0(X))
        # The second hidden layer always uses ReLU, independent of ``nonlin``.
        X = F.relu(self.dense1(X))
        # The former ``squeeze(dim=1)`` was dropped: it was a no-op on the
        # (batch, num_classes) output and raised on unbatched 1-D input.
        return self.output(X)

In [None]:
# skorch wrapper that trains ClassifierModule for 10 epochs with
# cross-entropy loss and a learning rate of 0.1.
net = NeuralNetClassifier(
    ClassifierModule,
    max_epochs=10,
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    # Train on the GPU when one is available; otherwise fall back to the CPU
    # instead of failing on machines without CUDA.
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Bigram counts limited to 5000 features, fed straight into the network.
vec_args = {'ngram_range': (2, 2), 'max_features': 5000}
pipe = Pipeline(
    steps=[
        ('Vectorizer', CountVectorizerWrapper(**vec_args)),
        ('net', net),
    ],
    verbose=True,
)

In [None]:
# Fit the whole pipeline (vectorizer, then network) on the training data,
# using the int64 targets.
pipe.fit(X_train, y_train_int)

In [None]:
from sklearn.model_selection import GridSearchCV

# deactivate skorch-internal train-valid split and verbose logging
# (this modifies the existing `net` object in place)
net.set_params(train_split=False, verbose=0)
# Hyper-parameter grid: the 'net__' prefix addresses the pipeline step named
# 'net', so two learning rates are tried for the network.
params = {
    'net__lr': [0.1, 0.01],
}

In [None]:
# Rebuild the pipeline with a fresh vectorizer around the reconfigured net.
vec_args = {'ngram_range': (2, 2), 'max_features': 5000}
pipe = Pipeline(
    steps=[
        ('Vectorizer', CountVectorizerWrapper(**vec_args)),
        ('net', net),
    ],
    verbose=True,
)

In [None]:
# Grid search over `params` with 2-fold cross-validation, scored by accuracy.
# refit=False: no final estimator is retrained on the full data afterwards.
grid_net = GridSearchCV(pipe, params, refit=False, cv=2, scoring='accuracy')

In [None]:
# Run the search on the int64 targets, consistent with the earlier pipe.fit
# call; nn.CrossEntropyLoss takes integer (long) class indices, so the dtype
# should not depend on what LabelEncoder happens to return.
grid_net.fit(X_train, y_train_int)

In [None]:
# Report the best cross-validated accuracy and the parameters that gave it.
print(grid_net.best_score_, grid_net.best_params_)