In [None]:
import pandas as pd
import numpy as np
import csv
import re
import string
from collections import defaultdict

In [None]:
# Download the four dataset files from Google Drive with gdown (one file ID per file).
# The trailing comment on each line names the split that the ID refers to.
!gdown 1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs # x_train
!gdown 1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6 # x_test
!gdown 1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl # y_train
!gdown 1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X # y_test

Downloading...
From: https://drive.google.com/uc?id=1QP6YuwdKFNUPpvhOaAcvv2Pcp4JMbIRs
To: /content/x_train.txt
100% 64.1M/64.1M [00:01<00:00, 46.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QVo7PZAdiZKzifK8kwhEr_umosiDCUx6
To: /content/x_test.txt
100% 65.2M/65.2M [00:00<00:00, 117MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QbBeKcmG2ZyAEFB3AKGTgSWQ1YEMn2jl
To: /content/y_train.txt
100% 480k/480k [00:00<00:00, 70.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QaZj6bI7_78ymnN8IpSk4gVvg-C9fA6X
To: /content/y_test.txt
100% 480k/480k [00:00<00:00, 118MB/s]


In [None]:
# Load the raw texts (x_*) and their language labels (y_*), one instance per line.
# The corpus is multilingual, so the encoding is pinned to UTF-8 instead of relying
# on the platform default, which is not UTF-8 on every system.
with open('x_train.txt', encoding='utf-8') as f:
    x_train = f.read().splitlines()
with open('y_train.txt', encoding='utf-8') as f:
    y_train = f.read().splitlines()
with open('x_test.txt', encoding='utf-8') as f:
    x_test = f.read().splitlines()
with open('y_test.txt', encoding='utf-8') as f:
    y_test = f.read().splitlines()

In [None]:
# Pair every training text with its language label in a single frame
train_df = pd.DataFrame(dict(text=x_train, label=y_train))
# Persist the training frame as a tab-separated file, without the row index
train_df.to_csv('train_df.csv', sep='\t', index=False)
# Combine the test texts and labels in the same way
test_df = pd.DataFrame(dict(text=x_test, label=y_test))

In [None]:
# Preview the first rows of the combined training frame.
train_df.head()

Unnamed: 0,text,label
0,Klement Gottwaldi surnukeha palsameeriti ning ...,est
1,"Sebes, Joseph; Pereira Thomas (1961) (på eng)....",swe
2,भारतीय स्वातन्त्र्य आन्दोलन राष्ट्रीय एवम क्षे...,mai
3,"Après lo cort periòde d'establiment a Basilèa,...",oci
4,ถนนเจริญกรุง (อักษรโรมัน: Thanon Charoen Krung...,tha


In [None]:
# Collect the distinct language labels that occur in the training frame as a plain list
labels = train_df['label'].unique().tolist()
print(labels)

['est', 'swe', 'mai', 'oci', 'tha', 'orm', 'lim', 'guj', 'pnb', 'zea', 'krc', 'hat', 'pcd', 'tam', 'vie', 'pan', 'szl', 'ckb', 'fur', 'wuu', 'arz', 'ton', 'eus', 'map-bms', 'glk', 'nld', 'bod', 'jpn', 'arg', 'srd', 'ext', 'sin', 'kur', 'che', 'tuk', 'pag', 'tur', 'als', 'koi', 'lat', 'urd', 'tat', 'bxr', 'ind', 'kir', 'zh-yue', 'dan', 'por', 'fra', 'ori', 'nob', 'jbo', 'kok', 'amh', 'khm', 'hbs', 'slv', 'bos', 'tet', 'zho', 'kor', 'sah', 'rup', 'ast', 'wol', 'bul', 'gla', 'msa', 'crh', 'lug', 'sun', 'bre', 'mon', 'nep', 'ibo', 'cdo', 'asm', 'grn', 'hin', 'mar', 'lin', 'ile', 'lmo', 'mya', 'ilo', 'csb', 'tyv', 'gle', 'nan', 'jam', 'scn', 'be-tarask', 'diq', 'cor', 'fao', 'mlg', 'yid', 'sme', 'spa', 'kbd', 'udm', 'isl', 'ksh', 'san', 'aze', 'nap', 'dsb', 'pam', 'cym', 'srp', 'stq', 'tel', 'swa', 'vls', 'mzn', 'bel', 'lad', 'ina', 'ava', 'lao', 'min', 'ita', 'nds-nl', 'oss', 'kab', 'pus', 'fin', 'snd', 'kaa', 'fas', 'cbk', 'cat', 'nci', 'mhr', 'roa-tara', 'frp', 'ron', 'new', 'bar', 'ltg'

In [None]:
# T: Have a quick peek at the training data, looking at a couple of texts from different languages. Do you notice anything that might be challenging for the classification?

In [None]:
# Peek at some examples from the training data.
# A fixed random_state keeps the drawn sample (and the observations written about it)
# the same after a kernel restart.
print(train_df.sample(10, random_state=42))


                                                    text label
27066  Wan sidiria yari ben 365.256363051 dey. Wan tr...   srn
84278  ﺋﻪﺭﻣﯩﻴﺎ ﺋﯧﻠﻰ ﺳﺎﻳﺮﺍﻣﻰ (ﻧﯩﻤﺸﯧﮭﯩﺖ) ﺋﯘﻳﻐﯘﺭ ﮪﺎﺯﯨﺮﻗﻰ...   uig
31911  A very popular destination is Mendenhall Glaci...   pam
5693   Par di mondi ed zänt al têrmin fundamentalîsum...   egl
88279  የግስበት ሜዳ ደንታ የለሽ ሜዳ ነው። ከአንድ ነጥብ እስከ ሌላ ነጥብ በግ...   amh
82341  Ci Tuubaa la jumaa ju mag ji nekk, doon jumaa ...   wol
25350  Ny isam-ponin'i Ohis dia 308 mponina araka ny ...   mlg
23706  Terrängen runt Tall Birāk är mycket platt. Den...   swe
95441  Tataristan (Tatar: Татарстан, Татарстан Респуб...   gag
31199  Situada dins del nucli antic de la població de...   cat


Challenges Noticed:
Non-Latin Scripts: Texts in scripts like Arabic, Amharic, etc.
Closely Related Languages: Languages such as Swedish and Danish use the same script and look very similar, which makes them easy to confuse.
Special Characters: Presence of special characters and diacritics.
Short Text Lengths: Some texts may be too short to provide enough context.
Code-Switching: Potential mixing of multiple languages in a single text.

In [None]:
# T: How many instances per label are there in the training and test set? Do you think this is a balanced dataset? Do you think the train/test split is appropriate? If not, please rearrange the data in a more appropriate way.

In [None]:
# Number of instances per label, computed the same way for the training and the test frame
train_label_counts, test_label_counts = (
    frame['label'].value_counts() for frame in (train_df, test_df)
)

# Show both tallies
print("Training label counts:\n", train_label_counts)
print("Test label counts:\n", test_label_counts)


Training label counts:
 label
est    500
eng    500
vep    500
sgs    500
uig    500
      ... 
lmo    500
mya    500
ilo    500
csb    500
ltz    500
Name: count, Length: 235, dtype: int64
Test label counts:
 label
mwl    500
uig    500
tat    500
nno    500
new    500
      ... 
frp    500
krc    500
mlg    500
msa    500
ckb    500
Name: count, Length: 235, dtype: int64


The dataset contains 500 instances per label for both the training and test sets, indicating that each label is equally represented in both sets. This is evident from the following output:

- Training Label Counts:
Each label has 500 instances.
Total number of unique labels: 235

- Test Label Counts:
Each label has 500 instances.
Total number of unique labels: 235

- => The dataset is balanced, since every label has the same number of instances in both the training and the test set.
- => Appropriate Train/Test Split: The train/test split appears appropriate because each label is equally represented in both sets.

In [None]:
# T: Get a subset of the train/test data that includes English, German, Dutch, Danish, Swedish and Norwegian, plus 20 additional languages of your choice (the labels can be found in the file labels.csv)

In [None]:

# Mandatory labels: English, German, Dutch, Danish, Swedish and Norwegian ('nno'),
# plus Japanese ('jpn'), which the feature-importance task further down asks about.
given_labels = {'eng', 'deu', 'nld', 'dan', 'swe', 'nno', 'jpn'}

# All labels present in the training dataset
all_labels = set(train_df['label'].unique())

# Fail early, with a readable message, if a mandatory label is not in the data
missing_labels = given_labels - all_labels
assert not missing_labels, f"Mandatory labels missing from the training data: {sorted(missing_labels)}"

# Draw 20 additional labels. Set iteration order is not stable between runs, so the
# candidates are sorted and the generator is seeded to make the draw reproducible.
rng = np.random.default_rng(42)
candidate_labels = sorted(all_labels - given_labels)
target_labels = rng.choice(candidate_labels, 20, replace=False).tolist()

# Combine the randomly selected labels with the mandatory ones (in a fixed order)
target_labels += sorted(given_labels)

# Keep only the rows of the training frame whose label is one of the target labels
train_df = train_df[train_df['label'].isin(target_labels)]

# Keep only the rows of the test frame whose label is one of the target labels
test_df = test_df[test_df['label'].isin(target_labels)]

# Print the size of the filtered training and test datasets
print("Filtered training data size:", train_df.shape)
print("Filtered test data size:", test_df.shape)


Filtered training data size: (13500, 2)
Filtered test data size: (13500, 2)


In [None]:
# Preview the first 20 rows of the filtered training frame.
train_df.head(20)

Unnamed: 0,text,label
1,"Sebes, Joseph; Pereira Thomas (1961) (på eng)....",swe
9,Vanwehe zen Gentsen ofkomst wor 't een ok wè G...,zea
11,6 fevrye 1996 : Gouvènman ayisyen reprann kont...,hat
13,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,tam
16,"Wjelganocno Wyspa (szpań. Isla de Pascua, Terr...",szl
26,De spons behoort tot het geslacht Haliclona en...,nld
29,エノが行きがかりでバスに乗ってしまい、気分が悪くなった際に助けるが、今すぐバスを降りたいと運...,jpn
35,"Кхузахь климат барамехь континентан ю, аьхка й...",che
46,シャーリー・フィールドは、サン・ベルナルド・アベニュー沿い市民センターとR&Tマーティン高校...,jpn
52,Indtil 1545 havde flådecheferne kunnet hyre et...,dan


In [None]:
# T: With the following code, we wanted to encode the labels, however, our cat was walking on the keyboard and some of it got changed. Can you fix it?
# NOTE(review): this cell is the task's scrambled code, kept as given; the corrected
# version (which calls transform instead of fit on the last line) is in the cells below.
from sklearn.preprocessing import LabelEncoder
le_fitted = LabelEncoder().fit(train_df['label'])
y_train_dev, y_test = le_fitted.fit(train_df['label']), le_fitted.fit(test_df['label'])

This is how we fix it

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit the label encoder on the training labels only.
le_fitted = LabelEncoder().fit(train_df['label'])

In [None]:
# Inspect the classes stored on the fitted encoder.
le_fitted.classes_

array(['afr', 'che', 'cos', 'cym', 'dan', 'deu', 'egl', 'eng', 'fin',
       'frp', 'hat', 'hin', 'jpn', 'kin', 'mar', 'nap', 'nld', 'nno',
       'olo', 'por', 'srp', 'swa', 'swe', 'szl', 'tam', 'wol', 'zea'],
      dtype=object)

In [None]:
# Encode the string labels of both splits with the encoder fitted on the training labels.
# The later cells (grid search, scoring, confusion matrix, ablation) refer to these
# arrays as y_train_encoded / y_test_encoded, so they are defined under those names.
y_train_encoded = le_fitted.transform(train_df['label'])
y_test_encoded = le_fitted.transform(test_df['label'])
# Names used by the earlier version of this cell, kept as aliases.
y_train_dev, y_test = y_train_encoded, y_test_encoded

Create a suitable pipeline in sklearn to preprocess the data. Think about extending the feature space.
What other features could you use to determine the language? Please include additional linguistic
features to your machine learning model for this task.
1. Train the following classifier: LogisticRegression
2. To find the optimal hyperparameter settings for the classifier, use sklearn’s GridSearchCV. [hint: don’t
overdo it at the beginning, since runtime might go up fast] You are supposed to experiment with the following hyperparameters:
- Penalty (Regularization)
- Solver
- Experiment with parameters of the Vectorizer (not required, highly advised)

In [None]:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Custom transformer for normalizing text
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that strips ASCII punctuation and newlines from texts.

    Letter case is left untouched. Nothing is learned from the data, so ``fit``
    is a no-op that exists only to satisfy the pipeline interface.
    """

    # Translation table that deletes every character in string.punctuation.
    # Built once at class creation instead of once per text.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to fit."""
        return self

    def _normalize_text(self, text):
        """Remove ASCII punctuation and newline characters from the text.

        :param text: string
        :return: normalized string (case is preserved)
        """
        text = text.translate(self._PUNCTUATION_TABLE)  # Remove punctuation
        return text.replace('\n', '')  # Remove newlines

    def transform(self, X, y=None):
        """Apply normalization to each text in the input array.

        :param X: array-like of strings
        :return: numpy array of normalized strings
        """
        return np.array([self._normalize_text(text) for text in X])

# Custom transformer for extracting additional linguistic features
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Derive three hand-crafted numerical features per text and min-max scale them.

    Feature columns, in order: vowel/consonant ratio, capitalization ratio and
    number of doubled characters. ``transform`` returns the untouched texts
    together with the scaled features, so that a later step can still vectorize
    the texts.
    """

    vowels = set('aeiouäöüàéèëï')
    # The previous literal listed 'l' twice and left out 'j', so 'j' was never
    # counted as a consonant.
    consonants = set('bcdfghjklmnpqrstvwxyz')

    def __init__(self):
        self.scaler = MinMaxScaler()

    def _to_bigrams(self, text):
        """Generate character bigrams from the input text.

        :param text: string
        :return: list of two-character strings
        """
        return [first + second for first, second in zip(text, text[1:])]

    def _get_vowel_consonant_ratio(self, text):
        """Calculate the vowel to consonant ratio in the text.

        Characters that are in neither set (digits, spaces, other scripts) are ignored.

        :param text: string
        :return: float, number of vowels divided by (number of consonants + 1)
        """
        vf, cf = 0, 0
        for c in text.lower():
            if c in self.vowels:
                vf += 1
            elif c in self.consonants:
                cf += 1
        # +1 avoids a division by zero for texts without any listed consonant
        return vf / (cf + 1)

    def _get_capitalization_ratio(self, text):
        """Calculate the ratio of uppercase to total characters in the text.

        :param text: string
        :return: float, uppercase count divided by (text length + 1)
        """
        up_count = sum(1 for c in text if c.isupper())
        # +1 avoids a division by zero for empty texts
        return up_count / (len(text) + 1)

    def _get_double_char_freq(self, text):
        """Count positions where a character is immediately repeated.

        This is an absolute count, not normalized by text length.

        :param text: string
        :return: int number of adjacent equal character pairs
        """
        return sum(1 for first, second in zip(text, text[1:]) if first == second)

    def _extract_num_features(self, texts):
        """Extract numerical features from a list of texts.

        :param texts: iterable of strings
        :return: numpy array of shape (number of texts, 3)
        """
        num_features = [
            [
                self._get_vowel_consonant_ratio(text),
                self._get_capitalization_ratio(text),
                self._get_double_char_freq(text),
            ]
            for text in texts
        ]
        # reshape keeps the result two-dimensional even when `texts` is empty
        return np.array(num_features, dtype=float).reshape(-1, 3)

    def fit(self, X, y=None):
        """Fit the scaler on the extracted numerical features.

        :param X: array-like of strings
        :return: self
        """
        numerical_features = self._extract_num_features(X)
        self.scaler.fit(numerical_features)
        return self

    def transform(self, X, y=None):
        """Transform the input texts into scaled numerical features.

        :param X: array-like of strings
        :return: tuple (original texts, scaled numerical features)
        """
        numerical_features = self._extract_num_features(X)
        return X, self.scaler.transform(numerical_features)

# Wrapper to combine vectorizer and additional features
class VectorizerWrapper(BaseEstimator, TransformerMixin):
    """TF-IDF vectorize the texts while passing the numerical features through.

    The vectorizer settings are constructor parameters, so they can be tuned from
    a parameter grid (e.g. ``Vectorizer__max_features``). The defaults reproduce
    the configuration that used to be hard-coded.

    :param ngram_range: (min_n, max_n) passed to TfidfVectorizer
    :param max_features: vocabulary size limit passed to TfidfVectorizer
    :param analyzer: analyzer passed to TfidfVectorizer ('word', 'char', 'char_wb')
    """

    def __init__(self, ngram_range=(1, 2), max_features=500, analyzer='word'):
        self.ngram_range = ngram_range
        self.max_features = max_features
        self.analyzer = analyzer
        self.countvec = self._build_vectorizer()

    def _build_vectorizer(self):
        """Create an unfitted TfidfVectorizer from the current parameter values."""
        return TfidfVectorizer(
            ngram_range=self.ngram_range,
            max_features=self.max_features,
            analyzer=self.analyzer,
        )

    def fit(self, X, y=None):
        """Fit the vectorizer on the text data.

        :param X: tuple (texts, numerical features)
        :return: self
        """
        texts, _ = X
        # Rebuild first so that values changed through set_params() after
        # construction (as a grid search does) take effect.
        self.countvec = self._build_vectorizer()
        self.countvec.fit(texts)
        return self

    def transform(self, X, y=None):
        """Transform the texts into TF-IDF features and pass the numerical features on.

        :param X: tuple (texts, numerical features)
        :return: tuple (TF-IDF features, numerical features)
        """
        texts, numerical_features = X
        return self.countvec.transform(texts), numerical_features

# Transformer to convert sparse matrix to dense array
class MatrixToArrayConverter(BaseEstimator, TransformerMixin):
    """Pipeline step that densifies the first element of a (matrix, features) pair."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Turn the sparse matrix into a dense array and keep the second element as is.

        :param X: tuple (sparse TF-IDF matrix, numerical features)
        :return: tuple (dense TF-IDF array, numerical features)
        """
        dense_text_features = X[0].toarray()
        return dense_text_features, X[1]

# Transformer to unify text and numerical features into a single array
class MatrixUnifier(BaseEstimator, TransformerMixin):
    """Pipeline step that joins the text features and numerical features column-wise."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Place the numerical feature columns to the right of the text feature columns.

        :param X: tuple (TF-IDF features, numerical features)
        :return: numpy array of combined features
        """
        text_block = X[0]
        numeric_block = X[1]
        return np.concatenate((text_block, numeric_block), axis=1)



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Hyperparameters to search; each key is "<pipeline step name>__<parameter>"
clf_param_grid = dict(
    CLF__penalty=['l2', 'l1'],          # regularization types
    CLF__solver=['liblinear', 'saga'],  # optimization solvers
    CLF__max_iter=[50, 100],            # iteration budgets
)

# Preprocessing steps followed by the logistic regression classifier
pipe = Pipeline(
    steps=[
        ('TextNormalizer', TextNormalizer()),                  # clean the raw text
        ('FeatureExtractor', FeatureExtractor()),              # hand-crafted numerical features
        ('Vectorizer', VectorizerWrapper()),                   # TF-IDF features for the text
        ('MatrixToArrayConverter', MatrixToArrayConverter()),  # sparse -> dense
        ('MatrixUnifier', MatrixUnifier()),                    # one combined feature matrix
        ('CLF', LogisticRegression()),                         # classifier
    ],
    verbose=True,
)

# Exhaustive search over the grid with 2-fold cross-validation, scored by micro-averaged F1
grid = GridSearchCV(
    estimator=pipe,
    param_grid=clf_param_grid,
    scoring='f1_micro',
    cv=2,
    n_jobs=1,
)
grid.fit(train_df['text'].to_numpy(), y_train_encoded)

# Report the winning configuration
best_model = grid.best_estimator_
print("Best hyperparameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)

# Predictions of the best pipeline on the test texts
y_pred = best_model.predict(test_df['text'].to_numpy())



[Pipeline] .... (step 1 of 6) Processing TextNormalizer, total=   1.5s
[Pipeline] .. (step 2 of 6) Processing FeatureExtractor, total=  11.1s
[Pipeline] ........ (step 3 of 6) Processing Vectorizer, total=   4.3s
[Pipeline]  (step 4 of 6) Processing MatrixToArrayConverter, total=   0.0s
[Pipeline] ..... (step 5 of 6) Processing MatrixUnifier, total=   0.0s
[Pipeline] ............... (step 6 of 6) Processing CLF, total=   0.6s
[Pipeline] .... (step 1 of 6) Processing TextNormalizer, total=   0.4s
[Pipeline] .. (step 2 of 6) Processing FeatureExtractor, total=   2.6s
[Pipeline] ........ (step 3 of 6) Processing Vectorizer, total=   4.6s
[Pipeline]  (step 4 of 6) Processing MatrixToArrayConverter, total=   0.0s
[Pipeline] ..... (step 5 of 6) Processing MatrixUnifier, total=   0.0s
[Pipeline] ............... (step 6 of 6) Processing CLF, total=   1.0s
[Pipeline] .... (step 1 of 6) Processing TextNormalizer, total=   0.4s
[Pipeline] .. (step 2 of 6) Processing FeatureExtractor, total=   2.7



[Pipeline] ............... (step 6 of 6) Processing CLF, total=  45.2s
[Pipeline] .... (step 1 of 6) Processing TextNormalizer, total=   0.4s
[Pipeline] .. (step 2 of 6) Processing FeatureExtractor, total=   2.8s
[Pipeline] ........ (step 3 of 6) Processing Vectorizer, total=   5.0s
[Pipeline]  (step 4 of 6) Processing MatrixToArrayConverter, total=   0.0s
[Pipeline] ..... (step 5 of 6) Processing MatrixUnifier, total=   0.0s




[Pipeline] ............... (step 6 of 6) Processing CLF, total=  44.5s
[Pipeline] .... (step 1 of 6) Processing TextNormalizer, total=   0.4s
[Pipeline] .. (step 2 of 6) Processing FeatureExtractor, total=   2.7s
[Pipeline] ........ (step 3 of 6) Processing Vectorizer, total=   3.6s
[Pipeline]  (step 4 of 6) Processing MatrixToArrayConverter, total=   0.0s
[Pipeline] ..... (step 5 of 6) Processing MatrixUnifier, total=   0.0s
[Pipeline] ............... (step 6 of 6) Processing CLF, total=   0.8s
[Pipeline] .... (step 1 of 6) Processing TextNormalizer, total=   0.4s
[Pipeline] .. (step 2 of 6) Processing FeatureExtractor, total=   2.6s
[Pipeline] ........ (step 3 of 6) Processing Vectorizer, total=   3.4s
[Pipeline]  (step 4 of 6) Processing MatrixToArrayConverter, total=   0.0s
[Pipeline] ..... (step 5 of 6) Processing MatrixUnifier, total=   0.0s
[Pipeline] ............... (step 6 of 6) Processing CLF, total=   0.6s
[Pipeline] .... (step 1 of 6) Processing TextNormalizer, total=   0.6



[Pipeline] ............... (step 6 of 6) Processing CLF, total= 1.7min
[Pipeline] .... (step 1 of 6) Processing TextNormalizer, total=   1.1s
[Pipeline] .. (step 2 of 6) Processing FeatureExtractor, total=   4.5s
[Pipeline] ........ (step 3 of 6) Processing Vectorizer, total=   3.4s
[Pipeline]  (step 4 of 6) Processing MatrixToArrayConverter, total=   0.0s
[Pipeline] ..... (step 5 of 6) Processing MatrixUnifier, total=   0.0s




[Pipeline] ............... (step 6 of 6) Processing CLF, total= 1.5min
[Pipeline] .... (step 1 of 6) Processing TextNormalizer, total=   0.7s
[Pipeline] .. (step 2 of 6) Processing FeatureExtractor, total=   7.6s
[Pipeline] ........ (step 3 of 6) Processing Vectorizer, total=   7.0s
[Pipeline]  (step 4 of 6) Processing MatrixToArrayConverter, total=   0.1s
[Pipeline] ..... (step 5 of 6) Processing MatrixUnifier, total=   0.0s
[Pipeline] ............... (step 6 of 6) Processing CLF, total=  27.1s
Best hyperparameters: {'CLF__max_iter': 50, 'CLF__penalty': 'l2', 'CLF__solver': 'saga'}
Best cross-validation score: 0.9355555555555556


NameError: name 'confusion_matrix' is not defined

Now that you have your best model, it’s time to dive deep into understanding how the model makes predictions.
It is important that we can explain and visualize our models to improve task performance. Explainable models help
characterize model fairness, transparency, and outcomes. Let's try to understand what our best-performing logistic
regression classification model has learned. Generate a feature importance table for the top ten features
(please have the features named) for the languages English, Swedish, Norwegian, and Japanese. What is
more important, extra features or the outputs of the vectorizer, discuss.
We recommend using the ELI5 library as it supports sklearn pipelines to explain the model weights. For more
details, see their documentation on dealing with text classification. We will accept answers from any explanation
library/method as long as the explanations for the model weights are provided in a structured/clear way.

In [None]:
# Tabulate the cross-validation results, best-ranked parameter combination first
grid_results = pd.DataFrame(grid.cv_results_)
grid_results.sort_values(by="rank_test_score")


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_CLF__max_iter,param_CLF__penalty,param_CLF__solver,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
1,19.761982,0.159903,2.549926,0.013322,50,l2,saga,"{'CLF__max_iter': 50, 'CLF__penalty': 'l2', 'C...",0.936,0.935111,0.935556,0.000444,1
5,23.785404,2.40618,2.515764,0.012944,100,l2,saga,"{'CLF__max_iter': 100, 'CLF__penalty': 'l2', '...",0.936,0.935111,0.935556,0.000444,1
0,13.110469,4.448883,2.524133,0.015583,50,l2,liblinear,"{'CLF__max_iter': 50, 'CLF__penalty': 'l2', 'C...",0.932444,0.933926,0.933185,0.000741,3
4,7.307058,0.236991,3.324959,0.582958,100,l2,liblinear,"{'CLF__max_iter': 100, 'CLF__penalty': 'l2', '...",0.932444,0.933926,0.933185,0.000741,3
7,102.216493,6.050984,4.207169,1.68629,100,l1,saga,"{'CLF__max_iter': 100, 'CLF__penalty': 'l1', '...",0.928889,0.927259,0.928074,0.000815,5
3,53.405405,0.677958,2.524603,0.012125,50,l1,saga,"{'CLF__max_iter': 50, 'CLF__penalty': 'l1', 'C...",0.928593,0.927259,0.927926,0.000667,6
6,8.861101,1.021898,4.245603,0.841672,100,l1,liblinear,"{'CLF__max_iter': 100, 'CLF__penalty': 'l1', '...",0.928593,0.926519,0.927556,0.001037,7
2,6.954695,0.113156,3.448572,0.948559,50,l1,liblinear,"{'CLF__max_iter': 50, 'CLF__penalty': 'l1', 'C...",0.928593,0.92637,0.927481,0.001111,8


In [None]:
from sklearn.metrics import f1_score

# Predict on the test set using the best model
preds = grid.best_estimator_.predict(test_df['text'].to_numpy())

# Calculate F1 scores. f1_score takes the ground truth first and the predictions
# second, so the arguments are passed in that order.
f1_micro = f1_score(y_test_encoded, preds, average='micro')
f1_macro = f1_score(y_test_encoded, preds, average='macro')
print(f'F1-micro-score on the test set: {f1_micro}')
print(f'F1-macro-score on the test set: {f1_macro}')

F1-micro-score on the test set: 0.9420740740740741
F1-macro-score on the test set: 0.9434272619321705


In [None]:
# Number of classes stored on the fitted label encoder; used as the size of the confusion matrix
num_classes = len(le_fitted.classes_)

# Function to create a confusion matrix manually
def create_confusion_matrix(num_classes, preds, y_test, class_names=None):
    """Count (predicted, true) label pairs in a square table.

    Rows are indexed by the predicted class, columns by the true class. The table
    always has ``num_classes`` rows and columns, whether or not every class
    occurs in ``preds`` / ``y_test``.

    :param num_classes: number of classes (side length of the table)
    :param preds: iterable of predicted class indices
    :param y_test: iterable of true class indices, aligned with ``preds``
    :param class_names: row/column labels; defaults to ``le_fitted.classes_``
    :return: pandas DataFrame of integer counts
    """
    if class_names is None:
        class_names = le_fitted.classes_
    counts = np.zeros((num_classes, num_classes), dtype=int)
    # reshape keeps the pair array two-dimensional when there are no pairs at all
    pairs = np.array(list(zip(preds, y_test)), dtype=int).reshape(-1, 2)
    # np.add.at accumulates repeated index pairs, unlike plain fancy-index assignment
    np.add.at(counts, (pairs[:, 0], pairs[:, 1]), 1)
    return pd.DataFrame(counts, index=class_names, columns=class_names)

# Create confusion matrix from the test predictions and the encoded test labels
confusion_df = create_confusion_matrix(num_classes, preds, y_test_encoded)
print(confusion_df)

# Identify the best model for interpretation (the refitted best pipeline of the grid search)
model_to_interpret = grid.best_estimator_

     afr  che  cos  cym  dan  deu  egl  eng  fin  frp  ...  nno  olo  por  \
afr  495    0    0    0    0    1    0    0    0    0  ...    0    0    0   
che    1  469    0    0    0    1    0    0    0    1  ...    0    2    0   
cos    0    0  480    0    1    1    4    1    0    3  ...    0    0    0   
cym    0    0    0  490    0    0    0    0    0    0  ...    0    0    0   
dan    0    0    0    0  473    0    0    1    0    0  ...   12    0    0   
deu    0    0    1    0    6  485    0    0    1    1  ...    0    0    4   
egl    0    0    3    0    0    0  482    0    0    1  ...    0    0    0   
eng    1    0    0    5    2    2    0  485    2    1  ...    0    2   21   
fin    0    0    0    0    0    0    0    0  436    0  ...    0    7    0   
frp    0    0    7    0    1    5    2    0    0  482  ...    1    0    1   
hat    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
hin    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   

In [None]:
!pip install eli5


Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.2/216.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107720 sha256=38d5b6228333dfcc39aa282de0567c3ea19cc6a066a428584fa1cfcf4b1f2728
  Stored in directory: /root/.cache/pip/wheels/b8/58/ef/2cf4c306898c2338d51540e0922c8e0d6028e07007085c0004
Successfully built eli5
Installing collected packages: eli5
Successfully installed eli5-0.13.0


In [None]:
import eli5
from eli5 import show_weights
# Extract the logistic regression model from the pipeline
lr_model = model_to_interpret.named_steps['CLF']

# Get the vectorizer model from the pipeline
vec_model = model_to_interpret.named_steps['Vectorizer'].countvec

# Languages to explain: English, Swedish, Norwegian and Japanese, as the task asks
target_languages = ['eng', 'swe', 'nno', 'jpn']

# Define the target indices for specific languages
target_indices = [np.where(le_fitted.classes_ == 'eng')[0][0],
                  np.where(le_fitted.classes_ == 'jpn')[0][0],
                  np.where(le_fitted.classes_ == 'swe')[0][0]]

# Names of the hand-crafted features, in the order FeatureExtractor emits them.
# MatrixUnifier appends these columns after the TF-IDF columns.
extra_feature_names = ["vowel_consonant_ratio", "capitalization_ratio", "double_char_freq"]

# Create feature names including the additional features
make_new_feature_names = np.concatenate([
    vec_model.get_feature_names_out(),
    np.array(extra_feature_names)
], axis=-1)

# Display the top positive and negative model weights for each target language
show_weights(lr_model, top=(10, 10), feature_names=make_new_feature_names,
             target_names=le_fitted.classes_, targets=target_languages)


Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+3.902,the,
+3.632,was,
+2.838,he,
+2.773,in,
+2.533,with,
+2.308,and,
+2.281,is,
+2.183,to,
+2.133,by,
+2.119,as,

Weight?,Feature
+3.902,the
+3.632,was
+2.838,he
+2.773,in
+2.533,with
+2.308,and
+2.281,is
+2.183,to
+2.133,by
+2.119,as

Weight?,Feature
+6.179,och
+5.740,är
+3.456,av
+2.863,den
+2.411,på
+2.292,med
+2.284,som
+2.120,under
+2.103,har
+1.917,en

Weight?,Feature
+5.033,og
+4.407,av
+4.353,ein
+4.002,frå
+3.197,er
+3.110,til
+3.056,vart
+2.952,som
+2.607,eit
+2.264,på


Lastly, you will conduct a small ablation study. First, choose the two languages for which the classifier worked best.
Next, re-fit the best working model several times, each time reducing the number of characters per instance in the
training set (1. All characters, 2. 500 characters, 3. 100 characters). How does the ablation affect the
performance of the classifier?

## Ablation study

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score

# Custom transformer to reduce text length
class TextReducer(BaseEstimator, TransformerMixin):
    """Truncate every text to at most ``max_len`` characters.

    :param max_len: maximum number of characters to keep per text;
        ``None`` (the default) keeps the texts unchanged
    """

    def __init__(self, max_len=None):
        self.max_len = max_len

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to fit."""
        return self

    def _reduce_text(self, text):
        """Reduce text length to a maximum of max_len characters."""
        if self.max_len is None:
            return text
        # Slicing already returns the whole text when it is shorter than max_len
        return text[:self.max_len]

    def transform(self, X, y=None):
        """Apply text reduction to each text in the input array.

        :param X: iterable of strings
        :return: list of (possibly truncated) strings
        """
        return [self._reduce_text(text) for text in X]

def make_best_classifier():
    """Return a fresh, unfitted LogisticRegression with the best grid-search settings."""
    return LogisticRegression(
        solver=grid.best_params_['CLF__solver'],
        penalty=grid.best_params_['CLF__penalty'],
        max_iter=grid.best_params_['CLF__max_iter']
    )


def make_ablation_pipeline(max_len=None):
    """Build the preprocessing + classifier pipeline for one ablation setting.

    :param max_len: truncate every text to this many characters first;
        ``None`` builds the pipeline without a truncation step
    :return: unfitted sklearn Pipeline
    """
    steps = []
    if max_len is not None:
        steps.append(('TextReducer', TextReducer(max_len=max_len)))
    steps += [
        ('TextNormalizer', TextNormalizer()),
        ('FeatureExtractor', FeatureExtractor()),
        ('Vectorizer', VectorizerWrapper()),
        ('MatrixToArrayConverter', MatrixToArrayConverter()),
        ('MatrixUnifier', MatrixUnifier()),
        # Every pipeline gets its own classifier instance, so fitting one pipeline
        # does not overwrite the fitted state of another.
        ('CLF', make_best_classifier()),
    ]
    return Pipeline(steps=steps, verbose=True)


# Function to fit model and evaluate F1 score
def fit_and_evaluate(pipeline, X_train, y_train, X_test, y_test):
    """Fit ``pipeline`` on the training data and return micro-averaged F1 on the test data."""
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    # f1_score takes the ground truth first and the predictions second
    return f1_score(y_test, preds, average='micro')


# Re-create the best model using the best parameters from the grid search
best_model = make_best_classifier()

# One pipeline per ablation setting: 500 characters, 100 characters, all characters.
# NOTE(review): TextReducer is a pipeline step, so it presumably truncates the test
# texts at prediction time as well, while the task text speaks of the training set only.
pipe500 = make_ablation_pipeline(max_len=500)
pipe100 = make_ablation_pipeline(max_len=100)
pipe_all = make_ablation_pipeline()

# Fit and evaluate the model with text reduced to 500 characters
f1_micro_500 = fit_and_evaluate(pipe500, train_df['text'].to_numpy(), y_train_encoded, test_df['text'].to_numpy(), y_test_encoded)
print(f'F1-micro-score with text reduced to 500 characters: {f1_micro_500}')

# Fit and evaluate the model with text reduced to 100 characters
f1_micro_100 = fit_and_evaluate(pipe100, train_df['text'].to_numpy(), y_train_encoded, test_df['text'].to_numpy(), y_test_encoded)
print(f'F1-micro-score with text reduced to 100 characters: {f1_micro_100}')

# Evaluate the model with all characters (no reduction)
f1_micro_all = fit_and_evaluate(pipe_all, train_df['text'].to_numpy(), y_train_encoded, test_df['text'].to_numpy(), y_test_encoded)
print(f'F1-micro-score with all characters: {f1_micro_all}')


[Pipeline] ....... (step 1 of 7) Processing TextReducer, total=   0.0s
[Pipeline] .... (step 2 of 7) Processing TextNormalizer, total=   0.9s
[Pipeline] .. (step 3 of 7) Processing FeatureExtractor, total=   5.5s
[Pipeline] ........ (step 4 of 7) Processing Vectorizer, total=   7.4s
[Pipeline]  (step 5 of 7) Processing MatrixToArrayConverter, total=   0.0s
[Pipeline] ..... (step 6 of 7) Processing MatrixUnifier, total=   0.0s
[Pipeline] ............... (step 7 of 7) Processing CLF, total=  23.4s
F1-micro-score with text reduced to 500 characters: 0.9416296296296297
[Pipeline] ....... (step 1 of 7) Processing TextReducer, total=   0.0s
[Pipeline] .... (step 2 of 7) Processing TextNormalizer, total=   0.2s
[Pipeline] .. (step 3 of 7) Processing FeatureExtractor, total=   1.4s
[Pipeline] ........ (step 4 of 7) Processing Vectorizer, total=   1.8s
[Pipeline]  (step 5 of 7) Processing MatrixToArrayConverter, total=   0.1s
[Pipeline] ..... (step 6 of 7) Processing MatrixUnifier, total=   0.0

### Part 2 - Your first Neural Network

Let’s see if you can beat the best linear model you’ve trained with sklearn with a simple neural network using
skorch.

In [None]:
!pip install skorch


Collecting skorch
  Downloading skorch-1.0.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.4/239.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: skorch
Successfully installed skorch-1.0.0


In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from skorch import NeuralNetClassifier
import torch
from torch import nn
import torch.nn.functional as F

# Reproducibility: use one fixed seed for torch's default generator and for the
# generator of the current CUDA device.
TORCH_SEED = 0
torch.manual_seed(TORCH_SEED)
torch.cuda.manual_seed(TORCH_SEED)

# Custom text normalizer to preprocess text
class TextNormalizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that strips ASCII punctuation and newlines.

    Case is preserved: the FeatureExtractor step that follows this one in the
    pipeline computes a capitalization ratio from the normalized text.
    """

    # Deletion table for `str.translate`, built once instead of on every call.
    # It removes every character in `string.punctuation` plus the newline.
    _STRIP_TABLE = str.maketrans('', '', string.punctuation + '\n')

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def _normalize_text(self, text):
        """Return `text` with ASCII punctuation and newline characters removed."""
        return text.translate(self._STRIP_TABLE)

    def transform(self, X, y=None):
        """Normalize each string of the iterable `X`; returns a list of strings."""
        return [self._normalize_text(text) for text in X]

# Custom feature extractor to add linguistic features
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Compute three min-max-scaled numeric features for every text.

    Feature columns, in order:
      0. vowel / consonant ratio
      1. share of uppercase characters
      2. number of adjacent identical character pairs

    `transform` returns the tuple `(X, scaled_features)` so that the texts
    themselves remain available to the next pipeline step.
    """

    vowels = set('aeiouäöüàéèëï')
    # The 21 basic Latin consonant letters, 'j' included.
    consonants = set('bcdfghjklmnpqrstvwxyz')

    def __init__(self):
        self.scaler = MinMaxScaler()

    def _to_bigrams(self, text):
        """Return all overlapping two-character substrings of `text`."""
        return [bg[0] + bg[1] for bg in zip(text, text[1:])]

    def _get_vowel_consonant_ratio(self, text):
        """Return vowels / (consonants + 1), counted on the lowercased text.

        The +1 in the denominator avoids a division by zero for texts without
        any listed consonant.
        """
        vf, cf = 0, 0
        for c in text.lower():
            if c in self.vowels:
                vf += 1
            elif c in self.consonants:
                cf += 1
        return vf / (cf + 1)

    def _get_capitalization_ratio(self, text):
        """Return uppercase characters / (len(text) + 1)."""
        up_count = sum(1 for c in text if c.isupper())
        return up_count / (len(text) + 1)

    def _get_double_char_freq(self, text):
        """Return how many adjacent character pairs consist of two equal characters."""
        return sum(1 for first, second in zip(text, text[1:]) if first == second)

    def _extract_num_features(self, texts):
        """Return an array with one row of the three raw features per text."""
        rows = [
            [
                self._get_vowel_consonant_ratio(text),
                self._get_capitalization_ratio(text),
                self._get_double_char_freq(text),
            ]
            for text in texts
        ]
        return np.array(rows)

    def fit(self, X, y=None):
        """Fit the min-max scaler on the raw features of `X`; returns self."""
        numerical_features = self._extract_num_features(X)
        self.scaler.fit(numerical_features)
        return self

    def transform(self, X, y=None):
        """Return `(X, scaled_features)` using the scaler fitted in `fit`."""
        numerical_features = self._extract_num_features(X)
        return X, self.scaler.transform(numerical_features)

# Wrapper to combine vectorizer and additional features
class VectorizerWrapper(BaseEstimator, TransformerMixin):
    """TF-IDF step operating on `(texts, numerical_features)` pairs.

    Only the texts are vectorized (n-gram range 1-2, at most 500 features);
    the numerical features are handed on unchanged.
    """

    def __init__(self):
        # The attribute is called `countvec` although it holds a TfidfVectorizer.
        self.countvec = TfidfVectorizer(max_features=500, ngram_range=(1, 2))

    def fit(self, X, y=None):
        """Fit the vectorizer on the text half of the pair; returns self."""
        documents, _ignored = X
        self.countvec.fit(documents)
        return self

    def transform(self, X, y=None):
        """Return `(tfidf_matrix, numerical_features)` for the given pair."""
        documents, extra_features = X
        tfidf_matrix = self.countvec.transform(documents)
        return tfidf_matrix, extra_features

# Transformer to convert sparse matrix to dense array
class MatrixToArrayConverter(BaseEstimator, TransformerMixin):
    """Densify the first element of a two-element input via `.toarray()`."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return `(X[0].toarray(), X[1])`; the second element is passed through."""
        dense_part = X[0].toarray()
        passthrough = X[1]
        return dense_part, passthrough

# Transformer to unify text and numerical features into a single array
class MatrixUnifier(BaseEstimator, TransformerMixin):
    """Join the two arrays of the input pair column-wise into one float32 array."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Concatenate `X[0]` and `X[1]` along axis 1 and cast the result to float32."""
        left, right = X[0], X[1]
        return np.concatenate((left, right), axis=1).astype(np.float32)

# Define the neural network module
class ClassifierModule(nn.Module):
    """Feed-forward classifier: input_size -> num_units -> 50 -> num_classes.

    `forward` returns the output layer's values directly; no softmax is
    applied inside the module.
    """

    def __init__(self, input_size=600, num_units=200, num_classes=2, nonlin=F.relu, dropout=0.5):
        super().__init__()
        self.num_units = num_units
        self.nonlin = nonlin

        # Layers are created in the same order as they are applied in `forward`.
        self.dense0 = nn.Linear(input_size, num_units)
        self.dropout = nn.Dropout(dropout)
        self.dense1 = nn.Linear(num_units, 50)
        self.output = nn.Linear(50, num_classes)

    def forward(self, X, **kwargs):
        """Map a batch of feature vectors to one score per class."""
        hidden = self.dropout(self.nonlin(self.dense0(X)))
        # The second hidden layer always uses ReLU, independent of `nonlin`.
        hidden = F.relu(self.dense1(hidden))
        scores = self.output(hidden)
        # squeeze(dim=1) only changes the shape when dimension 1 has size 1.
        return scores.squeeze(dim=1)

# Network dimensions. The input width has to match the pipeline's feature
# matrix: the TF-IDF columns (max_features=500 in VectorizerWrapper) plus the
# three numeric features produced by FeatureExtractor.
input_size = 500 + 3
num_classes = len(le_fitted.classes_)

# Wrap the torch module in a skorch classifier so it can be used as the final
# step of the scikit-learn pipeline.
net = NeuralNetClassifier(
    ClassifierModule(input_size=input_size, num_classes=num_classes),
    criterion=nn.CrossEntropyLoss(),
    lr=0.1,
    max_epochs=30,
    # device='cuda',  # enable this to train with CUDA
)

# Assemble the end-to-end pipeline: raw strings go in, the skorch network is
# the final estimator. Steps 2-4 exchange two-element tuples, which step 5
# merges into a single float32 array.
pipe = Pipeline(
    [
        ('TextNormalizer', TextNormalizer()),                  # 1: strip punctuation / newlines
        ('FeatureExtractor', FeatureExtractor()),              # 2: add scaled numeric features
        ('Vectorizer', VectorizerWrapper()),                   # 3: TF-IDF on the texts
        ('MatrixToArrayConverter', MatrixToArrayConverter()),  # 4: sparse -> dense
        ('MatrixUnifier', MatrixUnifier()),                    # 5: concatenate both parts
        ('net', net),                                          # 6: neural network classifier
    ],
    verbose=True,
)

# Fit the model
pipe.fit(train_df['text'].to_numpy(), y_train_encoded)

# Predict on the test set
preds = pipe.predict(test_df['text'].to_numpy())

# Calculate F1 scores. scikit-learn metrics expect the ground truth first and
# the predictions second: f1_score(y_true, y_pred, ...).
f1_micro = f1_score(y_test_encoded, preds, average='micro')
f1_macro = f1_score(y_test_encoded, preds, average='macro')
print(f'F1-micro-score on the test set: {f1_micro}')
print(f'F1-macro-score on the test set: {f1_macro}')


[Pipeline] .... (step 1 of 6) Processing TextNormalizer, total=   2.7s
[Pipeline] .. (step 2 of 6) Processing FeatureExtractor, total=  14.8s
[Pipeline] ........ (step 3 of 6) Processing Vectorizer, total=  11.5s
[Pipeline]  (step 4 of 6) Processing MatrixToArrayConverter, total=   0.1s
[Pipeline] ..... (step 5 of 6) Processing MatrixUnifier, total=   0.0s
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m3.2970[0m       [32m0.0774[0m        [35m3.2926[0m  0.8329
      2        [36m3.2897[0m       [32m0.1344[0m        [35m3.2850[0m  0.6929
      3        [36m3.2814[0m       [32m0.2341[0m        [35m3.2753[0m  0.6830
      4        [36m3.2695[0m       [32m0.3041[0m        [35m3.2608[0m  0.7295
      5        [36m3.2518[0m       [32m0.4385[0m        [35m3.2380[0m  0.7979
      6        [36m3.2219[0m       [32m0.5511[0m        [35m3.1973[0m  0.8010
      7        [36m3.1