In [2]:
import csv
import os
from re                 import sub

from sklearn.metrics    import precision_score, recall_score

from LanguageClassifier import LanguageClassifier
from read_csv           import read_train, read_test

## Define structures for storing classifiers and predictions 

In [3]:
# Language-pair identifiers. Each one is used as the CSV file-name prefix under
# Language_datasets/ and, split on "_", as the pair of label names for predictions.
pc: str = "polish_czech"
sp: str = "spanish_portugese"

# Trained classifier for each language pair, keyed by the identifiers above.
classifiers: dict[str, LanguageClassifier] = {}

# Predicted labels for each language pair. The numeric suffix is the one n-gram
# order NOT passed in `ignore` when classifying (1 = unigrams ... 4 = quadgrams).
predictions_1: dict[str, list[str]] = {}
predictions_2: dict[str, list[str]] = {}
predictions_3: dict[str, list[str]] = {}
predictions_4: dict[str, list[str]] = {}

# Labels returned by read_test alongside the test texts, per language pair.
actual: dict[str, list[str]] = {}

## Clean the train and test data

In [4]:
def clean_language_csv(path: str, codes: tuple[str, ...]) -> None:
    """Rewrite the CSV at ``path`` in place, keeping only rows tagged with ``codes``.

    Two regex passes are applied to the whole file content:

    1. Rows of the form ``<digits>`` + comma-or-tab + a language tag that does
       not start with any of ``codes`` are deleted.
    2. Rows of the form ``<digits>`` + separator + code + separator + text, where
       neither separator is a comma, are rewritten as ``<digits>,<code>,<text>``.

    Both patterns are anchored on the preceding newline, so the first line of
    the file is never modified.

    Args:
        path: File to read and then overwrite.
        codes: Language tags to keep. They are interpolated into the regex
            unescaped, so they must not contain regex metacharacters.
    """
    reject: str = "".join(f"(?!{code})" for code in codes)
    accept: str = "|".join(codes)

    with open(path, "r") as f:
        lines: str = f.read()

    # Pass 1: drop rows belonging to any other language.
    lines = sub(rf"\n(\d+)[,\t]{reject}.*", r"", lines)
    # Pass 2: normalise the remaining non-comma-separated rows to commas.
    lines = sub(rf"\n(\d+)[^,]({accept})[^,](.*)", r"\n\1,\2,\3", lines)

    with open(path, "w") as f:
        f.write(lines)


# Language tags kept for each pair; same processing order as before (sp, then pc).
for pair, pair_codes in [(sp, ("spa", "por")), (pc, ("pol", "ces"))]:
    for dataset in ["train", "test"]:
        clean_language_csv(f"Language_datasets/{pair}_{dataset}.csv", pair_codes)

## Train the classifiers and then use them to predict the test data
Training and prediction take about 8 minutes in total for both language pairs on my machine (11th-gen i5, 8 cores, 2.4 GHz).

In [5]:
# For every language pair: train a classifier on the train CSV, then label the
# test CSV four times, each time passing three of the four n-gram orders in `ignore`.
for langs in [pc, sp]:
    print(f"Starting training for {langs}")
    train_set: list[str] = read_train(f"Language_datasets/{langs}_train.csv")
    classifiers[langs] = LanguageClassifier(train_set)

    test_set: list[str]
    test_set, actual[langs] = read_test(f"Language_datasets/{langs}_test.csv")

    print("Making predictions")
    model = classifiers[langs]
    # classify() returns an index into this list of label names.
    label_names = langs.split('_')
    for store, skipped in (
        (predictions_1, (2, 3, 4)),
        (predictions_2, (1, 3, 4)),
        (predictions_3, (1, 2, 4)),
        (predictions_4, (1, 2, 3)),
    ):
        # A fresh list is handed to every call, as in the original code.
        store[langs] = [
            label_names[model.classify(text, ignore=list(skipped))]
            for text in test_set
        ]

Starting training for polish_czech
	Making classifier
	Setting up languages
Making predictions
Starting training for spanish_portugese
	Making classifier
	Setting up languages
Making predictions


### Sanity check

In [6]:
# Show the first 10 characters of the first entry of `train_set`, i.e. the training
# data loaded in the last iteration of the training loop (the `sp` pair).
train_set[0][:10]

'¿eres ciud'

## Q1

In [7]:
# (label written in the CSV header, attribute name / file-name stem) per n-gram order.
NGRAM_ORDERS: list[tuple[str, str]] = [
    ("Unigram",  "unigrams"),
    ("Bigram",   "bigrams"),
    ("Trigram",  "trigrams"),
    ("Quadgram", "quadgrams"),
]

# Make sure the target directory exists so the cell works on a fresh checkout.
os.makedirs("output/Q1", exist_ok=True)

# For every language pair and n-gram order, write one CSV with a row per n-gram
# and a column per language holding that language's stored value for the n-gram
# (0 when the language does not have it).
for langs in [pc, sp]:
    classifier = classifiers[langs]
    languages  = classifier.languages

    # One zero-initialised table per n-gram order, keyed by every n-gram the
    # classifier knows about.
    ngrams: list[dict[str, list[int]]] = [
        {key: [0] * len(languages) for key in getattr(classifier, attr)}
        for _label, attr in NGRAM_ORDERS
    ]

    for (label, attr), table in zip(NGRAM_ORDERS, ngrams):
        # Fill column i with the values stored on the i-th language.
        for i, lang in enumerate(languages):
            per_language = getattr(lang, attr)
            for ngram in per_language:
                table[ngram][i] = per_language[ngram]

        # csv.writer quotes n-grams containing ',' or '"', which previously
        # produced rows with the wrong number of columns.
        with open(f"output/Q1/{attr}_{langs}.csv", "w", newline="") as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerow([label, *langs.split('_')])
            for ngram, row in table.items():
                writer.writerow([ngram, *row])

## Q2

In [13]:
# Print a Markdown-style table of weighted precision and recall for each
# language pair, one row per n-gram order.
for langs in [pc, sp]:
    truth = actual[langs]
    rows = (
        ("Unigrams",  predictions_1[langs]),
        ("Bigrams",   predictions_2[langs]),
        ("Trigrams",  predictions_3[langs]),
        ("Quadgrams", predictions_4[langs]),
    )

    print(f"Precision and recall for {langs}")
    print("| ngrams    | Precision          | Recall            |")
    print("|-----------|--------------------|-------------------|")
    for row_label, predicted in rows:
        precision = precision_score(truth, predicted, average='weighted', zero_division=0)
        recall    = recall_score(truth, predicted, average='weighted', zero_division=0)
        # Label is left-padded to 9 characters to match the header column width.
        print(f"| {row_label:<9} | {precision} | {recall} |")
    print()

Precision and recall for polish_czech
| ngrams    | Precision          | Recall            |
|-----------|--------------------|-------------------|
| Unigrams  | 0.9954357721590916 | 0.995428476435874 |
| Bigrams   | 0.9980883061493411 | 0.9980882719640928 |
| Trigrams  | 0.9988918539039896 | 0.9988917518632422 |
| Quadgrams | 0.9883530343793918 | 0.9883356883606239 |

Precision and recall for spanish_portugese
| ngrams    | Precision          | Recall            |
|-----------|--------------------|-------------------|
| Unigrams  | 0.96665866723783 | 0.9666193828815346 |
| Bigrams   | 0.990241912735074 | 0.9902395856378756 |
| Trigrams  | 0.9952695962016461 | 0.9952694525058237 |
| Quadgrams | 0.8252179436028418 | 0.8222173058653583 |



In [9]:
# Make sure the target directory exists so the cell works on a fresh checkout.
os.makedirs("output/Q2", exist_ok=True)

# Dump the true labels and the four prediction lists for every language pair,
# one label per line, to output/Q2/<pair>_<suffix>.txt.
for langs in [pc, sp]:
    label_files: dict[str, list[str]] = {
        "actual": actual[langs],
        "pred1":  predictions_1[langs],
        "pred2":  predictions_2[langs],
        "pred3":  predictions_3[langs],
        "pred4":  predictions_4[langs],
    }
    for suffix, labels in label_files.items():
        with open(f"output/Q2/{langs}_{suffix}.txt", 'w') as f:
            f.writelines(f"{label}\n" for label in labels)