## Imports i definició de les variables d'arxius
---

In [3]:

import re #Regexs
import os #Acces a fitxers

import nltk #Tokenització
nltk.download('punkt') #Tokenització

import numpy as np #Matriu de confusió
import seaborn as sns #Matriu de confusió
import matplotlib.pyplot as plt #Matriu de confusió
import pandas as pd #Matriu de confusió

train_files_list = [f for f in os.listdir('corpora') if re.search(r'_trn\.txt$', f)]
test_files_list = [f for f in os.listdir('corpora') if re.search(r'_tst\.txt$', f)]

print(train_files_list)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\albert\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['deu_trn.txt', 'eng_trn.txt', 'fra_trn.txt', 'ita_trn.txt', 'nld_trn.txt', 'spa_trn.txt']


## Preprocessing
---

· Eliminar digits del text 
· Convertir tot el text a minuscula
· Eliminar caracters especials
· Substitueix els espais en blanc continus per un de sol
· Concatena totes les frases amb un espai doble al mig


In [4]:

def preprocessing(text):
    
    #text = re.sub(r'[^\w\s]', '', text) # Elimina signes de puntuació
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) # Elimina caracters especials
    text = re.sub(r'\d+', '', text) # Elimina números
    text = text.lower() # Converteix a minúscules
    text = re.sub(r'[\s\n]+', ' ', text) # Elimina espais en blanc sobrants excepte \n
    text = re.sub(r' +', ' ', text) # Elimina espais en blanc llargs
    text = re.sub(r'\n', '  ', text) # Substitueix \n per dos espais en blanc
    
    
    
    return text


# Creació dels trigrames de caracters, amb la seva freqüència
---

In [5]:

def character_trigrams_finder(text, flag='trn'):
    finder = nltk.TrigramCollocationFinder.from_words(text)
    if flag == 'trn':
        finder.apply_freq_filter(5)
    trigrams = finder.ngram_fd.items()
    trigrams = sorted(trigrams, key=lambda x: x[1], reverse=True)

    return trigrams

In [6]:
def word_trigrams_finder(text, flag='trn'):
    finder = nltk.TrigramCollocationFinder.from_words(nltk.word_tokenize(text))
    if flag == 'trn':
        finder.apply_freq_filter(5)
    trigrams = finder.ngram_fd.items()
    trigrams = sorted(trigrams, key=lambda x: x[1], reverse=True)

    return trigrams

In [7]:
def trigram_assigner_ch(train_files_list):
    
    enclosed_trigrams_ch = {}

    for file in train_files_list:
        with open('corpora/' + file, 'r', encoding='utf-8') as f:
            language_id = file.split('_')[0] # ID de l'idioma
            print(language_id)
            text = f.read()
            text = preprocessing(text)
            enclosed_trigrams_ch[language_id] = character_trigrams_finder(text)

    return enclosed_trigrams_ch


In [8]:
def trigram_assigner_wds(train_files_list):
    
    enclosed_trigrams_wds = {}

    for file in train_files_list:
        with open('corpora/' + file, 'r', encoding='utf-8') as f:
            language_id = file.split('_')[0] # ID de l'idioma
            print(language_id)
            text = f.read()
            text = preprocessing(text)
            enclosed_trigrams_wds[language_id] = word_trigrams_finder(text)
            
    return enclosed_trigrams_wds

In [9]:
trigrams_ch = trigram_assigner_ch(train_files_list)

deu
eng
fra
ita
nld
spa


In [10]:
trigrams_ch

{'deu': [(('e', 'n', ' '), 72313),
  (('e', 'r', ' '), 45661),
  ((' ', 'd', 'e'), 30150),
  (('d', 'e', 'r'), 23029),
  (('i', 'e', ' '), 22891),
  (('i', 'c', 'h'), 21343),
  (('e', 'i', 'n'), 21122),
  (('s', 'c', 'h'), 20542),
  ((' ', 'd', 'i'), 19938),
  (('d', 'i', 'e'), 19806),
  (('c', 'h', ' '), 19533),
  (('n', ' ', 'd'), 18279),
  (('i', 'n', ' '), 16111),
  (('n', 'd', ' '), 16089),
  ((' ', 'e', 'i'), 15015),
  (('c', 'h', 'e'), 14996),
  ((' ', 'b', 'e'), 14890),
  ((' ', 'u', 'n'), 14595),
  (('d', 'e', 'n'), 14231),
  (('t', 'e', 'n'), 13748),
  ((' ', 'a', 'u'), 13621),
  ((' ', 'd', 'a'), 13574),
  (('u', 'n', 'd'), 13388),
  (('c', 'h', 't'), 12332),
  (('t', 'e', ' '), 12222),
  (('i', 'n', 'e'), 12093),
  (('g', 'e', 'n'), 12082),
  ((' ', 'i', 'n'), 12050),
  ((' ', 'g', 'e'), 11990),
  (('n', ' ', 's'), 10927),
  (('t', 'e', 'r'), 10872),
  (('u', 'n', 'g'), 10807),
  (('e', 's', ' '), 10784),
  (('n', 'd', 'e'), 10282),
  ((' ', 's', 'i'), 9978),
  (('t', ' ', 

## Separar el test per oracions
---

In [11]:

def test_files_separator(test_files_list):
    test_files = {}
    for file in test_files_list:
        with open('corpora/' + file, 'r', encoding='utf-8') as f:
            language_id = file.split('_')[0] # ID de l'idioma
            print(language_id)
            text = f.read()
            test_files[language_id] = nltk.sent_tokenize(text)
            for i in range(len(test_files[language_id])):
                test_files[language_id][i] = preprocessing(test_files[language_id][i])
    return test_files

In [12]:
test_files = test_files_separator(test_files_list)


deu
eng
fra
ita
nld
spa


In [13]:
test_files['deu'][0]

' dgapadhoc accu holding ag einladung zur generalversammlung und ernennung eines weiteren mitglieds der konzernleitung deutsch '

## Funcio de suavitzat
---


Sumatori del logaritme de (les aparicions del trigrama en el train + alpha / trigrames que apareixen en el train + alpha * vocabulari(beta))

In [14]:
def suavitzat(oracio):
    #chars es una lista, trigrams_ch es un dict
    chars = character_trigrams_finder(oracio, 'test')
    probabilities = {}
    for idioma in trigrams_ch: #iterem per idioma
        prob = 0
        for i in range(len(chars)):
            alpha = 0.5
            beta = (len(trigrams_ch[idioma]))
            aparicions = 0
            for elem in trigrams_ch[idioma]: #busco si el trigrama existe en el idioma
                if chars[i][0] == elem[0]:
                    aparicions = elem[1]
                    break
                else:
                    aparicions = 0
            prob_trigrama = ((aparicions + alpha) / ((len(trigrams_ch[idioma])) + alpha * beta)) # la nostra beta (trigrames totals del idioma) es el nombre de trigrames totals del train
            prob += np.log(prob_trigrama)
        probabilities[idioma] = prob
    return probabilities
#esta pal Enzo

In [17]:
suavitzat(test_files['nld'][900])

{'deu': -166.48668047719258,
 'eng': -160.85951690560847,
 'fra': -194.9073351684477,
 'ita': -212.75222941953422,
 'nld': -115.44452367860598,
 'spa': -194.20662938428356}