# In this notebook I'll be performing preprocessing on the dataset. 
#### Input: Raw data
#### Output: Train/ Test Data, Test Labels

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from collections import Counter
from filterpy.discrete_bayes import update, predict, normalize
import re
import string
import nltk

In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
#import raw data
df = pd.read_csv('../data/raw/formatted_data.csv',sep=';')

In [4]:
df

Unnamed: 0,language,text,length_text
0,bg,Състав на Парламента: вж. протоколиОдобряване ...,327263
1,cs,Schválení zápisu z předchozího zasedání: viz z...,317927
2,da,Genoptagelse af sessionenJeg erklærer Europa-P...,678400
3,de,Wiederaufnahme der SitzungsperiodeIch erkläre ...,747690
4,el,Επαvάληψη της συvσδoυΚηρύσσω την επανάληψη της...,523277
5,en,Resumption of the sessionI declare resumed the...,690268
6,es,Reanudación del período de sesionesDeclaro rea...,733658
7,et,Eelmise istungi protokolli kinnitamine vaata p...,324119
8,fi,Istuntokauden uudelleenavaaminen Julistan perj...,694523
9,fr,Reprise de la sessionJe déclare reprise la ses...,756201


In [5]:
#Sentence Tokenize
df['sent_tokenize'] = df.apply(lambda row: nltk.sent_tokenize(row['text']), axis=1)

In [6]:
df

Unnamed: 0,language,text,length_text,sent_tokenize
0,bg,Състав на Парламента: вж. протоколиОдобряване ...,327263,"[Състав на Парламента: вж., протоколиОдобряван..."
1,cs,Schválení zápisu z předchozího zasedání: viz z...,317927,[Schválení zápisu z předchozího zasedání: viz ...
2,da,Genoptagelse af sessionenJeg erklærer Europa-P...,678400,[Genoptagelse af sessionenJeg erklærer Europa-...
3,de,Wiederaufnahme der SitzungsperiodeIch erkläre ...,747690,[Wiederaufnahme der SitzungsperiodeIch erkläre...
4,el,Επαvάληψη της συvσδoυΚηρύσσω την επανάληψη της...,523277,[Επαvάληψη της συvσδoυΚηρύσσω την επανάληψη τη...
5,en,Resumption of the sessionI declare resumed the...,690268,[Resumption of the sessionI declare resumed th...
6,es,Reanudación del período de sesionesDeclaro rea...,733658,[Reanudación del período de sesionesDeclaro re...
7,et,Eelmise istungi protokolli kinnitamine vaata p...,324119,[Eelmise istungi protokolli kinnitamine vaata ...
8,fi,Istuntokauden uudelleenavaaminen Julistan perj...,694523,[Istuntokauden uudelleenavaaminen Julistan per...
9,fr,Reprise de la sessionJe déclare reprise la ses...,756201,[Reprise de la sessionJe déclare reprise la se...


In [7]:
#Split sentences into 80% train, 20% test. I do this in sentence data so that we retain full sentences
train_set=[]
test_set=[]
for i in df['sent_tokenize']:
    train, test = train_test_split(i, test_size = 0.2)
    train_set.append(train)
    test_set.append(test)

In [8]:
#create new columns for train and test sets
df['train'] = train_set
df['test'] = test_set

In [9]:
#sentence count for sanity check
df['train_sentence_count'] = df['train'].apply(lambda x: len(str(x).split('.')))
df['test_sentence_count'] = df['test'].apply(lambda x: len(str(x).split('.')))

### Ok Now I've got my training and testing sets,

I am joining the training set together from sent tokenized format.

In [10]:
#join every sentence in training set. So that I've got a big blob of words to work with
df['train_joined'] = df['train'].str.join(",") 

## Preprocess Training Set

#### Standard NLP preprocessing

In [11]:
#lower case
df['train_processed'] = df['train_joined'].apply(lambda x: x.lower())
#remove numbers
df['train_processed'] = df['train_processed'].apply(lambda x: re.sub("\d+", " ", x))
#remove punctuation
df['train_processed'] = df['train_processed'].apply(lambda x: re.sub('['+string.punctuation+']', '', x))
#remove whitespace
df['train_processed'] = df['train_processed'].apply(lambda x: x.strip())
#word tokenize
df['train_processed'] = df['train_processed'].apply(lambda x: nltk.word_tokenize(x))

In [12]:
#create character ngrams
def word2ngrams(text, n, exact=True):
    """ Convert text into character ngrams. """
    return ["".join(j) for j in zip(*[text[i:] for i in range(n)])]

In [13]:
#nested for loop for lists(words) in list(train_processed)
trigrams = []
for i in df['train_processed']:
    row = []
    for j in i:
        x = word2ngrams(j,3)
        row.append(x)
    #remove empty lists here for words that have less than 3 characters
    row = [x for x in row if x != []]
    trigrams.append(row)

In [14]:
#new column (I know its not always necesssary to do this. It just helps me keep things in order)
df['train_trigrams'] =trigrams

In [15]:
df

Unnamed: 0,language,text,length_text,sent_tokenize,train,test,train_sentence_count,test_sentence_count,train_joined,train_processed,train_trigrams
0,bg,Състав на Парламента: вж. протоколиОдобряване ...,327263,"[Състав на Парламента: вж., протоколиОдобряван...",[Прилагане и тълкуване на правилника изменение...,[Приложимо право към извъндоговорни задължения...,2872,685,Прилагане и тълкуване на правилника изменение ...,"[прилагане, и, тълкуване, на, правилника, изме...","[[при, рил, ила, лаг, ага, ган, ане], [тъл, ъл..."
1,cs,Schválení zápisu z předchozího zasedání: viz z...,317927,[Schválení zápisu z předchozího zasedání: viz ...,[Proto navrhuji věnovat se podrobněji otázce k...,[Peru: Vydání bývalého prezidenta Alberta Fuji...,2124,533,Proto navrhuji věnovat se podrobněji otázce ko...,"[proto, navrhuji, věnovat, se, podrobněji, otá...","[[pro, rot, oto], [nav, avr, vrh, rhu, huj, uj..."
2,da,Genoptagelse af sessionenJeg erklærer Europa-P...,678400,[Genoptagelse af sessionenJeg erklærer Europa-...,"[Men også det skal gennemføres professionelt.,...","[13.05 og genoptaget kl., Jeg tror ikke, at Pa...",4225,1071,"Men også det skal gennemføres professionelt.,V...","[men, også, det, skal, gennemføres, profession...","[[men], [ogs, gså], [det], [ska, kal], [gen, e..."
3,de,Wiederaufnahme der SitzungsperiodeIch erkläre ...,747690,[Wiederaufnahme der SitzungsperiodeIch erkläre...,[Für den neuen ESF-Planungszeitraum 2000-2006 ...,[Vom Berliner Gipfel 1999 blieb dann nur noch ...,3733,964,Für den neuen ESF-Planungszeitraum 2000-2006 s...,"[für, den, neuen, esfplanungszeitraum, sind, w...","[[für], [den], [neu, eue, uen], [esf, sfp, fpl..."
4,el,Επαvάληψη της συvσδoυΚηρύσσω την επανάληψη της...,523277,[Επαvάληψη της συvσδoυΚηρύσσω την επανάληψη τη...,[Η περιοχή ήταν οικονομικά καθυστερημένη κατά ...,"[καρακάξα, αυτό το γνωστό ασπρόμαυρο πουλί που...",2494,651,Η περιοχή ήταν οικονομικά καθυστερημένη κατά τ...,"[η, περιοχή, ήταν, οικονομικά, καθυστερημένη, ...","[[περ, ερι, ριο, ιοχ, οχή], [ήτα, ταν], [οικ, ..."
5,en,Resumption of the sessionI declare resumed the...,690268,[Resumption of the sessionI declare resumed th...,[There should be greater involvement of the pr...,[High levels of confidence are necessary to bo...,3648,918,There should be greater involvement of the pri...,"[there, should, be, greater, involvement, of, ...","[[the, her, ere], [sho, hou, oul, uld], [gre, ..."
6,es,Reanudación del período de sesionesDeclaro rea...,733658,[Reanudación del período de sesionesDeclaro re...,[Sabemos que en la era de las estrategias de a...,[La mayoría de las enmiendas son exclusivament...,3817,943,Sabemos que en la era de las estrategias de al...,"[sabemos, que, en, la, era, de, las, estrategi...","[[sab, abe, bem, emo, mos], [que], [era], [las..."
7,et,Eelmise istungi protokolli kinnitamine vaata p...,324119,[Eelmise istungi protokolli kinnitamine vaata ...,[Ükski riik ei saa alaliselt säilitada vahendi...,"[Tegemist on seega õiguse ühe liigiga, isegi k...",2141,522,Ükski riik ei saa alaliselt säilitada vahendit...,"[ükski, riik, ei, saa, alaliselt, säilitada, v...","[[üks, ksk, ski], [rii, iik], [saa], [ala, lal..."
8,fi,Istuntokauden uudelleenavaaminen Julistan perj...,694523,[Istuntokauden uudelleenavaaminen Julistan per...,"[Oli miten oli, minun täytyy ilmoittaa teille,...",[Toiseksi kansallisilla tuomioistuimilla on jo...,3649,897,"Oli miten oli, minun täytyy ilmoittaa teille, ...","[oli, miten, oli, minun, täytyy, ilmoittaa, te...","[[oli], [mit, ite, ten], [oli], [min, inu, nun..."
9,fr,Reprise de la sessionJe déclare reprise la ses...,756201,[Reprise de la sessionJe déclare reprise la se...,[La Commission prévoit de les adopter avant le...,[Une plus grande transparence contribuera égal...,3720,930,La Commission prévoit de les adopter avant le ...,"[la, commission, prévoit, de, les, adopter, av...","[[com, omm, mmi, mis, iss, ssi, sio, ion], [pr..."


In [16]:
#char trigram existed within lists(for each word). The following code takes them out of those list. Then for each row(language), I have a list of all the char trigrams present.
trigrams_joined = []
for a in df['train_trigrams']:
    x = [j for i in a for j in i]
    trigrams_joined.append(x)

In [17]:
df['train_trigrams_joined'] = trigrams_joined

In [18]:
df

Unnamed: 0,language,text,length_text,sent_tokenize,train,test,train_sentence_count,test_sentence_count,train_joined,train_processed,train_trigrams,train_trigrams_joined
0,bg,Състав на Парламента: вж. протоколиОдобряване ...,327263,"[Състав на Парламента: вж., протоколиОдобряван...",[Прилагане и тълкуване на правилника изменение...,[Приложимо право към извъндоговорни задължения...,2872,685,Прилагане и тълкуване на правилника изменение ...,"[прилагане, и, тълкуване, на, правилника, изме...","[[при, рил, ила, лаг, ага, ган, ане], [тъл, ъл...","[при, рил, ила, лаг, ага, ган, ане, тъл, ълк, ..."
1,cs,Schválení zápisu z předchozího zasedání: viz z...,317927,[Schválení zápisu z předchozího zasedání: viz ...,[Proto navrhuji věnovat se podrobněji otázce k...,[Peru: Vydání bývalého prezidenta Alberta Fuji...,2124,533,Proto navrhuji věnovat se podrobněji otázce ko...,"[proto, navrhuji, věnovat, se, podrobněji, otá...","[[pro, rot, oto], [nav, avr, vrh, rhu, huj, uj...","[pro, rot, oto, nav, avr, vrh, rhu, huj, uji, ..."
2,da,Genoptagelse af sessionenJeg erklærer Europa-P...,678400,[Genoptagelse af sessionenJeg erklærer Europa-...,"[Men også det skal gennemføres professionelt.,...","[13.05 og genoptaget kl., Jeg tror ikke, at Pa...",4225,1071,"Men også det skal gennemføres professionelt.,V...","[men, også, det, skal, gennemføres, profession...","[[men], [ogs, gså], [det], [ska, kal], [gen, e...","[men, ogs, gså, det, ska, kal, gen, enn, nne, ..."
3,de,Wiederaufnahme der SitzungsperiodeIch erkläre ...,747690,[Wiederaufnahme der SitzungsperiodeIch erkläre...,[Für den neuen ESF-Planungszeitraum 2000-2006 ...,[Vom Berliner Gipfel 1999 blieb dann nur noch ...,3733,964,Für den neuen ESF-Planungszeitraum 2000-2006 s...,"[für, den, neuen, esfplanungszeitraum, sind, w...","[[für], [den], [neu, eue, uen], [esf, sfp, fpl...","[für, den, neu, eue, uen, esf, sfp, fpl, pla, ..."
4,el,Επαvάληψη της συvσδoυΚηρύσσω την επανάληψη της...,523277,[Επαvάληψη της συvσδoυΚηρύσσω την επανάληψη τη...,[Η περιοχή ήταν οικονομικά καθυστερημένη κατά ...,"[καρακάξα, αυτό το γνωστό ασπρόμαυρο πουλί που...",2494,651,Η περιοχή ήταν οικονομικά καθυστερημένη κατά τ...,"[η, περιοχή, ήταν, οικονομικά, καθυστερημένη, ...","[[περ, ερι, ριο, ιοχ, οχή], [ήτα, ταν], [οικ, ...","[περ, ερι, ριο, ιοχ, οχή, ήτα, ταν, οικ, ικο, ..."
5,en,Resumption of the sessionI declare resumed the...,690268,[Resumption of the sessionI declare resumed th...,[There should be greater involvement of the pr...,[High levels of confidence are necessary to bo...,3648,918,There should be greater involvement of the pri...,"[there, should, be, greater, involvement, of, ...","[[the, her, ere], [sho, hou, oul, uld], [gre, ...","[the, her, ere, sho, hou, oul, uld, gre, rea, ..."
6,es,Reanudación del período de sesionesDeclaro rea...,733658,[Reanudación del período de sesionesDeclaro re...,[Sabemos que en la era de las estrategias de a...,[La mayoría de las enmiendas son exclusivament...,3817,943,Sabemos que en la era de las estrategias de al...,"[sabemos, que, en, la, era, de, las, estrategi...","[[sab, abe, bem, emo, mos], [que], [era], [las...","[sab, abe, bem, emo, mos, que, era, las, est, ..."
7,et,Eelmise istungi protokolli kinnitamine vaata p...,324119,[Eelmise istungi protokolli kinnitamine vaata ...,[Ükski riik ei saa alaliselt säilitada vahendi...,"[Tegemist on seega õiguse ühe liigiga, isegi k...",2141,522,Ükski riik ei saa alaliselt säilitada vahendit...,"[ükski, riik, ei, saa, alaliselt, säilitada, v...","[[üks, ksk, ski], [rii, iik], [saa], [ala, lal...","[üks, ksk, ski, rii, iik, saa, ala, lal, ali, ..."
8,fi,Istuntokauden uudelleenavaaminen Julistan perj...,694523,[Istuntokauden uudelleenavaaminen Julistan per...,"[Oli miten oli, minun täytyy ilmoittaa teille,...",[Toiseksi kansallisilla tuomioistuimilla on jo...,3649,897,"Oli miten oli, minun täytyy ilmoittaa teille, ...","[oli, miten, oli, minun, täytyy, ilmoittaa, te...","[[oli], [mit, ite, ten], [oli], [min, inu, nun...","[oli, mit, ite, ten, oli, min, inu, nun, täy, ..."
9,fr,Reprise de la sessionJe déclare reprise la ses...,756201,[Reprise de la sessionJe déclare reprise la se...,[La Commission prévoit de les adopter avant le...,[Une plus grande transparence contribuera égal...,3720,930,La Commission prévoit de les adopter avant le ...,"[la, commission, prévoit, de, les, adopter, av...","[[com, omm, mmi, mis, iss, ssi, sio, ion], [pr...","[com, omm, mmi, mis, iss, ssi, sio, ion, pré, ..."


In [19]:
# gets value counts of each unique trigram
trigrams_joined_series_value_counts = []
for i in trigrams_joined:
    x = pd.Series(i)
    y = x.value_counts()
    trigrams_joined_series_value_counts.append(y)

In [20]:
#concat list of series
df_new = pd.concat(trigrams_joined_series_value_counts, axis=1)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [21]:
#column names are our languages
df_new.columns = df['language']

In [22]:
df_new

language,bg,cs,da,de,el,en,es,et,fi,fr,...,it,lt,lv,nl,pl,pt,ro,sk,sl,sv
aaa,,,,,,,,5.0,9.0,,...,,,,3.0,,,,,,
aab,,1.0,,1.0,,,,46.0,1.0,,...,5.0,,,,2.0,2.0,,4.0,,
aac,,,,,,,1.0,,,,...,,,,,,,6.0,,,
aad,,,,,,,3.0,113.0,129.0,,...,3.0,,,140.0,,1.0,,,,
aae,,,,,,,,19.0,20.0,,...,,,,,,,,,,
aaf,,,,,,,,13.0,,,...,,,,15.0,,3.0,,,,
aag,,3.0,,,,,1.0,20.0,,,...,,,,242.0,,8.0,1.0,3.0,,
aah,,,,,,,4.0,2.0,9.0,,...,,,,,,,,,,1.0
aai,,,,,,,,15.0,38.0,,...,,10.0,,10.0,,2.0,,,,
aaj,,,,,,,,7.0,127.0,,...,,,,,,,,1.0,,


### I'll call these next few steps the smoothing steps

#### The following is the find the rows where only 1 language was present, then adding 0.5 to that count

In [23]:
#This gets the row counts
df_row_counts = pd.DataFrame(df_new.count(axis=1))
#filter the rows where row counts = 1
df_row_counts =df_row_counts.loc[df_row_counts[0] == 1]
#take the index of those rows and save that as a series
only_1_count_index=df_row_counts.index
#for the trigrams with only appearing in 1 language(appears in the previously mentions series), add 0.5
df_new.loc[df_new.index.isin(only_1_count_index)] = df_new.loc[df_new.index.isin(only_1_count_index)].add(0.5)

In [24]:
df_new

language,bg,cs,da,de,el,en,es,et,fi,fr,...,it,lt,lv,nl,pl,pt,ro,sk,sl,sv
aaa,,,,,,,,5.0,9.0,,...,,,,3.0,,,,,,
aab,,1.0,,1.0,,,,46.0,1.0,,...,5.0,,,,2.0,2.0,,4.0,,
aac,,,,,,,1.0,,,,...,,,,,,,6.0,,,
aad,,,,,,,3.0,113.0,129.0,,...,3.0,,,140.0,,1.0,,,,
aae,,,,,,,,19.0,20.0,,...,,,,,,,,,,
aaf,,,,,,,,13.0,,,...,,,,15.0,,3.0,,,,
aag,,3.0,,,,,1.0,20.0,,,...,,,,242.0,,8.0,1.0,3.0,,
aah,,,,,,,4.0,2.0,9.0,,...,,,,,,,,,,1.0
aai,,,,,,,,15.0,38.0,,...,,10.0,,10.0,,2.0,,,,
aaj,,,,,,,,7.0,127.0,,...,,,,,,,,1.0,,


In [25]:
#to get the probability distribution, divide each column by the column sum
df_train = df_new.div(df_new.sum())

In [26]:
#fill nan with zeroes
df_train = df_train.fillna(0)

I've run into the problem of zeroes when calculating KL divergeance (relative entropy).

https://mathoverflow.net/questions/72668/how-to-compute-kl-divergence-when-pmf-contains-0s

smoothing of training data is essential such as to eliminate  pln(p/0) errors present. where the 0 here is the training data(pmf)

using discrete bayes filter:

https://rlabbe.github.io/blog/2016/02/16/discrete-bayes-filter/

https://filterpy.readthedocs.io/en/latest/discrete_bayes/discrete_bayes.html

I actually decided to do it another way. Adding an infinitesimally small value to each point in the dataframe, then normalizing column-wise

In [27]:
df_train = df_train.add(0.0000001)

In [28]:
df_train = df_train.apply(lambda x: normalize(x))

In [29]:
df_train.to_csv('../data/processed/df_train.csv',index=True)

In [30]:
df.to_csv('../data/processed/df_checkpoint.csv',index=True)

In [31]:
df_saved = df.copy()

# Preprocessing Test set

In [32]:
#Lets select the 2 columns we'll be working with for test set.
df = df[['language','test']]

In [33]:
#Now I want to make each sentence in test set its own row, with the language as the target label
lst_col = 'test'

df = pd.DataFrame({
      col:np.repeat(df[col].values, df[lst_col].str.len())
      for col in df.columns.drop(lst_col)}
    ).assign(**{lst_col:np.concatenate(df[lst_col].values)})[df.columns]

In [34]:
#These are the the rows with less than 50 characters. Think I will remove them
df[df['test'].map(len) < 50]

Unnamed: 0,language,test
9,bg,Бъдещето на професионалния футбол в Европа вот14.
16,bg,14.
20,bg,протоколиСезиране на комисии: вж.
21,bg,24.
23,bg,протоколаПроверка на пълномощията: вж.
25,bg,Състояние на пчеларството гласуване16.
34,bg,протоколиВнасяне на документи: вж.
50,bg,Исках само да съм сигурен.
53,bg,Оценка на Евратом вот- Informe: Maldeikis11.
58,bg,протоколаДневен ред на следващото заседание: вж.


In [35]:
#remove rows with less than 50 characters
df = df[df['test'].map(len) >= 50]

In [36]:
df

Unnamed: 0,language,test
0,bg,Приложимо право към извъндоговорни задължения ...
1,bg,Тематична стратегия за устойчиво използване на...
2,bg,протоколиПрекъсване на сесиятаJe déclare inter...
3,bg,протоколиТрансфер на бюджетни средстваРед за р...
4,bg,"Това е скандално искане, което не може да бъде..."
5,bg,"- NL Гжо Председател, разискването за устойчив..."
6,bg,Научната дефиниция ще даде възможност продукти...
7,bg,"Конвенция относно компетентността, признаванет..."
8,bg,Направили сме много проучвания.Предлагам ви да...
10,bg,Сключване на съответните споразумения съгласно...


## The following code is copied from the train section. If this model goes to production, I should definitely rewrite all of this into a common function to avoid retyping all of this. As I suspect I would only be repeating the procedure once and since this is the exploration phase, I will just reuse the old code

In [37]:

#lower case
df['test_processed'] = df['test'].apply(lambda x: x.lower())
#remove numbers
df['test_processed'] = df['test_processed'].apply(lambda x: re.sub("\d+", "", x))
#remove punctuation
df['test_processed'] = df['test_processed'].apply(lambda x: re.sub('['+string.punctuation+']', '', x))
#remove whitespace
df['test_processed'] = df['test_processed'].apply(lambda x: x.strip())
#word tokenize
df['test_processed'] = df['test_processed'].apply(lambda x: nltk.word_tokenize(x))

In [38]:
#nested for loop for lists(words) in list(train_processed)
trigrams = []
for i in df['test_processed']:
    row = []
    for j in i:
        x = word2ngrams(j,3)
        row.append(x)
    #remove empty lists here for words that have less than 3 characters
    row = [x for x in row if x != []]
    trigrams.append(row)

In [39]:
len(trigrams)

10023

In [40]:
df['test_trigrams'] = trigrams

In [41]:
df

Unnamed: 0,language,test,test_processed,test_trigrams
0,bg,Приложимо право към извъндоговорни задължения ...,"[приложимо, право, към, извъндоговорни, задълж...","[[при, рил, ило, лож, ожи, жим, имо], [пра, ра..."
1,bg,Тематична стратегия за устойчиво използване на...,"[тематична, стратегия, за, устойчиво, използва...","[[тем, ема, мат, ати, тич, ичн, чна], [стр, тр..."
2,bg,протоколиПрекъсване на сесиятаJe déclare inter...,"[протоколипрекъсване, на, сесиятаje, déclare, ...","[[про, рот, ото, ток, око, кол, оли, лип, ипр,..."
3,bg,протоколиТрансфер на бюджетни средстваРед за р...,"[протоколитрансфер, на, бюджетни, средстваред,...","[[про, рот, ото, ток, око, кол, оли, лит, итр,..."
4,bg,"Това е скандално искане, което не може да бъде...","[това, е, скандално, искане, което, не, може, ...","[[тов, ова], [ска, кан, анд, нда, дал, алн, лн..."
5,bg,"- NL Гжо Председател, разискването за устойчив...","[nl, гжо, председател, разискването, за, устой...","[[гжо], [пре, ред, едс, дсе, сед, еда, дат, ат..."
6,bg,Научната дефиниция ще даде възможност продукти...,"[научната, дефиниция, ще, даде, възможност, пр...","[[нау, ауч, учн, чна, нат, ата], [деф, ефи, фи..."
7,bg,"Конвенция относно компетентността, признаванет...","[конвенция, относно, компетентността, признава...","[[кон, онв, нве, вен, енц, нци, ция], [отн, тн..."
8,bg,Направили сме много проучвания.Предлагам ви да...,"[направили, сме, много, проучванияпредлагам, в...","[[нап, апр, пра, рав, ави, вил, или], [сме], [..."
10,bg,Сключване на съответните споразумения съгласно...,"[сключване, на, съответните, споразумения, съг...","[[скл, клю, люч, ючв, чва, ван, ане], [съо, ъо..."


In [42]:
#each char trigram existed within lists. The following code takes them out of those list. Then for each row(language), I have a list of all the char trigrams present.
trigrams_joined = []
for a in df['test_trigrams']:
    x = [j for i in a for j in i]
    trigrams_joined.append(x)

In [43]:
df['test_trigrams_joined'] = trigrams_joined

In [44]:
df

Unnamed: 0,language,test,test_processed,test_trigrams,test_trigrams_joined
0,bg,Приложимо право към извъндоговорни задължения ...,"[приложимо, право, към, извъндоговорни, задълж...","[[при, рил, ило, лож, ожи, жим, имо], [пра, ра...","[при, рил, ило, лож, ожи, жим, имо, пра, рав, ..."
1,bg,Тематична стратегия за устойчиво използване на...,"[тематична, стратегия, за, устойчиво, използва...","[[тем, ема, мат, ати, тич, ичн, чна], [стр, тр...","[тем, ема, мат, ати, тич, ичн, чна, стр, тра, ..."
2,bg,протоколиПрекъсване на сесиятаJe déclare inter...,"[протоколипрекъсване, на, сесиятаje, déclare, ...","[[про, рот, ото, ток, око, кол, оли, лип, ипр,...","[про, рот, ото, ток, око, кол, оли, лип, ипр, ..."
3,bg,протоколиТрансфер на бюджетни средстваРед за р...,"[протоколитрансфер, на, бюджетни, средстваред,...","[[про, рот, ото, ток, око, кол, оли, лит, итр,...","[про, рот, ото, ток, око, кол, оли, лит, итр, ..."
4,bg,"Това е скандално искане, което не може да бъде...","[това, е, скандално, искане, което, не, може, ...","[[тов, ова], [ска, кан, анд, нда, дал, алн, лн...","[тов, ова, ска, кан, анд, нда, дал, алн, лно, ..."
5,bg,"- NL Гжо Председател, разискването за устойчив...","[nl, гжо, председател, разискването, за, устой...","[[гжо], [пре, ред, едс, дсе, сед, еда, дат, ат...","[гжо, пре, ред, едс, дсе, сед, еда, дат, ате, ..."
6,bg,Научната дефиниция ще даде възможност продукти...,"[научната, дефиниция, ще, даде, възможност, пр...","[[нау, ауч, учн, чна, нат, ата], [деф, ефи, фи...","[нау, ауч, учн, чна, нат, ата, деф, ефи, фин, ..."
7,bg,"Конвенция относно компетентността, признаванет...","[конвенция, относно, компетентността, признава...","[[кон, онв, нве, вен, енц, нци, ция], [отн, тн...","[кон, онв, нве, вен, енц, нци, ция, отн, тно, ..."
8,bg,Направили сме много проучвания.Предлагам ви да...,"[направили, сме, много, проучванияпредлагам, в...","[[нап, апр, пра, рав, ави, вил, или], [сме], [...","[нап, апр, пра, рав, ави, вил, или, сме, мно, ..."
10,bg,Сключване на съответните споразумения съгласно...,"[сключване, на, съответните, споразумения, съг...","[[скл, клю, люч, ючв, чва, ван, ане], [съо, ъо...","[скл, клю, люч, ючв, чва, ван, ане, съо, ъот, ..."


In [45]:
# gets value counts of each unique trigram
trigrams_joined_series_value_counts = []
for i in trigrams_joined:
    x = pd.Series(i)
    y = x.value_counts()
    trigrams_joined_series_value_counts.append(y)

In [46]:
len(trigrams_joined_series_value_counts)

10023

In [47]:
#I do this because I want the index (all the trigrams available) from the training set
tempdf = pd.DataFrame(index=df_new.index)

In [48]:
tempdf

aaa
aab
aac
aad
aae
aaf
aag
aah
aai
aaj
aak


In [49]:
#concat list of series
df_new2 = pd.concat(trigrams_joined_series_value_counts, axis=1)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [50]:
#column names are our languages
df_new2.columns = df['language']

In [51]:
df_new2.shape

(25431, 10023)

In [52]:
df_new2

language,bg,bg.1,bg.2,bg.3,bg.4,bg.5,bg.6,bg.7,bg.8,bg.9,...,sv,sv.1,sv.2,sv.3,sv.4,sv.5,sv.6,sv.7,sv.8,sv.9
aaa,,,,,,,,,,,...,,,,,,,,,,
aab,,,,,,,,,,,...,,,,,,,,,,
aad,,,,,,,,,,,...,,,,,,,,,,
aae,,,,,,,,,,,...,,,,,,,,,,
aaf,,,,,,,,,,,...,,,,,,,,,,
aag,,,,,,,,,,,...,,,,,,,,,,
aah,,,,,,,,,,,...,,,,,,,,,,
aai,,,,,,,,,,,...,,,,,,,,,,
aaj,,,,,,,,,,,...,,,,,,,,,,
aak,,,,,,,,,,,...,,,,,,,,,,


In [53]:
df_new

language,bg,cs,da,de,el,en,es,et,fi,fr,...,it,lt,lv,nl,pl,pt,ro,sk,sl,sv
aaa,,,,,,,,5.0,9.0,,...,,,,3.0,,,,,,
aab,,1.0,,1.0,,,,46.0,1.0,,...,5.0,,,,2.0,2.0,,4.0,,
aac,,,,,,,1.0,,,,...,,,,,,,6.0,,,
aad,,,,,,,3.0,113.0,129.0,,...,3.0,,,140.0,,1.0,,,,
aae,,,,,,,,19.0,20.0,,...,,,,,,,,,,
aaf,,,,,,,,13.0,,,...,,,,15.0,,3.0,,,,
aag,,3.0,,,,,1.0,20.0,,,...,,,,242.0,,8.0,1.0,3.0,,
aah,,,,,,,4.0,2.0,9.0,,...,,,,,,,,,,1.0
aai,,,,,,,,15.0,38.0,,...,,10.0,,10.0,,2.0,,,,
aaj,,,,,,,,7.0,127.0,,...,,,,,,,,1.0,,


In [54]:
#left join my test set onto my training set's index
df_test = tempdf.join(df_new2)

In [55]:
df_test.shape

(37746, 10023)

In [56]:
test_set_labels = pd.DataFrame(df['language'])

In [57]:
test_set_labels.to_csv('../data/processed/test_set_labels.csv',index=True)

In [58]:
#to get the probability distribution, divide each column by the column sum
df_test = df_test.div(df_test.sum())

In [59]:
#fill nan with zeroes
df_test = df_test.fillna(0)

In [60]:
df_test.to_csv('../data/processed/df_test.csv',index=True)