In [2]:
import os
from random import randint
import pandas as pd
import seaborn as sns

In [3]:
# Corpus root: ../data/korpus relative to the notebook's working directory.
folder = os.path.join(os.getcwd(), '..', 'data', 'korpus')
# Each model family keeps its n-gram CSV tables in its own 'ngram' sub-folder.
vanilla, unk = (os.path.join(folder, 'ngram', kind) for kind in ('vanilla', 'unk'))

In [4]:
# Load the cleaned corpus. `words` keeps its 'Word' column as a NumPy array and
# is consulted by generate_text when mapping unseen words to <UNK>.
df = pd.read_csv(
    os.path.join(folder, 'norm_korpus_clean.csv')
)
words = df.loc[:, 'Word'].to_numpy()

In [5]:
def _top_continuations(grams, probs, history, top_k):
    """Return the index labels of the ``top_k`` most probable n-grams that extend ``history``."""
    # The trailing ';' anchors the match on a whole-word boundary; a bare
    # prefix test would let the history 'kont' also match 'kontra;...'.
    matches = grams.str.startswith(history + ';', na=False)
    return probs[matches].nlargest(top_k).index


def generate_text(phrase:str, model:str, n:int, top_k:int=50, max_words:int=None):
    """Extend ``phrase`` word by word with an n-gram model until ``</s>`` is drawn.

    At every step the candidate n-grams are narrowed to the ``top_k`` most
    probable ones and one of them is picked uniformly at random; its last
    word is appended to the phrase.

    Parameters
    ----------
    phrase : str
        Seed text; words are separated by single spaces.
    model : str
        ``'vanilla'`` or ``'laplace'`` (both read the tables under the
        module-level ``vanilla`` folder) or ``'unk'`` (tables under the
        module-level ``unk`` folder).
    n : int
        Order of the model: 1 (unigram), 2 (bigram) or 3 (trigram).
    top_k : int, default 50
        How many of the most probable candidates to sample from.
    max_words : int, optional
        Upper bound on the number of generated words. ``None`` (default)
        keeps generating until ``</s>`` is produced.

    Returns
    -------
    str
        ``phrase`` followed by the generated words, space separated. It ends
        with ``</s>`` unless generation was cut short by ``max_words``.

    Raises
    ------
    ValueError
        If ``model`` or ``n`` is not supported, ``top_k`` is smaller than 1,
        the n-gram table is empty, or the ``'vanilla'`` model has no n-gram
        that continues the current history.
    """
    if model == 'vanilla' or model == 'laplace':
        model_path = vanilla
    elif model == 'unk':
        model_path = unk
    else:
        raise ValueError('Model does not exist!')

    # Validate before touching the disk so a bad call fails fast and cleanly.
    if n not in (1, 2, 3):
        raise ValueError('Choose Unigram, Bigram or Trigram!')
    if top_k < 1:
        raise ValueError('top_k must be at least 1')

    xgrams = ['unigram.csv', 'bigram.csv', 'trigram.csv']
    xgrams_types = ['Unigram', 'Bigram', 'Trigram']
    ngram_type = xgrams_types[n-1]

    print("Loading Models: ",end='')
    df_ngram = pd.read_csv(os.path.join(model_path, xgrams[n-1]))
    if df_ngram.empty:
        raise ValueError('The n-gram table is empty!')
    grams = df_ngram[ngram_type]
    # Convert once instead of on every generation step.
    probs = df_ngram['Probability'].astype(float)

    prev_ngrams = None
    if n != 1:
        prev_df_gram = pd.read_csv(os.path.join(model_path, xgrams[n-2]))
        # A set gives constant-time "has this history been seen?" lookups.
        prev_ngrams = set(prev_df_gram[xgrams_types[n-2]].unique())
    print('[OK]')
    print('Generating Sentence...')

    # Unigrams ignore the history, so their candidate list never changes.
    unigram_top = probs.nlargest(top_k).index if n == 1 else None
    vocabulary = None  # built lazily; only the 'unk' fallback needs it

    generated_word = ""
    n_generated = 0
    while generated_word != '</s>':
        if max_words is not None and n_generated >= max_words:
            break

        if n == 1:
            candidates = unigram_top
        else:
            # The history is the last n-1 words of the phrase; empty tokens
            # (e.g. from a trailing space in the seed) are ignored.
            tokens = [t for t in phrase.split(' ') if t]
            context = tokens[-(n-1):]
            history = ';'.join(context)

            # The vanilla model always looks the history up directly.
            known = model == 'vanilla' or history in prev_ngrams

            if not known and model == 'unk':
                # Replace unknown words with <UNK> and look the history up again.
                if vocabulary is None:
                    vocabulary = set(words)
                context = [w if w in vocabulary else '<UNK>' for w in context]
                history = ';'.join(context)
                known = history in prev_ngrams

            candidates = _top_continuations(grams, probs, history, top_k) if known else []

            if len(candidates) == 0:
                if model == 'vanilla':
                    # No smoothing: an unseen history has no continuation at all.
                    raise ValueError(
                        "No %s continues the history '%s'; the vanilla model "
                        "cannot generate a next word." % (ngram_type.lower(), history)
                    )
                # With Laplace smoothing every word follows an unseen history
                # with the same probability, so the next word is a random guess.
                candidates = [df_ngram.index[randint(0, len(df_ngram)-1)]]

        # Pick one of the candidates uniformly at random.
        gram = grams.at[candidates[randint(0, len(candidates)-1)]]
        generated_word = gram if n == 1 else gram.split(';')[-1]

        # Add to the current phrase.
        phrase += ' ' + generated_word
        n_generated += 1

    return phrase


In [5]:
%%time
# Unigram generation with the unsmoothed ('vanilla') tables. Each step draws
# uniformly from the 50 most probable unigrams, so the sentence only ends once
# '</s>' happens to be drawn.
generate_text('Jiena kont', 'vanilla', 1)

Loading Models: [OK]
Generating Sentence...
Wall time: 132 ms


"Jiena kont dawn kif li L- xi L- kien l- kif jkun qed qed għall- għal jew l- fl- fuq ma' id- kull se fuq minn is- xi ma l- aktar lil din wara ma' oħra b' biex fuq L- biex se dawn hemm L- il- kien is- jkun id- il- it- hemm jew sena ir- biex lill- lil hemm L- din oħra tiegħu hemm jkun tal- dawn lill- lill- dan ma fl- jkun sena u dwar aktar xi sena ma se ir- ħafna <s> b' dwar fuq minn b' xi għal lill- għall- sena f' fuq oħra mill- lil 1 <s> li kien jew din il- ħafna xi u se xi tiegħu oħra minn meta 1 għal <s> sena meta dan dan f' jew jkun L- it- dwar biex kull dawn L- ta' qed kull biex li sena biex tal- kull il- f' għal tiegħu se L- kien għal u f' jew tiegħu tiegħu is- is- ħafna dawn fil- fuq għall- lil b' ir- hemm it- meta ma f' kif f' jew sena ir- fil- meta ħafna minn kif lill- fuq tal- dawn b' fuq jkun <s> is- <s> fuq biex xi fuq tiegħu fil- ma' is- b' aktar it- ta' ir- 1 l- Il- aktar lill- fl- Il- lill- dwar minn tal- f' tal- f' dak kif meta dan jew lill- fl- ma' ta' <s> se u ir- jkun

In [6]:
%%time
# Bigram generation with the vanilla tables: the next word is conditioned on
# the last word of the phrase.
generate_text('Jiena kont', 'vanilla', 2)

Loading Models: [OK]
Generating Sentence...
Wall time: 5.12 s


'Jiena kont li Mark Tulius Cicerus 106 kif jiġi mill- familja li </s>'

In [15]:
%%time
# Trigram generation with the vanilla tables: the next word is conditioned on
# the last two words of the phrase.
# NOTE(review): the saved output below lists intermediate histories (e.g.
# 'Jiena;kont') that the generate_text defined in this notebook does not print;
# the cell was presumably run against an earlier version - re-run to refresh.
generate_text('Jiena kont', 'vanilla', 3)

Loading Models: [OK]
Generating Sentence...
Jiena;kont
kont;għadni
għadni;skolastiku
skolastiku;hu
hu;kien
kien;jagħmel
Wall time: 8.72 s


'Jiena kont għadni skolastiku hu kien jagħmel </s>'

In [8]:
%%time
# 'laplace' reads the same tables as 'vanilla'; for unigrams the model name
# does not affect how the next word is drawn.
generate_text('Jiena kont', 'laplace', 1)

Loading Models: [OK]
Generating Sentence...
Wall time: 142 ms


'Jiena kont sena jew din meta se u se dawn li oħra dak lil is- minn </s>'

In [9]:
%%time
# Bigram generation with the 'laplace' model: when the previous word is not a
# known unigram, a random bigram is picked instead.
generate_text('Jiena kont', 'laplace', 2)

Loading Models: [OK]
Generating Sentence...
Wall time: 5.94 s


"Jiena kont f' pajjiżi mhux talli qatt x- xewqa sempliċi ikunu determinati f' isem philodendron </s>"

In [10]:
%%time
# Trigram generation with the 'laplace' model: when the previous two words are
# not a known bigram, a random trigram is picked instead.
generate_text('Jiena kont', 'laplace', 3)

Loading Models: [OK]
Generating Sentence...
Wall time: 49.1 s


"Jiena kont naħseb jien faqqiegħ li qed jiġri biex jitla' fuq \xad vapur irid juri bil- fatti ta' dak hawn Malta kontra l- Iskozja ssir membru tal- unjoni monetarja u ekonomika il- fatt dwar kif għandu jitqies bħala persuna b' saħħitha f' dan 1 għan tiġi l- Ħadd li għadda Malta għelbet lill- goalkeeper avversarju għal skor ta' 5 sena ma ukoll ma bħala mistenni </s>"

In [11]:
%%time
# Unigram generation from the 'unk' tables.
generate_text('Jiena kont', 'unk', 1)

Loading Models: [OK]
Generating Sentence...
Wall time: 47.6 ms


"Jiena kont dwar dak u għall- fil- lil ma' meta L- il- fil- minn fuq wara ma wara dwar dak meta jkun dak dan </s>"

In [12]:
%%time
# Bigram generation from the 'unk' tables: an unseen previous word is replaced
# by <UNK> before the history is looked up again.
generate_text('Jiena kont', 'unk', 2)

Loading Models: [OK]
Generating Sentence...
Wall time: 7.95 s


'Jiena kont għaliex jekk huwa parti fejn dawn ġew <UNK> of l- ħajja normali stabbilit għall- finanzi illi ruħha kull u għal bl- ATT XX </s>'

In [13]:
%%time
# Trigram generation from the 'unk' tables: words missing from the corpus
# vocabulary are replaced by <UNK> when the two-word history is unseen.
generate_text('Jiena kont', 'unk', 3)

Loading Models: [OK]
Generating Sentence...
Wall time: 24.3 s


"Jiena kont għadni żgħir niġri isfel stess lit- tarbija għaċ- ċajt Offi mifhum mozzjoni Abu trattament ażjenda men mhux ikunu ma' grupp żgħażagħ jorganizzaw numru ta' pazjenti bl- iskizofrenija kif ukoll fl- Istitut Mediterranju u fid- dekasteru tagħha </s>"

In [27]:
# Same 'unk' trigram model, different seed phrase.
generate_text('Il- Laburisti', 'unk', 3)

Loading Models: [OK]
Generating Sentence...


"Il- Laburisti wkoll riedu l- permess mill- Ministru ta' qabel fis- 26 u 27 ta' Diċembru l- ġenituri kienu ġew trasferiti lill- Awtorità dik l- era </s>"

In [24]:
# Same 'unk' trigram model, different seed phrase.
generate_text('In- Nazzjonalisti', 'unk', 3)

Loading Models: [OK]
Generating Sentence...


"In- Nazzjonalisti qatt ma ltqajt ma' qaddisa ferħana Chiara kienet mistiedna għall- festival tal- inbid </s>"

In [34]:
# Same 'unk' trigram model, different seed phrase.
generate_text('Nhar it-', 'unk', 3)

Loading Models: [OK]
Generating Sentence...


"Nhar it- Tnejn 10 ta' Lulju 1994 li jistabbillixxi </s>"

In [11]:
# Same 'unk' trigram model, different seed phrase.
generate_text('Għawdex huwa', 'unk', 3)

Loading Models: [OK]
Generating Sentence...


'Għawdex huwa servizz skond in- nomeklatura tal- Komunità għandu jiġi sottomess il- parteċipant </s>'