In [1]:
#import library
import numpy as np
import math
import pandas as pd
import re
from collections import Counter
import os

In [2]:
def buildUnigramModel(Text):
    '''
    BUILD UNIGRAM MODEL
    IS : Diberikan input sebuah data berisi text
    FS : Meng-outputkan hasil dari model unigram yang dibuat dalam bentuk dictionary (key: kata; value: probabilitas kemunculan kata tersebut)
    Note : Lakukan proses cleaning dengan menghapus punctuation dan mengubah teks menjadi lower case.
    '''
    words_low = Text.str.lower()
    words= words_low.str.split()
    punctuation = '!"#$%&\'’“”()*+,-./:;<=>?@[\\]^_`{|}~'
    table = str.maketrans('', '', punctuation)
    
    stripped = [None] * len(words)
    for i in range (len(words)):
        stripped[i] = [w.translate(table) for w in words[i]]
    
    cnt = Counter()
    for i in range (len(stripped)):
        for allword in stripped[i]:
            cnt[allword] += 1
        
    dictiUni = dict(cnt)
    
    for allwords in (dictiUni):
        prob = dictiUni[allwords]/len(dictiUni)
        dictiUni[allwords] = prob
    
    return dictiUni
   

def buildBigramModel(Text):
    '''
    BUILD BIGRAM MODEL
    IS : Diberikan input sebuah data berisi text
    FS : Meng-outputkan hasil dari model bigram yang dibuat dalam bentuk dictionary (key: pasangan kata; value: probabilitas kemunculan pasangan kata tersebut)
    Note : Lakukan proses cleaning dengan menghapus punctuation dan mengubah teks menjadi lower case.
    '''
    words_low = Text.str.lower()
    words= words_low.str.split()
    punctuation = '!"#$%&\'’“”()*+,-./:;<=>?@[\\]^_`{|}~'
    table = str.maketrans('', '', punctuation)
    
    stripped = [None] * len(words)
    for i in range (len(words)):
        stripped[i] = [w.translate(table) for w in words[i]]
    
    cnt = Counter()
    for i in range (len(stripped)):
        for allword in stripped[i]:
            cnt[allword] += 1
    jumkata = dict(cnt)
    
    libigram = []
    for i in range (len(stripped)):
        libigram.append(('<s>',stripped[i][0]))
        for j in range (len(stripped[i])-1):
            libigram.append((stripped[i][j],stripped[i][j+1]))
    
    cnt2 = Counter()
    for allword in libigram:
        cnt2[allword] += 1
    
    dictibig = dict(cnt2)
    
    for allwords in dictibig:
        if allwords[0] == '<s>':
            prob = dictibig[allwords]/100
            dictibig[allwords] = prob
        else:
            prob = dictibig[allwords]/jumkata[allwords[0]]
            dictibig[allwords] = prob

    return dictibig


def nextBestWord(bigramModel, currentWord):
    '''
    MENAMPILKAN NEXT BEST WORD
    IS : Menerima input sebuah kata
    FS : Meng-outputkan kata berikutnya yang memiliki probabilitas tertinggi berdasarkan model bigram
    '''
    possible = []
    for word in (bigramModel):
        if word[0] == currentWord:
            possible.append([bigramModel[word],word[1]])
    nextWord = sorted(possible, reverse=True)
    return nextWord[0][1]
   


def nextTenBestWords(bigramModel, currentWord):
    '''
    MENYIMPAN TOP 10 NEXT BEST WORD
    IS : Menerima input sebuah kata
    FS : Menghasilkan list berisi 10 kata berikutnya (beserta probabilitasnya) dengan probabilitas tertinggi berdasarkan model bigram. 
    '''
    possible = []
    for word in (bigramModel):
        if word[0] == currentWord:
            possible.append([bigramModel[word],word[1]])
    nextWord = sorted(possible, reverse=True)[:10]
    return nextWord


def generateSentence(bigramModel, length):
    '''
    GENERATE SENTENCE
    IS : Menerima input model bigram dan panjang kalimat yang ingin di-generate
    FS : Mengembalikan kalimat dengan panjang sesuai inputan
    Note : Generate sentence
    '''
    kalimat = []
    threshold = 0.65 #Threshold digunakan agar tidak terjadi looping kata pada kalimat.
    for i in range (n):
        if i == 0:
            bestword = nextTenBestWords(bigramModel, '<s>')
        else:
            bestword = nextTenBestWords(bigramModel, kalimat[len(kalimat)-1])
        if np.random.uniform(0,1) <= threshold or len(bestword)==1:
            kalimat.append(bestword[0][1])
        else:
            ind = 0
            while ind==0:
                ind = np.random.randint(len(bestword))
            kalimat.append(bestword[ind][1])
    spasi = ' '
    return spasi.join(kalimat)

In [10]:
if __name__ == '__main__':
    print("TUGAS LANGUAGE MODELING NLP - SFY")
    print("SILAKAN MASUKKAN IDENTITAS ANDA")
    Nama = input("NAMA : ")
    NIM = input("NIM : ")

    os.system("pause")
    os.system("cls")

    #import dataset
    data = pd.read_csv('text.csv')

    print("TUGAS 1. TAMPILKAN 5 BARIS PERTAMA DARI DATASET")
    print()
    print("HASIL : ")
    print(data.head())

    os.system("pause")
    os.system("cls")

    print("TUGAS 2. BUAT MODEL UNIGRAM")
    print()
    print("HASIL : ")
    print(buildUnigramModel(data['text']))

    os.system("pause")
    os.system("cls")

    print("TUGAS 3. BUAT MODEL BIGRAM")
    print()
    print("HASIL : ")
    bigramModel = buildBigramModel(data['text'])
    print(bigramModel)    

    os.system("pause")
    os.system("cls")

    print("TUGAS 4. MENAMPILKAN NEXT BEST WORD")
    print()
    print("HASIL : ")
    print("of -> ",nextBestWord(bigramModel,"of"))
    print("update -> ",nextBestWord(bigramModel,"update"))
    print("hopes -> ",nextBestWord(bigramModel,"hopes"))

    os.system("pause")
    os.system("cls")

    print("TUGAS 5. TOP 10 BEST NEXT WORD")
    print()
    print("HASIL : ")
    print("of -> ",nextTenBestWords(bigramModel,"of"))
    print("update -> ",nextTenBestWords(bigramModel,"update"))
    print("hopes -> ",nextTenBestWords(bigramModel,"hopes"))

    os.system("pause")
    os.system("cls")

    print("TUGAS 6. GENERATE KALIMAT")
    print()
    n = int(input("Panjang Kalimat : "))
    print("HASIL : ")
    print(generateSentence(bigramModel, n))

    os.system("pause")
    os.system("cls")

    print("SELAMAT", Nama ,"ANDA SUDAH MENYELESAIKAN TUGAS LANGUAGE MODELING NLP-SFY")

TUGAS LANGUAGE MODELING NLP - SFY
SILAKAN MASUKKAN IDENTITAS ANDA
NAMA : Adrii
NIM : 131313
TUGAS 1. TAMPILKAN 5 BARIS PERTAMA DARI DATASET

HASIL : 
                                                text
0  Oh, how the headlines blared:\nChatbots were T...
1  If you’ve ever found yourself looking up the s...
2  Machine learning is increasingly moving from h...
3  If your understanding of A.I. and Machine Lear...
4  Want to learn about applied Artificial Intelli...
TUGAS 2. BUAT MODEL UNIGRAM

HASIL : 
TUGAS 3. BUAT MODEL BIGRAM

HASIL : 


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



TUGAS 5. TOP 10 BEST NEXT WORD

HASIL : 
of ->  [[0.19190575717271519, 'the'], [0.044651339540186206, 'a'], [0.022420672620178606, 'our'], [0.0222306669200076, 'this'], [0.016910507315219456, 'data'], [0.01520045601368041, 'these'], [0.011400342010260307, 'machine'], [0.008170245107353221, 'what'], [0.008170245107353221, 'each'], [0.007980239407182215, 'ai']]
update ->  [[0.24444444444444444, 'this'], [0.13333333333333333, 'the'], [0.1111111111111111, 'our'], [0.06666666666666667, 'its'], [0.06666666666666667, '2'], [0.06666666666666667, '1'], [0.044444444444444446, 'it'], [0.044444444444444446, '492017'], [0.022222222222222223, 'those'], [0.022222222222222223, 'rules']]
hopes ->  [[0.5, 'were'], [0.25, 'to'], [0.25, 'of']]
TUGAS 6. GENERATE KALIMAT

Panjang Kalimat : 40
HASIL : 
machine over the same time to the data science of what we can be able to the model is a lot of the same time to the same time is a lot smarter it will be a lot of the
SELAMAT Adrii ANDA SUDAH MENYELESAIKAN TUG