In [137]:
from _collections import defaultdict
from functools import reduce
import re
import math
import random


In [138]:
UNK_TAG = '<UNK>'
DIGIT_TAG = '<NUM>'
WEB_TAG = '<WEB>'
LANDAS =[0.15966386554621848, 0.008403361344537815, 0.8319327731092437]
MOST_COMMON = {
    "the", "be", "to", "of", "and", "a", "that", "have", "i"
                , "it", "for", "not", "on", "he", "as", "you", "do", "at", "this", "but"
                , "his", "by", "from", "they", "we", "her", "she", "or", "an", "will", "my", "would"
                , "all", "me", "when", "no", "just", "him","e" "<num>" , "d", 
    UNK_TAG , DIGIT_TAG , WEB_TAG, ".","<num>"}


In [139]:
def get_data(path):
    with open(path, '+r') as file:
        lines = format_data(file.readlines())
        return lines


def find_unk(lines):
    curpus = ' ////// '.join(lines)
    words = set(curpus.split())
    for word in words:
        if curpus.count(word) == 1:
            curpus = curpus.replace(word, UNK_TAG)
    lines = curpus.split(' ////// ')
    # print(lines)
    return lines

In [169]:
def format_data(lines):
    formatted_lines = list(map(
        lambda x: re.sub('https?\s*:\s*/\s*/\s*[^\s<>"]+|www\.[^\s<>"]', WEB_TAG,
                         x), lines))
    formatted_lines = list(map(lambda x: re.sub("[^a-zA-Z.0-9$]", ' ', x), formatted_lines))
    # formatted_lines = list(map(lambda x: re.sub("\s'", "'", x), formatted_lines))
    formatted_lines = list(map(lambda x: re.sub('\d{1,}', f' {DIGIT_TAG} ', x), formatted_lines))
    formatted_lines = [' '.join(line.strip().lower().split()) for line in formatted_lines]
    formatted_lines = ['<s> ' * 2 + line + ' <s/>' for line in formatted_lines]
    return formatted_lines


def generate_n_gram(n, lines):
    grams = []
    for sentence in lines:
        words = sentence.split()
        for i in range(len(words) - n):
            gram = words[i:i + n]
            if gram not in grams:
                grams.append(gram)
    return grams



In [170]:
def learn(lines):
    unigrams = defaultdict(lambda: 0)
    bigrams = defaultdict(lambda: defaultdict(lambda: 0))
    trigrams = defaultdict(lambda: defaultdict(lambda: 0))
    text = ' '.join(lines)
    s = len(text)
    # print('wrongly' in text)
    p_unigrams = defaultdict(lambda: 1 / s)
    p_bigrams = defaultdict(lambda: defaultdict(lambda: 1 / s))
    p_trigrams = defaultdict(lambda: defaultdict(lambda: 1 / s))
    words = set()
    for k in generate_n_gram(1, lines):
        unigrams[k[0]] += text.count(k[0])
        p_unigrams[k[0]] = unigrams[k[0]] / s
        words.add(k[0])
        assert (p_unigrams[k[0]] <= 1)
        # if unigrams[k[0]] == 1:
        #     unigrams[UNK_TAG] += 1\
    print('unigrams computed')
    for k in generate_n_gram(2, lines):
        k.__contains__('panic')
        tmp = ' '.join(k)
        bigrams[k[0]][k[1]] += text.count(tmp)
        p_bigrams[k[0]][k[1]] = bigrams[k[0]][k[1]] / unigrams[k[0]]
        assert (p_bigrams[k[0]][k[1]] <= 1)

    print('bigrams computed')
    for k in generate_n_gram(3, lines):
        tmp = ' '.join(k)
        trigrams[(k[0], k[1])][k[2]] += text.count(tmp)
        p_trigrams[(k[0], k[1])][k[2]] = trigrams[(k[0], k[1])][k[2]] / bigrams[k[0]][k[1]]
        assert (p_trigrams[(k[0], k[1])][k[2]] <= 1)
    print('trigrams computed')
    return {
        'trigram': p_trigrams,
        'bigram': p_bigrams,
        'unigram': p_unigrams,
        'words': words,
        'size': s
    }


In [171]:
def read_test(pathes):
    with open(pathes[0], '+r') as file:
        lines = format_data(file.readlines())
    with open(pathes[1], '+r') as file:
        labels = list(map(lambda x: re.sub('[^a-zA-Z.$]', ' ', x), file.readlines()))
        labels = [' '.join(label.strip().lower().split()) for label in labels]
    return lines, labels


In [172]:
def predict(words, model, real, some_random_words, landa):
    unigram, bigram, trigram = model['unigram'], model['bigram'], model['trigram']
    predicts = set(
        list(trigram[(words[0], words[1])].keys()) + list(bigram[words[1]].keys()) + some_random_words).difference(
        MOST_COMMON)
    max_prob = -1
    best_pr = ''
#     print(f'predict set has it : {real in predicts}\n')
#     print(f'prediction set : {len(predicts)}\n')
    for pr in predicts:
        prob = get_prob(unigram, bigram, trigram, pr, words, landa)
#         print(f'{pr} : {prob}')
        if prob > max_prob:
            max_prob = prob
            best_pr = pr

    return best_pr


In [173]:
def get_random_landas(seed):
    random.seed(seed)
    arr = [random.randint(1, 100) for _ in range(3)]
    s = sum(arr)
    arr = list(map(lambda x: x / s, arr))
    return arr

In [174]:
def back_off(unigram, bigram, trigram, words, landa):
    return landa[0] * trigram[(words[0], words[1])][words[2]] + landa[1] * bigram[words[1]][words[2]] + landa[2] * \
           unigram[words[2]]


def get_prob(unigram, bigram, trigram, pr, words, landa):
    tmp = [[words[0], words[1], pr], [words[1], pr, words[2]], [pr, words[2], words[3]]]
    prob = reduce(lambda x, y: x * y, [back_off(unigram, bigram, trigram, t, landa) for t in tmp])
    return prob



In [175]:
def get_random_words(unigram, size):
    m = min(unigram.values())
    random_words = list(filter(lambda x: unigram[x] <= m, unigram.keys()))

    def select_random(seed):
        random.seed(seed)
        words = random.choices(random_words, k=5)
        return words

    return select_random


def cost_function(y_hat, y):
    return math.fabs(y_hat ** 2 - y ** 2)

In [176]:
def test(lines, labels, model, landa):
    counter = 0
    cost = 0
    random_selector = get_random_words(model['unigram'], model['size'])
    print('testing started')
    for i in range(len(lines)):
        l = lines[i].split()
        j = l.index('$')
        words = l[j - 2:j] + l[j + 1:j + 3]
        piece = list(filter(lambda x: UNK_TAG if x not in model['words'] else x, words))
#         print('--------------------------------\n')
#         print(f'sentence : {i}\n')
#         print(piece)
        s = predict(piece, model, labels[i], random_selector(i), landa)
        if s == labels[i]:
            counter += 1
            print(
                f"success : predicted : {s}:{model['unigram'][s]} , real : {labels[i]} :{model['unigram'][labels[i]]}\n")
        else:
            pass
#             print(
#                 f"failure : predicted : {s}:{model['unigram'][s]} , real : {labels[i]}: {model['unigram'][labels[i]]}\n")
#         print(f"trained data has it: {labels[i] in model['words']}")
        cost += cost_function(get_prob(model['unigram'], model['bigram'], model['trigram'], s, words, landa),
                              get_prob(model['unigram'], model['bigram'], model['trigram'], labels[i], words, landa))

    print(f'total cost: {cost} count: {counter}')
    return counter, cost


In [177]:
def learn_landas(lines_test, labels, model, num_iteration=5):
    all_landas = dict()
    for i in range(num_iteration):
        landa = get_random_landas(i)
        count, cost = test(lines_test, labels, model, landa)
        try:
            if all_landas[count][0] > cost:
                all_landas[count] = [cost, landa]
        except:
            all_landas[count] = [cost, landa]

    print(f'min cost : {max(all_landas.keys())}')
    return all_landas[max(all_landas.keys())]

In [178]:
lines_train = get_data('Train_data.rtf')


In [179]:
model = learn(lines_train)


unigrams computed
bigrams computed
trigrams computed


In [180]:
lines_test, labels = read_test(['Test_data.rtf', 'labels.rtf'])

In [182]:
print(test(lines_test, labels, model , LANDAS))


testing started
success : predicted : disease:3.696538807496581e-05 , real : disease :3.696538807496581e-05

success : predicted : improved:1.2321796024988602e-05 , real : improved :1.2321796024988602e-05

success : predicted : high:0.00019714873639981763 , real : high :0.00019714873639981763

success : predicted : decided:1.8482694037482904e-05 , real : decided :1.8482694037482904e-05

success : predicted : product:9.241347018741451e-05 , real : product :9.241347018741451e-05

success : predicted : about:0.00029572310459972646 , real : about :0.00029572310459972646

success : predicted : with:0.0011336052342989513 , real : with :0.0011336052342989513

success : predicted : statement:9.857436819990882e-05 , real : statement :9.857436819990882e-05

success : predicted : example:1.8482694037482904e-05 , real : example :1.8482694037482904e-05

success : predicted : style:4.3126286087460105e-05 , real : style :4.3126286087460105e-05

success : predicted : efforts:3.0804490062471504e-05 , r

In [68]:
{'1' , '2' , '3'}.difference({'1','2'})

{'3'}

In [None]:
def j_cost(words , landas , unigram , bigram , trigram):
    return math.log(back_off(unigram , bigram , trigram , words , landas))/3

def gradient()