# Add-K+Interpolation 

In [37]:
from math import log
from n_gram import *
import csv

In [35]:
# Build the positive-class model: read the whole training corpus, turn it
# into one long line, and unpack the ten values returned by gram().
txt = "pos_pre.txt"
with open(txt, 'r') as file:
    file_str = file.read()
# Newlines become spaces so the corpus is a single space-separated string.
file_str = ' '.join(file_str.split('\n'))

# tri_word and tri_prob carry no class suffix, so whichever model is built
# last overwrites them.
(length_pos, uni_word_pos, uni_cnt_pos, uni_prob_pos,
 bi_word_pos, bi_cnt_pos, bi_prob_pos,
 tri_word, tri_cnt_pos, tri_prob) = gram(file_str)

# Build the negative-class model: read the whole training corpus, turn it
# into one long line, and unpack the ten values returned by gram().
txt = "neg_pre.txt"
with open(txt, 'r') as file:
    file_str = file.read()
# Newlines become spaces so the corpus is a single space-separated string.
file_str = ' '.join(file_str.split('\n'))

# tri_word and tri_prob carry no class suffix, so whichever model is built
# last overwrites them.
(length_neg, uni_word_neg, uni_cnt_neg, uni_prob_neg,
 bi_word_neg, bi_cnt_neg, bi_prob_neg,
 tri_word, tri_cnt_neg, tri_prob) = gram(file_str)

In [39]:
"""
Functions to compute add-k smoothed probabilities for unigrams, bigrams and trigrams.
""" 
def calc_addK_tri(k, pre_word, pre_word2, word, tri_cnt, bi_cnt, uni_cnt):
    """Return the add-k smoothed trigram probability of `word`.

    The value is

        (count(pre_word, pre_word2, word) + k)
        / (count(pre_word, pre_word2) + k * V)

    where V is the number of keys in `uni_cnt`. N-grams missing from the
    count tables are treated as having count 0.

    Args:
        k: Smoothing constant added to every trigram count.
        pre_word: First word of the trigram (two positions before `word`).
        pre_word2: Second word of the trigram (directly before `word`).
        word: The word being predicted.
        tri_cnt: Mapping from (w1, w2, w3) tuples to trigram counts.
        bi_cnt: Mapping from (w1, w2) tuples to bigram counts.
        uni_cnt: Mapping keyed by vocabulary word; only its size is used.

    Returns:
        The smoothed probability, or 0.0 when the denominator is zero
        (e.g. k == 0 with an unseen context) instead of raising
        ZeroDivisionError.
    """
    trigram_count = tri_cnt.get((pre_word, pre_word2, word), 0)
    context_count = bi_cnt.get((pre_word, pre_word2), 0)
    denominator = context_count + k * len(uni_cnt)
    if denominator == 0:
        # Nothing observed and no smoothing mass: the estimate is undefined,
        # so report zero probability rather than crash.
        return 0.0
    return (trigram_count + k) / denominator

def calc_addK_bi(k, pre_word, word, bi_cnt, uni_cnt):
    """Return the add-k smoothed bigram probability of `word` after `pre_word`.

    The value is

        (count(pre_word, word) + k) / (count(pre_word) + k * V)

    where V is the number of keys in `uni_cnt`. Entries missing from the
    count tables are treated as having count 0.

    Args:
        k: Smoothing constant added to every bigram count.
        pre_word: The word directly before `word`.
        word: The word being predicted.
        bi_cnt: Mapping from (w1, w2) tuples to bigram counts.
        uni_cnt: Mapping from word to unigram count.

    Returns:
        The smoothed probability, or 0.0 when the denominator is zero
        (e.g. k == 0 with an unseen `pre_word`) instead of raising
        ZeroDivisionError.
    """
    bigram_count = bi_cnt.get((pre_word, word), 0)
    context_count = uni_cnt.get(pre_word, 0)
    denominator = context_count + k * len(uni_cnt)
    if denominator == 0:
        # Nothing observed and no smoothing mass: the estimate is undefined,
        # so report zero probability rather than crash.
        return 0.0
    return (bigram_count + k) / denominator

def calc_addK_uni(k, word, length, uni_cnt):
    """Return the add-k smoothed unigram probability of `word`.

    The value is (count(word) + k) / (length + k * V), where V is the
    number of keys in `uni_cnt`. A word missing from `uni_cnt` is treated
    as having count 0.

    Args:
        k: Smoothing constant added to every unigram count.
        word: The word being scored.
        length: Normalising total used in the denominator (presumably the
            number of tokens in the training corpus; confirm against gram()).
        uni_cnt: Mapping from word to unigram count.

    Returns:
        The smoothed probability, or 0.0 when the denominator is zero
        instead of raising ZeroDivisionError.
    """
    word_count = uni_cnt.get(word, 0)
    denominator = length + k * len(uni_cnt)
    if denominator == 0:
        # Empty corpus and no smoothing mass: the estimate is undefined,
        # so report zero probability rather than crash.
        return 0.0
    return (word_count + k) / denominator

In [41]:
"""
Tune the parameters
"""
def _read_tokenized(path):
    """Return the non-blank lines of `path` as lists of whitespace-split tokens."""
    with open(path, 'r') as file:
        # split() already discards the trailing newline; blank lines yield an
        # empty list and are dropped because there is nothing to classify.
        return [tokens for tokens in (line.split() for line in file) if tokens]


def _interp_log_prob(sentence, weight1, weight2, k1, k2, k3,
                     length, uni_cnt, bi_cnt, tri_cnt):
    """Return the interpolated add-k log-probability of `sentence` under one model.

    The second word is scored with a unigram/bigram mix weighted
    (1 - weight1, weight1). Every later word is scored with a
    unigram/bigram/trigram mix weighted (1 - weight1 - weight2, weight1,
    weight2). The first word is only used as context.

    Args:
        sentence: List of tokens.
        weight1: Interpolation weight of the bigram estimate.
        weight2: Interpolation weight of the trigram estimate.
        k1, k2, k3: Add-k constants for the unigram, bigram and trigram
            estimates respectively.
        length, uni_cnt, bi_cnt, tri_cnt: Statistics of the model to score
            against, passed straight to the calc_addK_* functions.

    Returns:
        Sum of the natural logs of the per-word probabilities. An empty
        sentence scores 0.0; a one-token sentence falls back to the
        unigram estimate of that token.
    """
    if not sentence:
        return 0.0
    if len(sentence) == 1:
        return log(calc_addK_uni(k1, sentence[0], length, uni_cnt))

    weight3 = 1 - weight1 - weight2

    # Second word: no trigram context exists yet.
    prob_uni = calc_addK_uni(k1, sentence[1], length, uni_cnt)
    prob_bi = calc_addK_bi(k2, sentence[0], sentence[1], bi_cnt, uni_cnt)
    log_prob = log((1 - weight1) * prob_uni + weight1 * prob_bi)

    for i in range(2, len(sentence)):
        pre_word, pre_word2, word = sentence[i - 2], sentence[i - 1], sentence[i]
        prob_uni = calc_addK_uni(k1, word, length, uni_cnt)
        prob_bi = calc_addK_bi(k2, pre_word2, word, bi_cnt, uni_cnt)
        prob_tri = calc_addK_tri(k3, pre_word, pre_word2, word,
                                 tri_cnt, bi_cnt, uni_cnt)
        # Interpolation
        log_prob += log(weight3 * prob_uni + weight1 * prob_bi + weight2 * prob_tri)
    return log_prob


# The dev sets do not depend on the hyper-parameters, so read and tokenize
# them once instead of once per grid point.
pos_dev = _read_tokenized("pos_dev_pre.txt")
neg_dev = _read_tokenized("neg_dev_pre.txt")
total_cnt = len(pos_dev) + len(neg_dev)

pos_model = (length_pos, uni_cnt_pos, bi_cnt_pos, tri_cnt_pos)
neg_model = (length_neg, uni_cnt_neg, bi_cnt_neg, tri_cnt_neg)

for weight1 in [0.01, 0.03, 0.05]:
    for weight2 in [0.03, 0.05, 0.07]:
        for k1 in [1.0, 1.5, 2.0]:
            for k2 in [0.8, 1.5, 2.0]:
                for k3 in [0.5, 1.0, 1.5, 2.0]:
                    correct_cnt = 0
                    for sentences, is_positive in ((pos_dev, True), (neg_dev, False)):
                        for sentence in sentences:
                            pos_prob = _interp_log_prob(
                                sentence, weight1, weight2, k1, k2, k3, *pos_model)
                            neg_prob = _interp_log_prob(
                                sentence, weight1, weight2, k1, k2, k3, *neg_model)
                            # A tie counts as a miss for either class.
                            if is_positive:
                                is_correct = neg_prob < pos_prob
                            else:
                                is_correct = neg_prob > pos_prob
                            if is_correct:
                                correct_cnt += 1
                    accuracy = float(correct_cnt) / total_cnt if total_cnt else 0.0
                    # Columns: weight1 weight2 k1 k2 k3   #sentences   accuracy
                    print("{} {} {} {} {}   {}   {}".format(
                        weight1, weight2, k1, k2, k3, total_cnt, accuracy))

KeyboardInterrupt: 

In [11]:
"""
Combine the dev and training data into a new training set
"""

# construct positive model from the training and dev corpora together
txt = "pos_pre.txt"
with open(txt, 'r') as file:
    file_str = file.read().replace('\n', ' ')

txt = "pos_dev_pre.txt"
with open(txt, 'r') as file:
    file_str_2 = file.read().replace('\n', ' ')

# If the training file does not end with a newline, its last token would
# fuse with the first dev token. Add a separator only when one is missing,
# so no extra whitespace is introduced otherwise.
if file_str and not file_str[-1].isspace():
    file_str += ' '
file_str = file_str + file_str_2

# Note: the unigram probabilities land in the unsuffixed name `uni_prob`
# here, so any existing `uni_prob_pos` is not refreshed by this cell.
length_pos,uni_word_pos,uni_cnt_pos,uni_prob,\
bi_word_pos,bi_cnt_pos,bi_prob_pos,tri_word,tri_cnt_pos,tri_prob= gram(file_str)

# construct negative model from the training and dev corpora together
txt = "neg_pre.txt"
with open(txt, 'r') as file:
    file_str = file.read().replace('\n', ' ')

txt = "neg_dev_pre.txt"
with open(txt, 'r') as file:
    file_str_2 = file.read().replace('\n', ' ')

# If the training file does not end with a newline, its last token would
# fuse with the first dev token. Add a separator only when one is missing,
# so no extra whitespace is introduced otherwise.
if file_str and not file_str[-1].isspace():
    file_str += ' '
file_str = file_str + file_str_2

# Note: the unigram probabilities land in the unsuffixed name `uni_prob`
# here, so any existing `uni_prob_neg` is not refreshed by this cell.
length_neg,uni_word_neg,uni_cnt_neg,uni_prob,\
bi_word_neg,bi_cnt_neg,bi_prob_neg,tri_word,tri_cnt_neg,tri_prob= gram(file_str)

In [17]:
"""
Do the classification for test data
"""

def _interp_log_prob(sentence, weight1, weight2, k1, k2, k3,
                     length, uni_cnt, bi_cnt, tri_cnt):
    """Return the interpolated add-k log-probability of `sentence` under one model.

    The second word is scored with a unigram/bigram mix weighted
    (1 - weight1, weight1). Every later word is scored with a
    unigram/bigram/trigram mix weighted (1 - weight1 - weight2, weight1,
    weight2). The first word is only used as context.

    Args:
        sentence: List of tokens.
        weight1: Interpolation weight of the bigram estimate.
        weight2: Interpolation weight of the trigram estimate.
        k1, k2, k3: Add-k constants for the unigram, bigram and trigram
            estimates respectively.
        length, uni_cnt, bi_cnt, tri_cnt: Statistics of the model to score
            against, passed straight to the calc_addK_* functions.

    Returns:
        Sum of the natural logs of the per-word probabilities. An empty
        sentence scores 0.0; a one-token sentence falls back to the
        unigram estimate of that token.
    """
    if not sentence:
        return 0.0
    if len(sentence) == 1:
        return log(calc_addK_uni(k1, sentence[0], length, uni_cnt))

    weight3 = 1 - weight1 - weight2

    # Second word: no trigram context exists yet.
    prob_uni = calc_addK_uni(k1, sentence[1], length, uni_cnt)
    prob_bi = calc_addK_bi(k2, sentence[0], sentence[1], bi_cnt, uni_cnt)
    log_prob = log((1 - weight1) * prob_uni + weight1 * prob_bi)

    for i in range(2, len(sentence)):
        pre_word, pre_word2, word = sentence[i - 2], sentence[i - 1], sentence[i]
        prob_uni = calc_addK_uni(k1, word, length, uni_cnt)
        prob_bi = calc_addK_bi(k2, pre_word2, word, bi_cnt, uni_cnt)
        prob_tri = calc_addK_tri(k3, pre_word, pre_word2, word,
                                 tri_cnt, bi_cnt, uni_cnt)
        log_prob += log(weight3 * prob_uni + weight1 * prob_bi + weight2 * prob_tri)
    return log_prob


txt = "test_pre.txt"
with open(txt, 'r') as file:
    lines = file.readlines()

# Hyper-parameters used for the final predictions; the unigram weight is
# derived inside _interp_log_prob as 1 - weight1 - weight2.
weight1 = 0.01
weight2 = 0.05
k1 = 1.5
k2 = 0.8
k3 = 1.5

pos_model = (length_pos, uni_cnt_pos, bi_cnt_pos, tri_cnt_pos)
neg_model = (length_neg, uni_cnt_neg, bi_cnt_neg, tri_cnt_neg)

# One output row per input line, so IDs stay aligned with line numbers even
# when a line is blank (a blank line scores 0.0 under both models and
# therefore falls into the label-1 branch).
rows = []
for row_id, line in enumerate(lines, 1):
    # split() drops the trailing newline along with all other whitespace.
    sentence = line.split()
    pos_prob = _interp_log_prob(sentence, weight1, weight2, k1, k2, k3, *pos_model)
    neg_prob = _interp_log_prob(sentence, weight1, weight2, k1, k2, k3, *neg_model)
    # Label 0 when the positive model scores strictly higher, otherwise 1.
    rows.append((row_id, 0 if pos_prob > neg_prob else 1))
headers = ['ID','Prediction']


with open('submission_addK_interpolation.csv','w') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)

0
0
1
0
0
1
1
1
0
0
0
0
0
1
0
0
0
1
0
0
0
1
1
0
1
0
0
0
0
1
1
1
0
1
1
1
1
1
1
1
1
0
1
0
0
1
0
0
1
1
0
1
0
0
0
0
0
0
1
1
0
0
1
0
1
0
0
0
1
0
0
1
0
0
1
0
1
0
0
0
1
1
0
0
0
0
0
0
0
1
0
0
1
1
0
1
0
0
1
0
1
0
1
1
1
0
0
0
0
0
0
0
0
1
0
1
1
1
0
0
0
0
0
0
1
1
0
1
1
1
0
0
1
0
0
1
1
0
1
1
0
1
1
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
1
1
1
0
1
0
0
0
1
1
0
0
1
1
0
0
0
0
0
0
0
0
0
1
0
1
0
0
0
0
1
0
0
0
1
0
1
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
1
1
0
1
1
1
0
1
1
0
1
1
1
1
0
1
1
1
1
1
1
1
1
0
1
0
1
1
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
1
0
0
0
0
0
1
1
0
0
0
1
1
1
1
1
1
1
0
1
0
1
0
0
0
0
1
0
1
0
0
0
0
1
0
1
1
1
0
0
0
0
1
1
1
0
1
0
1
0
0
0
1
1
0
0
0
1
1
1
1
1
1
1
1
0
1
0
1
1
1
0
0
1
0
1
0
1
1
0
0
0
0
1
1
0
1
0
0
1
0
1
0
1
0
1
1
1
1
1
1
1
0
1
0
1
0
0
1
1
1
0
1
0
1
1
0
0
0
0
1
1
0
1
0
0
0
1
1
0
0
1
1
0
1
0
1
0
0
0
1
1
0
1
1
1
1
1
1
1
0
1
0
0
0
0
0
0
0
0
1
1
0
0
1
1
0
1
1
0
1
1
1
0
1
0
0
1
1
0
1
0
0
0
1
1
0
0
1
0
