In [1]:
def get_parameter_vectors():
    '''
    This function parses e.txt and s.txt to get the  26-dimensional multinomial
    parameter vector (characters probabilities of English and Spanish) as
    described in section 1.2 of the writeup

    Returns: tuple of vectors e and s
    '''
    #Implementing vectors e,s as lists (arrays) of length 26
    #with p[0] being the probability of 'A' and so on
    e=[0]*26
    s=[0]*26

    with open('e.txt',encoding='utf-8') as f:
        for line in f:
            #strip: removes the newline character
            #split: split the string on space character
            char,prob=line.strip().split(" ")
            #ord('E') gives the ASCII (integer) value of character 'E'
            #we then subtract it from 'A' to give array index
            #This way 'A' gets index 0 and 'Z' gets index 25.
            e[ord(char)-ord('A')]=float(prob)
    f.close()

    with open('s.txt',encoding='utf-8') as f:
        for line in f:
            char,prob=line.strip().split(" ")
            s[ord(char)-ord('A')]=float(prob)
    f.close()

    return (e,s)

In [2]:
e,s = get_parameter_vectors()

In [3]:
e

[0.0834417,
 0.0154077,
 0.0273137,
 0.0414207,
 0.126063,
 0.0203102,
 0.0192096,
 0.0611306,
 0.0671336,
 0.00230115,
 0.00870435,
 0.0424212,
 0.0253127,
 0.068034,
 0.0770385,
 0.0166083,
 0.00090045,
 0.0568284,
 0.0611306,
 0.0937469,
 0.0285143,
 0.0106053,
 0.0234117,
 0.002001,
 0.0204102,
 0.0006003]

In [4]:
s

[0.121649,
 0.014906,
 0.0387155,
 0.0467187,
 0.140856,
 0.00690276,
 0.010004,
 0.0118047,
 0.0598239,
 0.00520208,
 0.00110044,
 0.052421,
 0.0308123,
 0.070028,
 0.0920368,
 0.0289116,
 0.0111044,
 0.0641257,
 0.0720288,
 0.0460184,
 0.0469188,
 0.0105042,
 0.00040016,
 0.00140056,
 0.0109044,
 0.00470188]

In [73]:
file = 'samples/letter1.txt';

In [74]:
def shred(filename):
    #Using a dictionary here. You may change this to any data structure of
    #your choice such as lists (X=[]) etc. for the assignment
    letter_count_dictionary = dict()
    with open (filename, encoding='utf-8') as f:
        for line in f:
            for character in line:
                uppercase_char = character.upper()
                ascii_val = ord(uppercase_char)
                if 90 >= ascii_val >= 65:
                    if uppercase_char in letter_dictionary.keys():
                        letter_count_dictionary[uppercase_char] += 1
                    else:
                        letter_count_dictionary[uppercase_char] = 1
    return letter_count_dictionary

# Shred file -- counts the number of characters in each file
# Store it as dictionary

In [75]:
shred(file)

{'D': 13,
 'O': 11,
 'S': 12,
 'R': 4,
 'U': 13,
 'G': 7,
 'I': 8,
 'T': 1,
 'A': 25,
 'E': 12,
 'N': 17,
 'M': 9,
 'P': 1,
 'C': 5,
 'H': 2,
 'Y': 4,
 'L': 2,
 'B': 5,
 'V': 2,
 'Q': 2}

In [76]:
def get_parameter_vectors():
    '''
    This function parses e.txt and s.txt to get the  26-dimensional multinomial
    parameter vector (characters probabilities of English and Spanish) as
    described in section 1.2 of the writeup

    Returns: tuple of vectors e and s
    '''
    #Implementing vectors e,s as lists (arrays) of length 26
    #with p[0] being the probability of 'A' and so on
    e=[0]*26
    s=[0]*26

    with open('e.txt',encoding='utf-8') as f:
        for line in f:
            #strip: removes the newline character
            #split: split the string on space character
            char,prob=line.strip().split(" ")
            #ord('E') gives the ASCII (integer) value of character 'E'
            #we then subtract it from 'A' to give array index
            #This way 'A' gets index 0 and 'Z' gets index 25.
            e[ord(char)-ord('A')]=float(prob)
    f.close()

    with open('s.txt',encoding='utf-8') as f:
        for line in f:
            char,prob=line.strip().split(" ")
            s[ord(char)-ord('A')]=float(prob)
    f.close()

    return (e,s)

In [105]:
import math
def x_log_lang(letter, file):
    letter_to_index = {
        'A': 0,
        'B': 1,
        'C': 2,
        'D': 3,
        'E': 4,
        'F': 5,
        'G': 6,
        'H': 7,
        'I': 8,
        'J': 9,
        'K': 10,
        'L': 11,
        'M': 12,
        'N': 13,
        '0': 14,
        'P': 15,
        'Q': 16,
        'R': 17,
        'S': 18,
        'T': 19,
        'U': 20,
        'V': 21,
        'W': 22,
        'X': 23,
        'Y': 24,
        'Z': 25
    }

    # Get parameters returns a list, not a dictionary
    letter_count_dictionary = shred(file)
    x_english_list, x_spanish_list = get_parameter_vectors()
    list_index = letter_to_index[letter]

    if letter in letter_count_dictionary.keys():
        # X_1 * log(e_1)
        english_prob = round(letter_count_dictionary[letter] * math.log(x_english_list[list_index]), 4)
        # X_1 * log(s_1)
        spanish_prob = round(letter_count_dictionary[letter] * math.log(x_spanish_list[list_index]), 4)
        return english_prob, spanish_prob

    else:
        return 0.0000, 0.0000

In [106]:
x_log_lang('C', 'samples/letter1.txt')

(-18.0018, -16.2576)

In [118]:
def function_y(language, file):
    all_letters = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', '0', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
    ]
    sum_letter_probs = 0
    if language == 'english':
        for letter in all_letters:
            english_prob, _ = x_log_lang(letter, file)
            sum_letter_probs += english_prob
        p_english = 0.6
        return round(math.log(p_english) + sum_letter_probs, 4)

    elif language == 'spanish':
        for letter in all_letters:
            _, spanish_prob = x_log_lang(letter, file)
            sum_letter_probs += spanish_prob
        p_spanish = 0.4
        return round(math.log(p_spanish) + sum_letter_probs, 4)

In [119]:
function_y('spanish', 'samples/letter1.txt')

-425.4119

In [122]:
def p_lang_given_x(file):
    f_spanish = function_y('spanish', file)
    f_english = function_y('english', file)

    if f_spanish - f_english >= 100:
        p_english_given_x = 0
    elif f_spanish - f_english <= -100:
        p_english_given_x = 1
    else:
        p_english_given_x = 1/(1 + math.exp(f_spanish - f_english))
    return round(p_english_given_x, 4)

In [132]:
fileName = 'letter.txt'
fileName = 'samples/letter1.txt'
p_lang_given_x(fileName)

0.0

In [134]:
fileName = 'samples/letter0.txt'

# Q1
all_letters = [
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', '0', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
    'W', 'X', 'Y', 'Z'
]
print('Q1')
letter_count_dictionary = shred(fileName)
for letter in all_letters:
    if letter in letter_count_dictionary.keys():
        count = letter_count_dictionary[letter]
    else:
        count = 0
    print(letter, ' ', count)

Q1/n
A   1
B   0
C   0
D   0
E   0
F   0
G   0
H   0
I   0
J   0
K   0
L   0
M   0
N   0
0   0
P   0
Q   0
R   0
S   0
T   0
U   0
V   0
W   0
X   0
Y   0
Z   0


In [140]:
# Q2
question_letter = 'A'
output = x_log_lang(question_letter, fileName)
print(round(output[0], 4))
print(round(output[1], 4))

-2.4836
-2.1066


In [141]:
# Q3
print('Q3')
f_english = function_y('english', fileName)
print(f_english)
f_spanish = function_y('spanish', fileName)
print(f_spanish)

Q3
-2.9944
-3.0229


In [142]:
# Q4
print('Q4')
print(p_lang_given_x(fileName))

Q4
0.5071


In [1]:
print(1, 2)

1 2


In [3]:
print(-0.0000)

-0.0
