# Predictions

In [1]:
import re

def extract_data(log_file_path):
    """Collect reference/hypothesis sentence pairs from a log file.

    Every line containing 'Text Reference' is paired with the line directly
    after it. From both lines only the text following the first colon+tab
    separator is kept, with surrounding whitespace stripped. The last line
    of the file is never treated as a reference because it has no successor.

    Args:
        log_file_path: Path of the log file to read.

    Returns:
        A (references, hypotheses) tuple of two equally long lists of strings.

    Raises:
        IndexError: If a matched line, or the line after it, lacks the
            colon+tab separator.
    """
    def after_separator(line):
        # Keep only what follows the first colon+tab, minus outer whitespace.
        return line.split(':\t', 1)[1].strip()

    with open(log_file_path, 'r') as handle:
        log_lines = handle.readlines()

    references, hypotheses = [], []

    # Walk consecutive (line, next line) pairs.
    for current, following in zip(log_lines, log_lines[1:]):
        if 'Text Reference' not in current:
            continue
        ref_text = after_separator(current)
        hyp_text = after_separator(following)
        references.append(ref_text)
        hypotheses.append(hyp_text)

    return references, hypotheses

In [2]:
def clean_text(text):
    """Drop every character that is not an ASCII letter, a digit or whitespace.

    Leading and trailing whitespace is trimmed from the result; whitespace
    inside the text is left as it is.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    stripped_of_symbols = disallowed.sub('', text)
    return stripped_of_symbols.strip()

In [3]:
def analyze_predictions(references, hypotheses):
    """Tally word-level matches between paired reference and hypothesis sentences.

    Sentences are paired positionally (surplus items of the longer input are
    ignored) and split on whitespace. Tokens are compared verbatim, so
    punctuation and case are significant.

    Args:
        references: Iterable of reference sentences (strings).
        hypotheses: Iterable of hypothesis sentences (strings).

    Returns:
        A 4-tuple (exact, loose, incorrect, sen_length):
            exact: dict token -> times the token sits at the same index in
                both sentences of a pair.
            loose: dict token -> reference occurrences found anywhere in the
                paired hypothesis.
            incorrect: dict token -> reference occurrences absent from the
                paired hypothesis.
            sen_length: list holding, for each pair with at least one matched
                word, the word count of the reference after clean_text.
    """
    exact_correct_predictions = {}
    loose_correct_predictions = {}
    incorrect_predictions = {}
    sen_length = []

    for reference, hypothesis in zip(references, hypotheses):
        ref_words = reference.split()
        hyp_words = hypothesis.split()
        hyp_vocab = set(hyp_words)  # one set serves both the hit and the miss branch
        matched_any = False

        # Loose matching: each reference token is either somewhere in the
        # hypothesis or missing from it.
        for word in ref_words:
            if word in hyp_vocab:
                loose_correct_predictions[word] = loose_correct_predictions.get(word, 0) + 1
                matched_any = True
            else:
                incorrect_predictions[word] = incorrect_predictions.get(word, 0) + 1

        # Exact matching: same token at the same index. zip stops at the
        # shorter sentence. A positional match is necessarily a loose match
        # too, so matched_any is already set whenever this branch fires.
        for ref_word, hyp_word in zip(ref_words, hyp_words):
            if ref_word == hyp_word:
                exact_correct_predictions[ref_word] = exact_correct_predictions.get(ref_word, 0) + 1

        if matched_any:
            sen_length.append(len(clean_text(reference).split()))

    return exact_correct_predictions, loose_correct_predictions, incorrect_predictions, sen_length

In [4]:
# Parse the log once, then derive every per-word tally from the extracted pairs.
log_file_path = '2d_kd_3.5,0.5.log'
references, hypotheses = extract_data(log_file_path)
(
    exact_correct_predictions,
    loose_correct_predictions,
    incorrect_predictions,
    length,
) = analyze_predictions(references, hypotheses)

In [5]:
# Show how many reference and hypothesis sentences were extracted from the log.
print(len(references), len(hypotheses))

135 135


In [6]:
# Display the reference sentences extracted from the log.
references

['*** i   am sorry.',
 '*** *** good morning!',
 'you are so slow!',
 'i   am  not tired.',
 '*** i   am  heartbroken.',
 'i like you very  much.',
 '* ** my dog   died.',
 'i am ** ***** scared.',
 '* is it new   year?',
 'i am ** ***** worried.',
 'this is not  hard.',
 'my head is   painful.',
 'i am   sorry.',
 'i am   tired.',
 'how old are  you?',
 'i am ** ***** scared.',
 '* good morning!',
 'i am fine.',
 'my dog died.',
 'you are disgusting!',
 'i ** like you.',
 'this is very hard.',
 'i am tired.',
 'i do not like you.',
 '* john likes mary.',
 'i saw a  ghost.',
 'i am shocked!',
 'you are disgusting!',
 'how old are you?',
 'i am worried.',
 'i am so tired.',
 'i am alone.',
 'i am ** ***** sorry.',
 'i am shocked!',
 'the trip is exciting.',
 'you are so slow!',
 'i hate you!',
 'i like you very much.',
 'this is hard.',
 'does john like mary?',
 'i am sorry.',
 'how are you?',
 'you are slow.',
 'you are sick.',
 'my dog died.',
 '* ** good morning!',
 'i saw a   ghost.

In [7]:
# Display the hypothesis sentences extracted from the log.
hypotheses

['you are so slow!',
 'you are not  slow.',
 'you are so slow!',
 'you are not slow.',
 'you are not slow.',
 'i am   12  years old.',
 'i am 12 years old.',
 'i am 12 years old.',
 'i am 12 years old.',
 'i am 12 years old.',
 '**** i  like you.',
 '** i    like you.',
 'i like you.',
 'i like you.',
 '*** i   like you.',
 'i am 12 years old.',
 'i am   fine.',
 'i am fine.',
 'i  am  alone.',
 'i   am  alone.',
 'i am so   tired.',
 '**** i  am   shocked!',
 'i am alone.',
 'i ** *** am   alone.',
 'i am   so    tired.',
 'i *** am fine.',
 'i am fine.',
 'i   am  heartbroken.',
 '*** i   am  fine.',
 'i am fine.',
 'i am ** heartbroken.',
 'i am shocked!',
 'i am 12 years old.',
 'i am shocked!',
 '*** i    am shocked!',
 '*** i   am alone.',
 'i am   alone.',
 'i **** *** am   alone.',
 'i    am alone.',
 '**** i    am   alone.',
 'i am fine.',
 'i   am  nervous.',
 'i   am  nervous.',
 'i   am  nervous.',
 'i  am  fine.',
 'i am not  tired.',
 'i am  not tired.',
 'i am    heartbr

In [8]:
# List each token with the number of times it matched at the same position.
print("Exact Correct Predictions:")
for word in exact_correct_predictions:
    count = exact_correct_predictions[word]
    print(f"{word}: {count}")

Exact Correct Predictions:
you: 1
are: 1
so: 1
slow!: 1
not: 1
i: 60
am: 39
fine.: 1
shocked!: 1
proud: 1
of: 1
you!: 1
worried.: 1
12: 1
years: 1
old.: 1


In [9]:
# List each token with the number of times it was found anywhere in the hypothesis.
print("Loose Correct Predictions (word found anywhere in the sentence):")
for word in loose_correct_predictions:
    count = loose_correct_predictions[word]
    print(f"{word}: {count}")

Loose Correct Predictions (word found anywhere in the sentence):
you: 1
are: 1
so: 1
slow!: 1
not: 1
i: 60
am: 39
fine.: 1
shocked!: 1
proud: 1
of: 1
you!: 1
worried.: 1
12: 1
years: 1
old.: 1


In [10]:
# List each reference token with the number of times it was missing from the hypothesis.
print("\nIncorrect Predictions:")
for word in incorrect_predictions:
    count = incorrect_predictions[word]
    print(f"{word}: {count}")


Incorrect Predictions:
***: 4
i: 3
am: 5
sorry.: 6
good: 8
morning!: 5
tired.: 10
heartbroken.: 1
like: 11
you: 27
very: 7
much.: 3
*: 35
**: 43
my: 8
dog: 4
died.: 4
*****: 14
scared.: 5
is: 15
it: 1
new: 5
year?: 1
worried.: 3
this: 7
not: 11
hard.: 7
head: 4
painful.: 4
how: 8
old: 6
are: 32
you?: 8
disgusting!: 7
you.: 8
do: 2
john: 5
likes: 3
mary.: 3
saw: 4
a: 4
ghost.: 4
shocked!: 3
so: 8
alone.: 3
the: 5
trip: 2
exciting.: 2
slow!: 4
hate: 4
you!: 6
does: 2
mary?: 2
slow.: 7
sick.: 3
thank: 2
failed: 2
exam.: 2
our: 1
team: 1
won!: 1
noon!: 3
happy: 4
year!: 4
fine.: 1
proud: 2
of: 2
nervous.: 4
sick!: 3
12: 1
years: 1
old.: 1
show: 1
amazing.: 1


In [20]:
# For every token matched at least once in the exact position, show its exact
# matches against all of its scored occurrences (exact matches + misses).
print("Exact Correct Predictions / (Correct + Incorrect) for Each Word:")

# Iterate the dict itself instead of set(keys()): the keys are already unique,
# and dict iteration follows insertion order, so the listing comes out in the
# same order on every kernel run (the order of a set of strings can change
# between runs because of hash randomization).
for word, correct_count in exact_correct_predictions.items():
    incorrect_count = incorrect_predictions.get(word, 0)

    print(f"{word}: {correct_count} / {correct_count + incorrect_count}")

Exact Correct Predictions / (Correct + Incorrect) for Each Word:
worried.: 1 / 4
years: 1 / 2
proud: 1 / 3
you!: 1 / 7
slow!: 1 / 5
i: 60 / 63
fine.: 1 / 2
old.: 1 / 2
of: 1 / 3
you: 1 / 28
are: 1 / 33
am: 39 / 44
so: 1 / 9
shocked!: 1 / 4
12: 1 / 2
not: 1 / 12


# Accuracy by Sentence Length (Shorter vs. Longer)

In [11]:
from collections import defaultdict

def count_sentences_by_word_count(references):
    """Build a histogram of sentence lengths, measured in words.

    Each text is treated as a single sentence: it is passed through
    clean_text and split on whitespace. Texts with no words left after
    cleaning are skipped.

    Args:
        references: Iterable of sentence strings.

    Returns:
        A plain dict mapping word count -> number of texts with that many
        words, with keys in first-seen order.
    """
    word_count_dict = defaultdict(int)

    for text in references:
        # clean_text removes every '.', so splitting the cleaned text on '.'
        # could never produce more than one piece; count the text as a whole.
        words = clean_text(text).split()
        if words:  # Only count non-empty sentences
            word_count_dict[len(words)] += 1

    return dict(word_count_dict)

In [12]:
# Histogram of reference sentence lengths: {word count: number of references}.
result = count_sentences_by_word_count(references)
result

{3: 72, 2: 10, 4: 41, 5: 12}

In [13]:
# Word counts (fourth value returned by analyze_predictions) of the references
# that had at least one word matched in their hypothesis.
length

[4,
 4,
 5,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 5,
 4,
 3,
 3,
 4,
 3,
 3,
 3,
 3,
 5,
 3,
 4,
 4,
 3,
 3,
 4,
 3,
 3,
 3,
 4,
 4,
 4,
 3,
 5,
 3,
 4,
 3,
 3,
 5,
 3,
 3,
 5,
 3,
 5,
 3,
 5,
 4,
 3,
 3,
 3,
 3,
 3,
 4,
 3,
 5,
 3,
 3,
 3,
 4,
 5,
 3]

In [14]:
def compare_lengths(lengths, word_count_dict):
    """Print, per word count, how many entries of lengths fall in that bucket.

    For each key of word_count_dict, in ascending order, one line is printed
    giving the number of values in lengths equal to that key next to the
    total stored under the key. Values in lengths that are not keys of
    word_count_dict are not reported.

    Args:
        lengths: Iterable of word counts.
        word_count_dict: Mapping of word count -> total number of sentences.

    Returns:
        None; the report is written to standard output.
    """
    # Occurrences of each value in lengths.
    tally = {}
    for n_words in lengths:
        tally[n_words] = tally.get(n_words, 0) + 1

    for word_count in sorted(word_count_dict):
        num_sentences = word_count_dict[word_count]
        count_in_lengths = tally.get(word_count, 0)
        print(f'There are {count_in_lengths} out of {num_sentences} that are correct for {word_count}-worded sentences.')

In [15]:
# Per sentence length, report how many references had at least one matched
# word out of all references of that length.
compare_lengths(length, result)

There are 0 out of 10 that are correct for 2-worded sentences.
There are 38 out of 72 that are correct for 3-worded sentences.
There are 14 out of 41 that are correct for 4-worded sentences.
There are 10 out of 12 that are correct for 5-worded sentences.
