In [1]:
import os.path

def clean_up(s):
    """ Return a lowercase copy of the string s with punctuation removed
    from its two ends.

    Only leading and trailing punctuation (plus newline, tab and carriage
    return) is removed; punctuation inside the word, such as the apostrophe
    in "don't", is kept. Spaces are not treated as punctuation. """

    strippable = '''!"',;:.-?)([]<>*#\n\t\r'''
    lowered = s.lower()
    return lowered.strip(strippable)

In [2]:
def words_in_a_dictionary(line_list):
    ''' Return a dictionary mapping every word in line_list to the number
    of times it occurs.

    line_list is a list of strings. Each string is split on whitespace and
    every token is passed through clean_up; tokens that are empty after
    cleaning (pure punctuation) are ignored. Keys appear in order of first
    occurrence.

    This is a helper function, not specified in the brief, that exists so
    the counting logic is not repeated in every feature function. '''

    counts = {}
    for line in line_list:
        for token in line.split():
            cleaned = clean_up(token)
            if cleaned == "":
                continue
            counts[cleaned] = counts.get(cleaned, 0) + 1
    return counts

In [3]:
def average_word_length(line_list):
    ''' Return the average length of all words in line_list.

    Surrounding punctuation is not counted as part of a word.
    line_list is a non-empty list of strings, and at least one of them
    contains a word (otherwise the division below fails). '''

    counts = words_in_a_dictionary(line_list)
    # Each distinct word contributes its length once per occurrence.
    total_characters = sum(len(word) * occurrences
                           for word, occurrences in counts.items())
    total_words = sum(counts.values())
    return total_characters / total_words

In [4]:
def unique_words_ratio(line_list):
    ''' Return the type token ratio (TTR) for line_list.

    TTR is the number of different words divided by the total number of
    words. line_list is a non-empty list of strings, and at least one of
    them contains a word. '''

    counts = words_in_a_dictionary(line_list)
    distinct_words = len(counts)
    total_words = sum(counts.values())
    return distinct_words / total_words

In [5]:
def hapax_legomana_ratio(line_list):
    ''' Return the hapax legomana ratio for line_list.

    The ratio is the number of words that occur exactly once divided by
    the total number of words. line_list is a list of strings, and at
    least one of them contains a word. '''

    counts = words_in_a_dictionary(line_list)
    occurrences = list(counts.values())
    words_seen_once = sum(1 for count in occurrences if count == 1)
    total_words = sum(occurrences)
    return words_seen_once / total_words

In [6]:
def most_frequent_word(word_dictionary):
    ''' Return the key of word_dictionary that has the greatest value.

    word_dictionary maps strings to strictly positive integers. When
    several keys share the greatest value, the one that comes first in the
    dictionary's iteration order is returned. An empty dictionary (or one
    without any positive value) yields the empty string.

    This is a helper function used to simplify most_frequent_n_words. '''

    leader = ("", 0)
    for candidate, occurrences in word_dictionary.items():
        # Strict comparison keeps the earliest key on ties.
        if occurrences > leader[1]:
            leader = (candidate, occurrences)
    return leader[0]

In [7]:
def most_frequent_n_words(line_list, n):
    ''' Print the n most frequent words in line_list with their counts.

    line_list is a list of strings and n is a non-negative integer. Words
    are printed from most to least frequent, one per line; words with the
    same count are printed in order of first appearance in line_list.
    If n is larger than the number of distinct words, every word is
    printed once and the function stops (no error is raised). A value of
    n that is zero or negative prints nothing.

    This function does not return anything. '''

    word_dictionary = words_in_a_dictionary(line_list)
    # sorted() is stable, also with reverse=True, so words with equal
    # counts keep their dictionary (first-seen) order. This gives the same
    # ranking as repeatedly extracting the maximum, in one pass.
    ranked = sorted(word_dictionary.items(),
                    key=lambda item: item[1], reverse=True)
    # max(n, 0) prevents a negative n from being read as a slice from the
    # end of the list.
    for word, occurrences in ranked[:max(n, 0)]:
        print("\"" + word + "\", appearing " + str(occurrences) + " times.")

In [8]:
def _word_positions_in_line(line, searched_word, separators):
    ''' Return the 1-based columns at which searched_word occurs in line.

    The line is cut into the same whitespace-separated tokens that
    words_in_a_dictionary sees; a token counts as an occurrence when
    clean_up turns it into searched_word. The reported column is that of
    the first character left after the leading separators are stripped. '''

    positions = []
    cursor = 0
    for token in line.split():
        # Tokens are returned in order and contain no whitespace, so the
        # first match at or after the cursor is this token's real start.
        start = line.index(token, cursor)
        cursor = start + len(token)
        if clean_up(token) == searched_word:
            leading = len(token) - len(token.lstrip(separators))
            positions.append(start + leading + 1)
    return positions


def word_detect(line_list, searched_word, m):
    ''' Print where the first m instances of searched_word occur in
    line_list, followed by a summary of how many were found.

    line_list is a list of strings (the lines of a file), searched_word is
    a cleaned-up word that occurs somewhere in line_list, and m is a
    strictly positive integer. A location is a line number and a column
    (called position here), both counted from 1 as a text editor shows
    them. For each line that contains the word, one output line lists the
    positions found on it, separated by commas.

    If m is greater than the number of instances that exist, all of them
    are reported. For example, with 3 instances of "abcd" and m >= 3 the
    summary says all 3 instances were found; with m equal to 1 or 2 it
    says the first instance or the first 2 instances were found.

    Raises KeyError if searched_word does not occur in line_list.
    This function does not return anything. '''

    # Same characters that clean_up strips from both ends of a token.
    separators = '''!"',;:.-?)([]<>*#\n\t\r'''
    total_instances = words_in_a_dictionary(line_list)[searched_word]
    found = 0
    for line_number, line in enumerate(line_list, start=1):
        if found >= m:
            break
        positions = _word_positions_in_line(line, searched_word, separators)
        if not positions:
            continue
        # Only report as many positions as are still needed to reach m.
        shown = positions[:m - found]
        print("\"" + searched_word + "\"" + " appears in line "
              + str(line_number) + " at position(s): ", end="")
        print(", ".join(str(position) for position in shown))
        found += len(shown)

    if total_instances > 1:
        if 1 < found < total_instances:
            print("You found the first " + str(found)
                  + " instances of this word.")
        elif found == 1:
            print("You found the first instance of this word.")
        elif found == total_instances:
            print("You found all " + str(found) + " instances of this word.")
        else:
            # Defensive fallback; only reachable when m is not positive.
            print("It should be impossible for there to be this many"
                  + " instances found.")
    else:
        if found == total_instances:
            print("You found the single instance of this word.")
        else:
            # Defensive fallback; only reachable when m is not positive.
            print("It should be impossible for there to be this many"
                  + " instances found.")

In [9]:
def split_text(original, separators):
    '''Split the string original on every character found in separators
    and return the resulting pieces as a list.

    separators is a string of single-character separators. Pieces that are
    empty or contain only whitespace are dropped; the pieces that are kept
    are returned unstripped, exactly as they appear in original.'''

    delimiters = set(separators)
    pieces = []
    piece_start = 0
    for index, character in enumerate(original):
        if character in delimiters:
            piece = original[piece_start:index]
            if piece.strip():
                pieces.append(piece)
            piece_start = index + 1
    # Whatever follows the last separator is a piece too.
    tail = original[piece_start:]
    if tail.strip():
        pieces.append(tail)
    return pieces

In [10]:
def sentence_list(text_from_file):
    ''' Return the list of sentences contained in text_from_file.

    text_from_file is a list of strings. The strings are first joined into
    one string, which is then split on the sentence-terminating characters
    "!?.". Every resulting piece is stripped, because a sentence is defined
    as a sequence of characters without whitespace on either end.

    This is a helper function, not specified in the brief, used to avoid
    repeating code. '''

    whole_text = "".join(text_from_file)
    return [piece.strip() for piece in split_text(whole_text, "!?.")]

In [11]:
def average_sentence_length(text_from_file):
    ''' Return the average number of words per sentence in text_from_file.

    text_from_file is a list of strings that is guaranteed to contain at
    least one sentence. Sentence-terminating punctuation is defined as !?.

    A sentence is a non-empty string of non-terminating punctuation
    surrounded by terminating punctuation or by the beginning or end of
    the file. '''

    sentences = sentence_list(text_from_file)
    # Count the words of all sentences together, then average them.
    counts = words_in_a_dictionary(sentences)
    total_words = sum(counts.values())
    return total_words / len(sentences)

In [12]:
def average_sentence_complexity(text_from_file):
    '''Return the average number of phrases per sentence.

    Terminating punctuation is defined as !?. A sentence is a non-empty
    string of non-terminating punctuation surrounded by terminating
    punctuation or by the beginning or end of the file.
    Phrases are the substrings of a sentence separated by one or more of
    the delimiters ,;: '''

    sentences = sentence_list(text_from_file)
    total_phrases = 0
    for sentence in sentences:
        total_phrases += len(split_text(sentence, ",:;"))
    return total_phrases / len(sentences)

In [13]:
def get_valid_filename(prompt):
    '''Use prompt (a string) to ask the user to type the name of a file. If
    the name does not refer to an existing file, keep asking until it does.
    Return the name of that file.

    Only input(prompt) is used for input and only the message
    "That file does not exist" is used for output.'''

    filename = input(prompt)
    # isfile rather than exists: a directory name exists on disk, but it
    # cannot be opened and read as a text file by the caller.
    while not os.path.isfile(filename):
        print("That file does not exist")
        filename = input(prompt)
    return filename


In [14]:
def read_directory_name(prompt):
    '''Use prompt (a string) to ask the user to type the name of a directory.
    If the directory does not exist, keep asking until they give a valid
    directory. Return the name of that directory.

    Only input(prompt) is used for input and only the message
    "That directory does not exist" is used for output.'''

    while True:
        candidate = input(prompt)
        if os.path.isdir(candidate):
            return candidate
        print("That directory does not exist")


In [15]:
def compare_signatures(sig1, sig2, weight):
    '''Return a non-negative real number that measures how similar two
    linguistic signatures are. The smaller the number, the more similar the
    signatures; zero means they are identical.

    sig1 and sig2 are 6 element lists with the following elements
    0  : author name (a string)
    1  : average word length (float)
    2  : TTR (float)
    3  : Hapax Legomana Ratio (float)
    4  : average sentence length (float)
    5  : average sentence complexity (float)
    weight is a list of multiplicative weights, one per linguistic
    feature. weight[0] is ignored.

    The result is the weighted sum of the absolute differences between
    features 1 to 5 of the two signatures.
    '''

    distance = 0
    feature = 1
    while feature < 6:
        difference = abs(sig1[feature] - sig2[feature])
        distance += difference * weight[feature]
        feature += 1
    return distance

In [16]:
def read_signature(filename):
    '''Read a linguistic signature from filename and return it as a
    list of features.

    The first line of the file is the author name. It is returned
    unchanged, including its trailing newline, so existing callers see the
    same value as before. Every following non-blank line is converted to a
    float; blank lines (for example a trailing empty line) are skipped.

    Raises ValueError if a non-blank feature line is not a number. '''

    # The with-block guarantees the file is closed, even if a line fails
    # to parse.
    with open(filename, 'r') as signature_file:
        # the first feature is a string so it doesn't need casting to float
        result = [signature_file.readline()]
        # all remaining features are real numbers
        for line in signature_file:
            value = line.strip()
            if value:
                result.append(float(value))
    return result

In [17]:
if __name__ == '__main__':

    prompt = 'enter the name of the file with unknown author (options are mystery1.txt, mystery2.txt,...,mystery5.txt): '
    mystery_filename = get_valid_filename(prompt)

    # readlines gives us a list of strings, one for each line of the file.
    # The with-block closes the file as soon as it has been read.
    with open(mystery_filename, 'r') as mystery_file:
        text = mystery_file.readlines()

    # calculate the signature for the mystery file
    mystery_signature = [mystery_filename]
    mystery_signature.append(average_word_length(text))
    mystery_signature.append(unique_words_ratio(text))
    mystery_signature.append(hapax_legomana_ratio(text))
    mystery_signature.append(average_sentence_length(text))
    mystery_signature.append(average_sentence_complexity(text))
    weights = [0, 11, 33, 50, 0.4, 4]

    # Print the signature of the mystery text file for verification.
    print(mystery_signature)
    print()

    # Word counts of the mystery file, used both to cap the number of
    # frequent words shown and to answer the word searches below.
    word_dict = words_in_a_dictionary(text)

    # Ask how many of the most frequent words to display. The loop keeps
    # asking until the answer is an integer that is not negative; input
    # that is not a number is treated as invalid instead of crashing.
    nr = None
    nr_prompt = ("Enter how many most frequent words you want displayed."
                 + " Enter only a natural positive number: ")
    while nr is None:
        try:
            nr = int(input(nr_prompt))
        except ValueError:
            nr = None
        else:
            if nr < 0:
                nr = None
        nr_prompt = "Invalid input. Enter only a positive natural number: "
    # Never ask for more words than the file contains.
    most_frequent_n_words(text, min(nr, len(word_dict)))
    print()

    # Ask whether the user wants to search for a word, accepting only
    # "yes" or "no". While the answer is "yes", read a word, clean it up
    # and, if it occurs in the file, report how many instances exist, ask
    # how many of them to locate (at least 1, possibly more than exist)
    # and let word_detect print their locations. If the word does not
    # occur in the file, say so. Then ask whether to search again.
    command = input("Do you want to search a word? Enter only 'yes' or 'no': ")
    while command != "yes" and command != "no":
        command = input("Invalid input. Enter only 'yes' or 'no': ")
    while command == "yes":
        word_to_search = input("Which word do you want to search?: ")
        cleaned_up_searched_word = clean_up(word_to_search)
        if cleaned_up_searched_word in word_dict:
            maximum = word_dict[cleaned_up_searched_word]
            print("There are " + str(maximum) + " instances of this word in this file. "
                  + "Even if you try to find a larger number of words, you'll only find "
                  + str(maximum) + ".")
            num = None
            num_prompt = ("How many instances of this word do you want to search?"
                          + " Enter only a strictly positive integer: ")
            while num is None:
                try:
                    num = int(input(num_prompt))
                except ValueError:
                    num = None
                else:
                    if num < 1:
                        num = None
                num_prompt = "Invalid input. Enter only a strictly positive integer: "
            word_detect(text, cleaned_up_searched_word, num)
        else:
            print("There are no instances of this word in the file.")
        command = input("Search another word? Enter only 'yes' or 'no': ")
        while command != "yes" and command != "no":
            command = input("Invalid input. Enter only 'yes' or 'no': ")
    print()

    prompt = 'enter the path to the directory of signature files: '
    signature_dir = read_directory_name(prompt)
    # every file in this directory must be a linguistic signature;
    # sub-directories are skipped and the names are sorted so that the
    # comparison order does not depend on the operating system.
    files = sorted(name for name in os.listdir(signature_dir)
                   if os.path.isfile(os.path.join(signature_dir, name)))

    if not files:
        print("No signature files were found in that directory.")
    else:
        # Keep the signature with the lowest score; on equal scores the
        # first file wins.
        best_score = None
        best_author = None
        for this_file in files:
            signature = read_signature(os.path.join(signature_dir, this_file))
            score = compare_signatures(mystery_signature, signature, weights)
            if best_score is None or score < best_score:
                best_score = score
                best_author = signature[0]
        # The author name is read with its trailing newline, so strip it
        # and put an explicit space before "with".
        print("best author match: %s with score %s"
              % (best_author.strip(), best_score))

enter the name of the file with unknown author (options are mystery1.txt, mystery2.txt,...,mystery5.txt): mystery1.txt
['mystery1.txt', 4.425098689749363, 0.06561231298838174, 0.0367069740756216, 15.050060481994976, 2.3668930864427282]

Enter how many most frequent words you want displayed. Enter only a natural positive number: 50
"the", appearing 5288 times.
"to", appearing 5216 times.
"and", appearing 4602 times.
"of", appearing 4377 times.
"a", appearing 3115 times.
"i", appearing 2892 times.
"her", appearing 2373 times.
"it", appearing 2372 times.
"was", appearing 2370 times.
"she", appearing 2214 times.
"in", appearing 2194 times.
"not", appearing 2124 times.
"be", appearing 1980 times.
"you", appearing 1956 times.
"that", appearing 1752 times.
"he", appearing 1732 times.
"had", appearing 1612 times.
"as", appearing 1438 times.
"for", appearing 1322 times.
"have", appearing 1320 times.
"with", appearing 1248 times.
"is", appearing 1247 times.
"but", appearing 1206 times.
"very", a