In [1]:
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")


In [9]:
import pandas as pd
import numpy as np
import math
import random
from IPython.display import display
from itertools import combinations
from copy import deepcopy
from pprint import pprint as pp

#import nltk
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter


pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 11)
pd.set_option('display.width', 230)

In [71]:
def extract_class_definition_from_first_paragraph(text):
    """ 
    Extract the first paragraph from the text as a string.
    It is also removed from the text.
    
    This will need to be rewritten as a class later so that it
    does not have to return the text.
    
    Actually, after analysing the word counts in each first paragraph,
    I found that the most common word in the first paragraph is always
    the class definition. The problem is that for transcript2 the class
    is a phrase (fast food), not a word. I don't have time to do the 
    phrase logic right now so I am just going to pass the name of the
    class manually.
    
    """
    
    first_paragraph = str()
    
    # iterate through every line of the text
    for position, line in enumerate(text):

        # if the line contains only space we stop 
        if not line:                
            text = text[position::]
            break

        # otherwise we pop out the line and add it to the first paragraph
        else:
            first_paragraph=first_paragraph+line
            
    # return {class_name: first_paragraph}, text
    return first_paragraph, text

def extract_feature_definitions_from_remaining_text(text):
    important_words = {}
        
    # iterate through every line of the text
    for position, line in enumerate(text):
                
        # if the previous 2 lines contain only space we have reached a paragraph heading
        if not text[position-2] and not text[position-1]:
            # drop the space at the start and the end of the heading
            heading = line[1:-1]
            important_words[heading] = str()

        # otherwise we add the line to the first paragraph string
        elif line:
            important_words[heading] += line
        
    return important_words



    
def extract_important_words(input_text):
    """
    Given a text in the format of those supplied with the i2x brainteaser, this method: 
    1. extracts the first paragraph as a general definition of the text's subject (henceforth referred to as the 'class).
    2. takes each following paragraph-title as a subclass of the class.  (The subclass is really a feature of the class 
       but each subclass has words as features as well so we will separate the naming to avoid confusion.)
    3. takes each paragraph as a definition of its heading/feature
    """
    
    # Copy the text so we do not alter the original
    text = input_text.copy()
   
    first_paragraph, remaining_text = extract_class_definition_from_first_paragraph(text)
    important_words = extract_feature_definitions_from_remaining_text(remaining_text)
    
    # Get rid of the references, etc.
    for key in ['References', 'Notes', 'External links', 'Further reading', 'See also']:
        if key in important_words.keys():
            important_words.pop(key, None)

    return first_paragraph, important_words



def get_word_counts(paragraph):
    lexicon = create_lexicon(paragraph)
    word_counts = count_word_occurrences(lexicon)
    
    return word_counts

def extract_class_definition_from_text(class_name, filepath):
    
    text = import_text_and_split_on_spaces(filepath)
    
    class_definition, features = extract_important_words(text)
    class_definition = get_word_counts(class_definition)
    class_definition.rename(class_name, inplace=True)
    
    list_of_feature_series = []
    for feature, definition in features.items():
        feature_series = get_word_counts(definition)
        feature_series.rename(feature, inplace=True)
        list_of_feature_series.append(pd.DataFrame(feature_series))
        
    features_df = pd.concat(list_of_feature_series)
    
    features_df.sort_values('Allergies', inplace=True, ascending=False)
    
    return class_definition, features_df


def import_text_and_split_on_spaces(filepath):
    file_object = open(filepath, mode='r')
    # Import the text as a string
    text = file_object.read()
    # Split it into a list where each element is a line in string format
    text = text.splitlines()
    
    return text


def create_lexicon(paragraph):
    
    stop_words = set(stopwords.words("english"))
    lemmitizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    
    lexicon=[]
    
    all_words = word_tokenize(paragraph)
    
    for i in all_words:
        i = i.lower()
        if i not in stop_words:
            if i.isalnum():
                lexicon.append(i)

    lexicon = [lemmitizer.lemmatize(i) for i in lexicon]
    
    return lexicon


def count_word_occurrences(lexicon):
    
    w_counts = Counter(lexicon)
    
    l2 = dict(w_counts)
        
    l2 = pd.Series(l2)
    
    l2=l2/l2.sum()

    l2.sort_values(inplace=True, ascending=False)
        
    return l2


def create_lexicon_from_full_text(file_name, n):
    
    text = import_text_and_split_on_spaces(file_name)
    lexicon = []
    for line in text:
        lexicon.extend(create_lexicon(line))    

    w_counts = Counter(lexicon)
    
    l2 = dict(w_counts)
        
    l2 = pd.Series(l2)
    
    l2 = l2/l2.sum()

    l2.sort_values(inplace=True, ascending=False)
    
    #name = remove_file_extension(file_name, ".txt")
    #l2.rename(name, inplace=True)
    
    return l2.head(n)


def compare_text_similarity(definition, comparison_definition):
    
    prob = definition.multiply(comparison_definition)
    
    prob.sort_values(inplace=True, ascending=False)

    score = prob.sum()/definition.size
    
    return definition.name, score

def main():
    
    training_text_filepath = 'script1.txt'
    class_name = 'food'
    
    comparison_texts = ['transcript_1.txt', 'transcript_2.txt', 'transcript_3.txt']
    
    for comparison_text_filepath in comparison_texts:
        class_definition, class_features = extract_class_definition_from_text(class_name, training_text_filepath)

        #print('\n class_features')
        #print(class_features)

        text_definition = create_lexicon_from_full_text(comparison_text_filepath, 100)


        class_name, main_score = compare_text_similarity(class_definition, text_definition)

        results = {}
        for col in class_features.columns:
            sub_class, score = compare_text_similarity(class_features[col], text_definition)
            results[sub_class] = score

        print('The general similarity score for ' + comparison_text_filepath + ' with the class ' + class_name + ' is:', main_score)
        results = pd.Series(results)
        results.sort_values(inplace=True, ascending=False)
        print('\nWith respect to this definition, ' + comparison_text_filepath + ' is about:')
        print(results)
        print('\n-----------------------------------------------------------------------------\n')
        

    
if __name__=="__main__":
    main()

The general similarity score for transcript_1.txt with the class food is: 6.04338627732e-05

With respect to this definition, transcript_1.txt is about:
Food aid                                  2.708047e-06
Presentation                              2.460048e-06
Cultural and religious diets              2.107887e-06
Food manufacturing                        1.963722e-06
Food preparation                          1.935222e-06
Food sources                              1.825721e-06
Legal definition                          1.725496e-06
Restaurants                               1.563462e-06
Cuisine                                   1.559394e-06
Marketing and retailing                   1.536521e-06
Sour                                      1.532887e-06
International food imports and exports    1.529466e-06
Famine and hunger                         1.508493e-06
Safety                                    1.334623e-06
Production                                1.300811e-06
                      