Description and information about the project go here

I have at the moment done all in this jupyter notebook. We can also save these functions as regular python files and make the final product a command line version, but I think this is easiest for now.

In [11]:
# Imports
import pandas as pd
import numpy as np
import openpyxl
import xlsxwriter
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import string

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package stopwords to /home/hyyppa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/hyyppa/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /home/hyyppa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/hyyppa/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [8]:
# This block is for saving the desired data into excel format


# Extracts all line numbers of lines in the specified topic. The topic_number argument must be given as a string. For example: '9'
# Topic number 9 is politics
def save_topic_lines(path_to_topic_file, topic_number):

    topic_lines = []

    with open(path_to_topic_file, 'r') as file:
        
        i = 1

        for line in file:
            if line[0] == topic_number:
                topic_lines.append(i)
            
            i = i + 1

    return topic_lines


# Extracts all dialogue lines from a specific topic
def extract_topic(path_to_dialogue_file, path_to_topic_file, topic_number):

    topic_lines = save_topic_lines(path_to_topic_file, topic_number)
    topic_dialogue = []

    with open(path_to_dialogue_file, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file):
            if line_number in topic_lines:
                topic_dialogue.append(line)

    return topic_dialogue


# Creates a pandas dataframe for the dialogue data in a specific topic
# Rows are dialogue lines. They are in the same order as in the original dialogues_text.txt file
# Columns are utterances in that dialogue.
def create_topic_dataframe(path_to_dialogue_file, path_to_topic_file, topic_number):

    topic_dialogue = extract_topic(path_to_dialogue_file, path_to_topic_file, topic_number)
    split_dialogue = [line.split('__eou__') for line in topic_dialogue]
    topic_dialogue_data = pd.DataFrame(split_dialogue)

    return topic_dialogue_data


# Saves the dataframe in excel format
# This is just for not having to write the annoying file format
def save_dataframe_as_excel(data, filename):

    if '.xlsx' not in filename:
        filename = filename + '.xlsx'

    data.to_excel(filename, header=False, index=False)


# Does everything above. Extracts the topic, makes it into a dataframe and saves in excel format
def extract_and_save_topic_dialogue(path_to_dialogue_file, path_to_topic_file, topic_number, filename):

    topic_dialogue_data = create_topic_dataframe(path_to_dialogue_file, path_to_topic_file, topic_number)
    save_dataframe_as_excel(topic_dialogue_data, filename)


# For testing purposes

#politics = save_topic_lines('ijcnlp_dailydialog/dialogues_topic.txt', '9')
#print(politics)

#politics_dialogues = extract_topic('ijcnlp_dailydialog/dialogues_text.txt', 'ijcnlp_dailydialog/dialogues_topic.txt', '9')
#print(politics_dialogues)

#politics_dialogue_data = create_topic_dataframe('ijcnlp_dailydialog/dialogues_text.txt', 'ijcnlp_dailydialog/dialogues_topic.txt', '9')
#print(politics_dialogue_data)

#save_dataframe_as_excel(politics_dialogue_data, 'testdata.xlsx')

#extract_and_save_topic_dialogue('ijcnlp_dailydialog/dialogues_text.txt', 'ijcnlp_dailydialog/dialogues_topic.txt', '9', 'politics_dialogue_data')

In [22]:
# This block is for calculating stats for the topic data


# Takes into dataframe and concatenates everything in it to be a single string. This is for tokenization and such
def form_dialogue_string(dataframe):

    dialogue_string = ''
    rows, columns = dataframe.shape

    for row in range(rows):
        for column in range(columns):
            if type(dataframe.iat[row, column]) is str:
                dialogue_string = dialogue_string + dataframe.iat[row, column]

    return dialogue_string


# Remove punctuation, lowercase and tokenize
# There still remains things like "sure.it" and "t", remove
def preprocess_dialogue(dialogue):

    stop = set(list(string.punctuation))

    tokenized = word_tokenize(dialogue.lower())
    processed_dialogue = [word for word in tokenized if word not in stop]

    return processed_dialogue


# Calculates the vocabulary size for a dataframe
def vocabulary_size(dataframe):
    
    dialogue_string = form_dialogue_string(dataframe)
    processed_dialogue = preprocess_dialogue(dialogue_string)
    unique_tokens = set(processed_dialogue)
    vocabulary_size = len(unique_tokens)

    # For testing
    #print(processed_dialogue)
    #print(len(processed_dialogue))
    #print(processed_dialogue)
    #print(unique_tokens)

    return vocabulary_size


# Calculates the number of utterances for a dataframe
def count_utterances(dataframe):

    num_of_utterances = 0
    rows, columns = dataframe.shape

    for row in range(rows):
        for column in range(columns):
            if type(dataframe.iat[row, column]) is str:
                num_of_utterances = num_of_utterances + 1

    return num_of_utterances


# Count average tokens per utterance from a dataframe
def count_avg_tokens_per_utterance(dataframe):

    num_of_utterances = count_utterances(dataframe)

    dialogue_string = form_dialogue_string(dataframe)
    processed_dialogue = preprocess_dialogue(dialogue_string)

    avg_tokens_per_utterance = len(processed_dialogue) / num_of_utterances

    return avg_tokens_per_utterance


# Uses NLTK part of speech tagger to identify pronouns, counts the number of pronouns and then the average per utterance
def avg_pronouns_per_utterance(dataframe):

    dialogue_string = form_dialogue_string(dataframe)
    processed_dialogue = preprocess_dialogue(dialogue_string)

    tagged_dialogue = pos_tag(processed_dialogue)
    
    pronoun_count = 0

    for (token, prp_tag) in tagged_dialogue:
        if prp_tag == ('PRP' or 'PRP$'):
            pronoun_count = pronoun_count + 1

    num_of_utterances = count_utterances(dataframe)
    avg_prp = pronoun_count / num_of_utterances

    return avg_prp


# Didn't find any clear resource for agreement or negation wording.
# There is nltk.metrics.agreement, but it is not for counting agreement words
# There is also the option to try and find negation/agreement related words through wordnet, but it would also find words that are not specifially negation/agreement words
# The custom list of agreement/negation words is subject to change
# choice = 1 counts average number of agreement words
# choice = 2 does the same for negation words
def avg_agreement_negation_per_utterance(dataframe, choice):

    agreement_words = ['yes', 'ok', 'sure', 'okay', 'agreed', 'agree']
    negation_words = ['no', 'not', "don't", "can't", 'neither', ]

    if choice == 1:
        words_to_count = agreement_words
    elif choice == 2:
        words_to_count = negation_words
    else:
        print("Second argument: 1 for agreement words, 2 for negation words")
        return 0

    dialogue_string = form_dialogue_string(dataframe)
    processed_dialogue = preprocess_dialogue(dialogue_string)
    num_of_utterances = count_utterances(dataframe)

    num_words_to_count = 0

    for word in processed_dialogue:
        if word in words_to_count:
            num_words_to_count = num_words_to_count + 1
    
    avg_agreement = num_words_to_count / num_of_utterances

    return avg_agreement

# Prints all stats for a given topic
def create_stats_table(path_to_dialogue_file, path_to_topic_file, topic_number):

    topic_data = create_topic_dataframe(path_to_dialogue_file, path_to_topic_file, topic_number)

    vocab = vocabulary_size(topic_data)
    utterances = count_utterances(topic_data)
    tokens_per_utterance = count_avg_tokens_per_utterance(topic_data)
    avg_prp = avg_pronouns_per_utterance(topic_data)
    avg_agreement = avg_agreement_negation_per_utterance(topic_data, 1)
    avg_negation = avg_agreement_negation_per_utterance(topic_data, 2)

    print("Stats for topic number " + topic_number + ":")
    print("Size of vocabulary: " + str(vocab))
    print("Number of utterances: " + str(utterances))
    print("Average number of tokens per utterance: " + str(tokens_per_utterance))
    print("Average number of pronouns per utterance: " + str(avg_prp))
    print("Average number of agreement words per utterance: " + str(avg_agreement))
    print("Average number of negation words per utterance: " + str(avg_negation))

    

# For testing purposes

#politics_dialogue_data = create_topic_dataframe('ijcnlp_dailydialog/dialogues_text.txt', 'ijcnlp_dailydialog/dialogues_topic.txt', '9')

#dialogue_string = form_dialogue_string(politics_dialogue_data)
#print(dialogue_string)

#vocab = vocabulary_size(politics_dialogue_data)
#print("Size of vocabulary: " + str(vocab))

#utterances = count_utterances(politics_dialogue_data)
#print("Number of utterances: " + str(utterances))

#avg_tokens = count_avg_tokens_per_utterance(politics_dialogue_data)
#print("Average number of tokens per utterance: " + str(avg_tokens))

#politics_prp_avg = avg_pronouns_per_utterance(politics_dialogue_data)
#print("Average number of pronouns per utterance: " + str(politics_prp_avg))

#politics_agreement_avg = avg_agreement_negation_per_utterance(politics_dialogue_data, 2)
#print("Average number of agreement/negation words per utterance: " + str(politics_agreement_avg))

create_stats_table('ijcnlp_dailydialog/dialogues_text.txt', 'ijcnlp_dailydialog/dialogues_topic.txt', '9')

Stats for topic number 9:
Size of vocabulary: 1436
Number of utterances: 1635
Average number of tokens per utterance: 7.856269113149847
Average number of pronouns per utterance: 0.746177370030581
Average number of agreement words per utterance: 0.0782874617737003
Average number of negation words per utterance: 0.0672782874617737
