# Data Analysis

In [1]:
import os
import sys

import pandas as pd

# Current working directory of the kernel process running this notebook.
notebook_dir = os.getcwd()
# Make the sibling ``utils`` directory (one level up, then into ``utils``)
# importable so the project-local import below resolves.
utils_dir = os.path.join(notebook_dir, '../utils')
sys.path.append(utils_dir)

from data_processing import DataProcessing

In [2]:
# Allow up to 800 characters per cell before pandas truncates displayed text.
# The fully qualified option name is used because abbreviated names are
# resolved by pattern matching and can become ambiguous if pandas adds a
# similarly named option.
pd.set_option('display.max_colwidth', 800)
# Optional toggles: uncomment to show every column / every row of a frame.
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
# Empty container for statistics; nothing is stored in it in this part of the notebook.
collect_stats = dict()

In [4]:
# Location of the Excel workbook, relative to the notebook's working directory.
data_dir = "../data/"
file_name = "akuapem_with_tags_dataset-verified_data.xlsx"
path = os.path.join(data_dir, file_name)

In [5]:
# Display labels for the two text columns of the many-to-one sheet.
# NOTE(review): the variable names say "source"/"target" the other way round
# from the labels they hold; the names are kept because other cells may use them.
eng_source_one = "English (Target, One)"
akan_target_many = "Akan (Source, Many)"
cols_to_rename = {
    "English": eng_source_one,
    "Akuapem Twi": akan_target_many,
}

# Read the "M-1_tags" sheet and relabel its text columns in a single chain,
# so the frame is never left in a half-renamed state.
many_to_one_df = (
    pd.read_excel(path, sheet_name="M-1_tags")
    .rename(columns=cols_to_rename)
)
many_to_one_df

Unnamed: 0,AUD_SIZE,STATUS,AGE,FORMALITY,GENDER,GENDER.1,ANIMACY,SPEECH_ACT,"Akan (Source, Many)","English (Target, One)"
0,INDIVIDUAL,EQUAL,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Me na mewɔ ha.,I belong here.
1,INDIVIDUAL,EQUAL,PEER,FORMAL,NEUTRAL,NEUTRAL,ANIMATE,ANSWER,Me fata sɛ mewɔ ha.,I belong here.
2,INDIVIDUAL,EQUAL,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Ha na me wɔ.,I belong here.
3,INDIVIDUAL,,PEER,FORMAL,MASCULINE,,ANIMATE,STATEMENT,Kwaku to dwom yiye.,Kwaku sings quite well.
4,INDIVIDUAL,,PEER,FORMAL,MASCULINE,,ANIMATE,STATEMENT,Kwaku nim nwom to.,Kwaku sings quite well.
...,...,...,...,...,...,...,...,...,...,...
395,INDIVIDUAL,,ELDER,FORMAL,NEUTRAL,NEUTRAL,ANIMATE,QUESTION,Ne su te sɛn?,What's it like?
396,INDIVIDUAL,,PEER,FORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Na minhu nea menyɛ.,I ran out of ideas.
397,INDIVIDUAL,,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Me nsusuiɛ asa.,I ran out of ideas.
398,INDIVIDUAL,,PEER,FORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Menni adwene biara bio.,I ran out of ideas.


In [6]:
# Name of the column holding each sentence's speech-act tag
# (i.e. what the utterance is meant to make the listener do).
SPEECH_ACT_COL_NAME = 'SPEECH_ACT'
# Distinct tags, in order of first appearance in the many-to-one sheet.
speech_act_entries = many_to_one_df.loc[:, SPEECH_ACT_COL_NAME].unique()
speech_act_entries

array(['STATEMENT', 'ANSWER', 'QUESTION', 'COMMAND', 'REQUEST'],
      dtype=object)

In [7]:
def separate_by_entry(df, col_name, entry_idx):
    """Return the rows of ``df`` whose ``col_name`` value is one distinct entry.

    The entry is chosen by position among the column's distinct values, which
    ``Series.unique()`` reports in order of first appearance. The same position
    can therefore refer to different values in different frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    col_name : str
        Column whose distinct values are enumerated.
    entry_idx : int
        Position of the wanted value within ``df[col_name].unique()``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, with their original index labels.

    Raises
    ------
    IndexError
        If ``entry_idx`` is outside the range of distinct values.
    """
    entry_name = df[col_name].unique()[entry_idx]
    if pd.isna(entry_name):
        # A missing value never compares equal to anything (itself included),
        # so ``==`` would always select zero rows; match missing cells instead.
        row_mask = df[col_name].isna()
    else:
        row_mask = df[col_name] == entry_name
    return df[row_mask]

In [8]:
# Rows tagged with the first distinct speech-act value (position 0).
many_to_one_statement_df = separate_by_entry(
    df=many_to_one_df,
    col_name=SPEECH_ACT_COL_NAME,
    entry_idx=0,
)
many_to_one_statement_df

Unnamed: 0,AUD_SIZE,STATUS,AGE,FORMALITY,GENDER,GENDER.1,ANIMACY,SPEECH_ACT,"Akan (Source, Many)","English (Target, One)"
0,INDIVIDUAL,EQUAL,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Me na mewɔ ha.,I belong here.
2,INDIVIDUAL,EQUAL,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Ha na me wɔ.,I belong here.
3,INDIVIDUAL,,PEER,FORMAL,MASCULINE,,ANIMATE,STATEMENT,Kwaku to dwom yiye.,Kwaku sings quite well.
4,INDIVIDUAL,,PEER,FORMAL,MASCULINE,,ANIMATE,STATEMENT,Kwaku nim nwom to.,Kwaku sings quite well.
5,INDIVIDUAL,,PEER,INFORMAL,NEUTRAL,FEMININE,ANIMATE,STATEMENT,Anadwo biara ɔfrɛ no.,He calls her up every night.
...,...,...,...,...,...,...,...,...,...,...
390,INDIVIDUAL,,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Wo ne me nnyɛ basaa.,Don't mess with me.
396,INDIVIDUAL,,PEER,FORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Na minhu nea menyɛ.,I ran out of ideas.
397,INDIVIDUAL,,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Me nsusuiɛ asa.,I ran out of ideas.
398,INDIVIDUAL,,PEER,FORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Menni adwene biara bio.,I ran out of ideas.


In [9]:
# Remaining speech-act subsets, taken by position (1..4) among the column's
# distinct values in the many-to-one sheet.
(
    many_to_one_answer_df,
    many_to_one_question_df,
    many_to_one_command_df,
    many_to_one_request_df,
) = (
    separate_by_entry(many_to_one_df, SPEECH_ACT_COL_NAME, position)
    for position in range(1, 5)
)

# Row count per speech act, keyed by the tag's text. The subsets are listed
# in the same positional order as ``speech_act_entries``.
many_to_one_subsets = (
    many_to_one_statement_df,
    many_to_one_answer_df,
    many_to_one_question_df,
    many_to_one_command_df,
    many_to_one_request_df,
)
many_to_one_collect_stats = {
    f'{entry}': len(subset)
    for entry, subset in zip(speech_act_entries, many_to_one_subsets)
}
many_to_one_collect_stats

{'STATEMENT': 300, 'ANSWER': 1, 'QUESTION': 71, 'COMMAND': 27, 'REQUEST': 1}

In [10]:
# Display labels for the two text columns of the one-to-many sheet.
# NOTE(review): these names are rebound from the earlier many-to-one cell and
# now hold "(Target, Many)" / "(Source, One)" labels, which the variable names
# no longer describe; the names are kept because other cells may use them.
eng_source_one = "English (Target, Many)"
akan_target_many = "Akan (Source, One)"
cols_to_rename = {
    "English": eng_source_one,
    "Akuapem Twi": akan_target_many,
}

# Read the "1-M_tags" sheet and relabel its text columns in a single chain.
one_to_many_df = (
    pd.read_excel(path, sheet_name="1-M_tags")
    .rename(columns=cols_to_rename)
)
one_to_many_df

Unnamed: 0,AUD_SIZE,STATUS,AGE,FORMALITY,GENDER,GENDER_2,ANIMACY,SPEECH_ACT,"Akan (Source, One)","English (Target, Many)"
0,INDIVIDUAL,,PEER,INFORMAL,MASCULINE,FEMININE,ANIMATE,STATEMENT,Anadwo biara ɔfrɛ no.,He calls her every night.
1,INDIVIDUAL,,PEER,INFORMAL,MASCULINE,MASCULINE,ANIMATE,STATEMENT,Anadwo biara ɔfrɛ no.,He calls him every night.
2,INDIVIDUAL,,PEER,INFORMAL,FEMININE,FEMININE,ANIMATE,STATEMENT,Anadwo biara ɔfrɛ no.,She calls her every night.
3,INDIVIDUAL,,PEER,INFORMAL,MASCULINE,FEMININE,ANIMATE,STATEMENT,Anadwo biara ɔfrɛ no.,He calls her every night.
4,INDIVIDUAL,,PEER,INFORMAL,FEMININE,MASCULINE,ANIMATE,STATEMENT,"Anɔpa yi, ohyiaa no.",She met him this morning.
...,...,...,...,...,...,...,...,...,...,...
458,INDIVIDUAL,,PEER,FORMAL,NEUTRAL,NEUTRAL,INANIMATE,STATEMENT,"Nokwarem no, osu bɛtɔ.",It is definitely going to rain.
459,INDIVIDUAL,,PEER,FORMAL,NEUTRAL,NEUTRAL,INANIMATE,STATEMENT,"Nokwarem no, osu bɛtɔ.",Rain is surely on the way.
460,SMALL GROUP,,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Ɛsɛ sɛ yehu akokoaa foforo bi a wɔawo no foforo.,We've got to find a new babysitter.
461,SMALL GROUP,,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Ɛsɛ sɛ yehu akokoaa foforo bi a wɔawo no foforo.,We need to look for another babysitter.


In [None]:
# Split the one-to-many sheet by speech-act *label* rather than by position.
# ``speech_act_entries`` was built from the many-to-one sheet, and distinct
# values are enumerated in order of first appearance, so position i in this
# sheet is not guaranteed to be the same tag as ``speech_act_entries[i]``.
# Filtering on the label keeps each frame, its variable name and its key in
# the stats dict in agreement (and avoids an IndexError when this sheet has
# fewer distinct tags).
one_to_many_speech_acts = one_to_many_df[SPEECH_ACT_COL_NAME]
one_to_many_statement_df = one_to_many_df[one_to_many_speech_acts == speech_act_entries[0]]
one_to_many_answer_df = one_to_many_df[one_to_many_speech_acts == speech_act_entries[1]]
one_to_many_question_df = one_to_many_df[one_to_many_speech_acts == speech_act_entries[2]]
one_to_many_command_df = one_to_many_df[one_to_many_speech_acts == speech_act_entries[3]]
one_to_many_request_df = one_to_many_df[one_to_many_speech_acts == speech_act_entries[4]]

# Row count per speech act, keyed by the tag's text.
one_to_many_collect_stats = {
    f'{speech_act_entries[0]}': len(one_to_many_statement_df),
    f'{speech_act_entries[1]}': len(one_to_many_answer_df),
    f'{speech_act_entries[2]}': len(one_to_many_question_df),
    f'{speech_act_entries[3]}': len(one_to_many_command_df),
    f'{speech_act_entries[4]}': len(one_to_many_request_df)
}
one_to_many_collect_stats

{'STATEMENT': 374, 'ANSWER': 3, 'QUESTION': 54, 'COMMAND': 22, 'REQUEST': 10}