# Data Analysis

In [1]:
import os
import sys

import pandas as pd

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../utils'))

from data_processing import DataProcessing

In [2]:
pd.set_option('max_colwidth', 800)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
collect_stats = {}

In [4]:
file_name = "akuapem_dataset - gold_standard.csv"
path = os.path.join("../data/", file_name)

df = DataProcessing.load_data(path)
df.head(7)

Unnamed: 0.1,Unnamed: 0,Akuapem,Unnamed: 2,Akuapem.1,English
0,I belong here.,Me na mewɔ ha.,,Anadwo biara ɔfrɛ no,He calls her every night
1,,Me fata sɛ mewɔ ha,,,He calls him every night
2,,ha na me wɔ,,,She calls her every night
3,Kwaku sings quite well.,Kwaku to dwom yiye.,,,He calls her every night
4,,Kwaku nim nwom to,,"Anɔpa yi, ohyiaa no.",She met him this morning
5,He calls her up every night.,Anadwo biara ɔfrɛ no,,,She met her this morning
6,,ɔfrɛ no anadwo biara,,,He met her this morning


## Drop Columns + Rename Columns

In [5]:
cols_to_drop = ['Unnamed: 2']
DataProcessing.drop_data_from_df(df, cols_to_drop)

Unnamed: 0.1,Unnamed: 0,Akuapem,Akuapem.1,English
0,I belong here.,Me na mewɔ ha.,Anadwo biara ɔfrɛ no,He calls her every night
1,,Me fata sɛ mewɔ ha,,He calls him every night
2,,ha na me wɔ,,She calls her every night
3,Kwaku sings quite well.,Kwaku to dwom yiye.,,He calls her every night
4,,Kwaku nim nwom to,"Anɔpa yi, ohyiaa no.",She met him this morning
...,...,...,...,...
687,,S[n na [te,,
688,,Ne su te s[n?,,
689,I ran out of ideas.,Ná minhu nea menyɛ.,,
690,,Me nsusui[ asa,,


## English (One) : Akan (Many)

In [6]:
eng_source_one = "English (Source, One)"
akan_target_many = "Akan (Target, Many)"
cols_to_rename = {"Unnamed: 0": eng_source_one, 
                  "Akuapem": akan_target_many
                  }
one_to_many_cols = [0, 1]
base_eng_to_akan_df = DataProcessing.split_df_mappings(df, one_to_many_cols)

DataProcessing.rename_df_cols(base_eng_to_akan_df, cols_to_rename)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=cols, inplace=True)


Unnamed: 0,"English (Source, One)","Akan (Target, Many)"
0,I belong here.,Me na mewɔ ha.
1,,Me fata sɛ mewɔ ha
2,,ha na me wɔ
3,Kwaku sings quite well.,Kwaku to dwom yiye.
4,,Kwaku nim nwom to
...,...,...
687,,S[n na [te
688,,Ne su te s[n?
689,I ran out of ideas.,Ná minhu nea menyɛ.
690,,Me nsusui[ asa


### Counts for English (One) : Akan (Many):

1. [x] English (Source, One)
2. [x] Akan (Target, Many)

In [7]:
def counts_from_mappings_df(df: pd.DataFrame, col_name: str, stats: dict):
    no_nan_df = df.loc[: , [col_name]].dropna()
    stats[col_name] = len(no_nan_df)

    return stats

counts_from_mappings_df(base_eng_to_akan_df, eng_source_one, collect_stats)
counts_from_mappings_df(base_eng_to_akan_df, akan_target_many, collect_stats)

{'English (Source, One)': 125, 'Akan (Target, Many)': 399}

## Akan (One): English (Many)

In [8]:
akan_source_one = "Akan (Source, One)"
eng_target_many = "English (Target, Many)"
cols_to_rename = {"Akuapem ": akan_source_one, 
                  "English": eng_target_many
                  }
many_to_one_cols = [2, 3]
base_akan_english_df = DataProcessing.split_df_mappings(df, many_to_one_cols)

DataProcessing.rename_df_cols(base_akan_english_df , cols_to_rename)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=cols, inplace=True)


Unnamed: 0,"Akan (Source, One)","English (Target, Many)"
0,Anadwo biara ɔfrɛ no,He calls her every night
1,,He calls him every night
2,,She calls her every night
3,,He calls her every night
4,"Anɔpa yi, ohyiaa no.",She met him this morning
...,...,...
687,,
688,,
689,,
690,,


### Counts for Akan (One): English (Many)

1. Akan (Source, One)
2. English (Target, Many)

In [9]:
counts_from_mappings_df(base_akan_english_df, akan_source_one, collect_stats)
counts_from_mappings_df(base_akan_english_df, eng_target_many, collect_stats)

{'English (Source, One)': 125,
 'Akan (Target, Many)': 399,
 'Akan (Source, One)': 132,
 'English (Target, Many)': 463}