# Data Analysis

In [1]:
import os
import sys

import pandas as pd

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../utils'))

from data_processing import DataProcessing

In [2]:
pd.set_option('max_colwidth', 800)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
collect_stats = {}

In [4]:
file_name = "akuapem_dataset - gold_standard.csv"
path = os.path.join("../data/", file_name)

df = DataProcessing.load_data(path)
df.head(7)

Unnamed: 0.1,Unnamed: 0,Akuapem,Unnamed: 2,Akuapem.1,English
0,I belong here.,Me na mewɔ ha.,,Anadwo biara ɔfrɛ no,He calls her every night
1,,Me fata sɛ mewɔ ha,,,He calls him every night
2,,ha na me wɔ,,,She calls her every night
3,Kwaku sings quite well.,Kwaku to dwom yiye.,,,He calls her every night
4,,Kwaku nim nwom to,,"Anɔpa yi, ohyiaa no.",She met him this morning
5,He calls her up every night.,Anadwo biara ɔfrɛ no,,,She met her this morning
6,,ɔfrɛ no anadwo biara,,,He met her this morning


## Drop Columns + Rename Columns

In [5]:
cols_to_drop = ['Unnamed: 2']
DataProcessing.drop_data_from_df(df, cols_to_drop)

Unnamed: 0.1,Unnamed: 0,Akuapem,Akuapem.1,English
0,I belong here.,Me na mewɔ ha.,Anadwo biara ɔfrɛ no,He calls her every night
1,,Me fata sɛ mewɔ ha,,He calls him every night
2,,ha na me wɔ,,She calls her every night
3,Kwaku sings quite well.,Kwaku to dwom yiye.,,He calls her every night
4,,Kwaku nim nwom to,"Anɔpa yi, ohyiaa no.",She met him this morning
...,...,...,...,...
687,,S[n na [te,,
688,,Ne su te s[n?,,
689,I ran out of ideas.,Ná minhu nea menyɛ.,,
690,,Me nsusui[ asa,,


## Akan (Many) : English (One)

In [6]:
akan_source_many = "Akan (Source, Many)"
eng_target_one = "English (Target, One)"
cols_to_rename = {
    "Akuapem": akan_source_many,
    "Unnamed: 0": eng_target_one
    }
many_to_one_cols = [1, 0]
base_akan_to_eng_df = DataProcessing.split_df_mappings(df, many_to_one_cols)

DataProcessing.rename_df_cols(base_akan_to_eng_df, cols_to_rename)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=cols, inplace=True)


Unnamed: 0,"Akan (Source, Many)","English (Target, One)"
0,Me na mewɔ ha.,I belong here.
1,Me fata sɛ mewɔ ha,
2,ha na me wɔ,
3,Kwaku to dwom yiye.,Kwaku sings quite well.
4,Kwaku nim nwom to,
...,...,...
687,S[n na [te,
688,Ne su te s[n?,
689,Ná minhu nea menyɛ.,I ran out of ideas.
690,Me nsusui[ asa,


### Base Code

1. Counts
2. Unique words
3. Sentiment

In [7]:
def counts_from_mappings_df(df: pd.DataFrame, col_name: str, stats: dict):
    no_nan_df = df.loc[: , [col_name]].dropna()
    stats[col_name] = len(no_nan_df)

    return stats

In [8]:
def get_unique_words(df: pd.DataFrame, col_name: str, stats: dict):

    unique = set()
    
    sentences_df = df[col_name].dropna()
    words = sentences_df.str.lower().str.findall("\w+")
    
    for x in words:
        unique.update(x)
    
    print(unique.__len__())
    stats[f"#words in {col_name}"] = unique.__len__()

In [9]:
from transformers import pipeline
import pandas as pd

# Load sentiment analysis pipeline
load_sentiment_pipeline = pipeline("sentiment-analysis")

def get_sentiment(df: pd.DataFrame, col_name: str, sentiment_pipeline) -> pd.DataFrame:
    non_na_df = df[[col_name]].dropna().copy()
    sentences = non_na_df[col_name].tolist()
    
    sentiments = []
    for sentence in sentences:
        result = sentiment_pipeline(sentence)[0]
        sentiments.append(result['label'])
    
    result_df = pd.DataFrame({
        'sentence': sentences,
        'sentiment': sentiments
    })
    
    return result_df

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [10]:
def get_sentiment_label_counts(df: pd.DataFrame, col_name: str, label: str, stats: dict):
    labels_with_counts = df[col_name].value_counts().to_dict()

    stats[f"{col_name}-{label}"] = labels_with_counts

### Counts for English (One) : Akan (Many):

1. [x] English (Source, One)
2. [x] Akan (Target, Many)

In [11]:
counts_from_mappings_df(base_akan_to_eng_df, akan_source_many, collect_stats)
counts_from_mappings_df(base_akan_to_eng_df, eng_target_one, collect_stats)

{'Akan (Source, Many)': 399, 'English (Target, One)': 125}

### Unique Words

In [12]:
get_unique_words(base_akan_to_eng_df, akan_source_many, collect_stats)
get_unique_words(base_akan_to_eng_df, eng_target_one, collect_stats)

612
308


In [13]:
collect_stats

{'Akan (Source, Many)': 399,
 'English (Target, One)': 125,
 '#words in Akan (Source, Many)': 612,
 '#words in English (Target, One)': 308}

### Sentiment

In [14]:
akan_source_sentiment_df = get_sentiment(base_akan_to_eng_df, akan_source_many, load_sentiment_pipeline)
akan_source_sentiment_df

Unnamed: 0,sentence,sentiment
0,Me na mewɔ ha.,NEGATIVE
1,Me fata sɛ mewɔ ha,NEGATIVE
2,ha na me wɔ,NEGATIVE
3,Kwaku to dwom yiye.,POSITIVE
4,Kwaku nim nwom to,NEGATIVE
...,...,...
394,S[n na [te,NEGATIVE
395,Ne su te s[n?,NEGATIVE
396,Ná minhu nea menyɛ.,NEGATIVE
397,Me nsusui[ asa,NEGATIVE


In [15]:
collect_stats

{'Akan (Source, Many)': 399,
 'English (Target, One)': 125,
 '#words in Akan (Source, Many)': 612,
 '#words in English (Target, One)': 308}

In [16]:
eng_target_sentiment_df = get_sentiment(base_akan_to_eng_df, eng_target_one, load_sentiment_pipeline)
eng_target_sentiment_df

Unnamed: 0,sentence,sentiment
0,I belong here.,NEGATIVE
1,Kwaku sings quite well.,POSITIVE
2,He calls her up every night.,POSITIVE
3,I don't have any money at all.,NEGATIVE
4,I love your dress.,POSITIVE
...,...,...
120,Kwame isn't happy about what's happened.,NEGATIVE
121,I'm leaving.,POSITIVE
122,Don't mess with me.,POSITIVE
123,What's it like?,POSITIVE


In [17]:
get_sentiment_label_counts(akan_source_sentiment_df, 'sentiment', akan_source_many, collect_stats)
get_sentiment_label_counts(eng_target_sentiment_df, 'sentiment', eng_target_one, collect_stats)

In [18]:
collect_stats

{'Akan (Source, Many)': 399,
 'English (Target, One)': 125,
 '#words in Akan (Source, Many)': 612,
 '#words in English (Target, One)': 308,
 'sentiment-Akan (Source, Many)': {'NEGATIVE': 391, 'POSITIVE': 8},
 'sentiment-English (Target, One)': {'NEGATIVE': 63, 'POSITIVE': 62}}

## Akan (One): English (Many)

In [19]:
akan_source_one = "Akan (Source, One)"
eng_target_many = "English (Target, Many)"
cols_to_rename = {"Akuapem ": akan_source_one, 
                  "English": eng_target_many
                  }
many_to_one_cols = [2, 3]
base_akan_english_df = DataProcessing.split_df_mappings(df, many_to_one_cols)

DataProcessing.rename_df_cols(base_akan_english_df , cols_to_rename)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=cols, inplace=True)


Unnamed: 0,"Akan (Source, One)","English (Target, Many)"
0,Anadwo biara ɔfrɛ no,He calls her every night
1,,He calls him every night
2,,She calls her every night
3,,He calls her every night
4,"Anɔpa yi, ohyiaa no.",She met him this morning
...,...,...
687,,
688,,
689,,
690,,


### Counts for Akan (One): English (Many)

1. Akan (Source, One)
2. English (Target, Many)

In [20]:
counts_from_mappings_df(base_akan_english_df, akan_source_one, collect_stats)
counts_from_mappings_df(base_akan_english_df, eng_target_many, collect_stats)

{'Akan (Source, Many)': 399,
 'English (Target, One)': 125,
 '#words in Akan (Source, Many)': 612,
 '#words in English (Target, One)': 308,
 'sentiment-Akan (Source, Many)': {'NEGATIVE': 391, 'POSITIVE': 8},
 'sentiment-English (Target, One)': {'NEGATIVE': 63, 'POSITIVE': 62},
 'Akan (Source, One)': 132,
 'English (Target, Many)': 463}

### Unique Words

In [21]:
get_unique_words(base_akan_english_df, akan_source_one, collect_stats)
get_unique_words(base_akan_english_df, eng_target_many, collect_stats)

424
743


In [22]:
collect_stats

{'Akan (Source, Many)': 399,
 'English (Target, One)': 125,
 '#words in Akan (Source, Many)': 612,
 '#words in English (Target, One)': 308,
 'sentiment-Akan (Source, Many)': {'NEGATIVE': 391, 'POSITIVE': 8},
 'sentiment-English (Target, One)': {'NEGATIVE': 63, 'POSITIVE': 62},
 'Akan (Source, One)': 132,
 'English (Target, Many)': 463,
 '#words in Akan (Source, One)': 424,
 '#words in English (Target, Many)': 743}

### Sentiment

In [23]:
akan_source_sentiment_df = get_sentiment(base_akan_english_df, akan_source_one, load_sentiment_pipeline)
akan_source_sentiment_df

Unnamed: 0,sentence,sentiment
0,Anadwo biara ɔfrɛ no,NEGATIVE
1,"Anɔpa yi, ohyiaa no.",NEGATIVE
2,Merenyɛ saa nnɛ.,NEGATIVE
3,Nea ɛwɔ he na meyɛ?,NEGATIVE
4,Misua nnwom.,NEGATIVE
...,...,...
127,Mmea ani gye ho saa.,NEGATIVE
128,M'ani gye nhoma akenkan ho.,NEGATIVE
129,Ɔwɔ suban pa ankasa.,NEGATIVE
130,"Nokwarem no, osu bɛtɔ.",NEGATIVE


In [24]:
eng_target_sentiment_df = get_sentiment(base_akan_english_df, eng_target_many, load_sentiment_pipeline)
eng_target_sentiment_df

Unnamed: 0,sentence,sentiment
0,He calls her every night,POSITIVE
1,He calls him every night,POSITIVE
2,She calls her every night,POSITIVE
3,He calls her every night,POSITIVE
4,She met him this morning,POSITIVE
...,...,...
458,It is definitely going to rain,POSITIVE
459,Rain is surely on the way,POSITIVE
460,We've got to find a new babysitter.,NEGATIVE
461,We need to look for another babysitter,NEGATIVE


In [25]:
get_sentiment_label_counts(akan_source_sentiment_df, 'sentiment', akan_source_one, collect_stats)
get_sentiment_label_counts(eng_target_sentiment_df, 'sentiment', eng_target_many, collect_stats)

In [26]:
collect_stats

{'Akan (Source, Many)': 399,
 'English (Target, One)': 125,
 '#words in Akan (Source, Many)': 612,
 '#words in English (Target, One)': 308,
 'sentiment-Akan (Source, Many)': {'NEGATIVE': 391, 'POSITIVE': 8},
 'sentiment-English (Target, One)': {'NEGATIVE': 63, 'POSITIVE': 62},
 'Akan (Source, One)': 132,
 'English (Target, Many)': 463,
 '#words in Akan (Source, One)': 424,
 '#words in English (Target, Many)': 743,
 'sentiment-Akan (Source, One)': {'NEGATIVE': 130, 'POSITIVE': 2},
 'sentiment-English (Target, Many)': {'POSITIVE': 238, 'NEGATIVE': 225}}