# Data Analysis

In [1]:
import os
import sys

import pandas as pd

# Directory the notebook kernel is currently running from.
notebook_dir = os.getcwd()
# The helper modules are expected in the sibling `utils` directory (one level
# up, then into `utils`); append it to the import path so they can be imported
# by bare module name.
utils_dir = os.path.join(notebook_dir, '../utils')
sys.path.append(utils_dir)

from data_processing import DataProcessing

In [2]:
# Widen text columns so long sentences are displayed in full rather than truncated.
# The fully qualified option name is used on purpose: pandas resolves short-hand
# names such as 'max_colwidth' by pattern matching, which can break if a
# similarly named option is ever added.
pd.set_option('display.max_colwidth', 800)
# Optional: uncomment to also lift the column / row display limits.
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
# Accumulator for dataset statistics; later cells add entries to it.
collect_stats = dict()

In [4]:
# CSV file holding the dataset (gold-standard sheet, going by its file name),
# located in the data directory one level above the notebook.
file_name = "akuapem_dataset - gold_standard.csv"
path = os.path.join("../data/", file_name)

# Load through the project helper, then preview the first seven rows.
df = DataProcessing.load_data(path)
df.head(n=7)

Unnamed: 0.1,Unnamed: 0,Akuapem,Unnamed: 2,Akuapem.1,English
0,I belong here.,Me na mewɔ ha.,,Anadwo biara ɔfrɛ no,He calls her every night
1,,Me fata sɛ mewɔ ha,,,He calls him every night
2,,ha na me wɔ,,,She calls her every night
3,Kwaku sings quite well.,Kwaku to dwom yiye.,,,He calls her every night
4,,Kwaku nim nwom to,,"Anɔpa yi, ohyiaa no.",She met him this morning
5,He calls her up every night.,Anadwo biara ɔfrɛ no,,,She met her this morning
6,,ɔfrɛ no anadwo biara,,,He met her this morning


## Drop Columns + Rename Columns

In [5]:
# 'Unnamed: 2' shows no values in the preview rows, so it is removed via the
# project helper.
# NOTE(review): whether the helper mutates `df` in place or only returns the
# reduced frame is not visible here — confirm against DataProcessing.
cols_to_drop = ['Unnamed: 2']
DataProcessing.drop_data_from_df(df, cols_to_drop)

Unnamed: 0.1,Unnamed: 0,Akuapem,Akuapem.1,English
0,I belong here.,Me na mewɔ ha.,Anadwo biara ɔfrɛ no,He calls her every night
1,,Me fata sɛ mewɔ ha,,He calls him every night
2,,ha na me wɔ,,She calls her every night
3,Kwaku sings quite well.,Kwaku to dwom yiye.,,He calls her every night
4,,Kwaku nim nwom to,"Anɔpa yi, ohyiaa no.",She met him this morning
...,...,...,...,...
687,,S[n na [te,,
688,,Ne su te s[n?,,
689,I ran out of ideas.,Ná minhu nea menyɛ.,,
690,,Me nsusui[ asa,,


In [6]:
# Carve out the one-to-many mapping: the columns at positions 0 and 1
# (the English sentence and its Akuapem translations).
one_to_many_cols = [0, 1]
base_one_to_many_df = DataProcessing.split_df_mappings(df, one_to_many_cols)

# Replace the raw CSV headers with descriptive names that state the language
# and the role of each column.
cols_to_rename = {
    "Unnamed: 0": "English (Source, One)",
    "Akuapem": "Akan (Target, Many)",
}
DataProcessing.rename_df_cols(base_one_to_many_df, cols_to_rename)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=cols, inplace=True)


Unnamed: 0,"English (Source, One)","Akan (Target, Many)"
0,I belong here.,Me na mewɔ ha.
1,,Me fata sɛ mewɔ ha
2,,ha na me wɔ
3,Kwaku sings quite well.,Kwaku to dwom yiye.
4,,Kwaku nim nwom to
...,...,...
687,,S[n na [te
688,,Ne su te s[n?
689,I ran out of ideas.,Ná minhu nea menyɛ.
690,,Me nsusui[ asa


In [7]:
# Keep only the English source column (position 0) and drop the rows where it
# is NaN, leaving one row per distinct source sentence entry.
# The explicit .copy() makes this an independent frame: later cells assign new
# columns to it, and that must neither touch nor be mistaken for a write to a
# slice of `base_one_to_many_df`.
no_nan_one_to_many_df = base_one_to_many_df.iloc[:, [0]].dropna().copy()
no_nan_one_to_many_df

Unnamed: 0,"English (Source, One)"
0,I belong here.
3,Kwaku sings quite well.
5,He calls her up every night.
7,I don't have any money at all.
9,I love your dress.
...,...
677,Kwame isn't happy about what's happened.
680,I'm leaving.
683,Don't mess with me.
686,What's it like?


## Counts for English (Source, One)

In [None]:
# Record the number of non-NaN source sentences under the column's own name.
source_col_name = f'{no_nan_one_to_many_df.columns[0]}'
collect_stats[source_col_name] = len(no_nan_one_to_many_df)
collect_stats

{'English (Source, One)': 125}

In [9]:
# Raw values of the single-column frame as a 2-D NumPy array
# (one row per sentence, one element per row).
rows = no_nan_one_to_many_df.to_numpy()
rows

array([['I belong here.'],
       ['Kwaku sings quite well.'],
       ['He calls her up every night.'],
       ["I don't have any money at all."],
       ['I love your dress.'],
       ['This is what I can do for you.'],
       ['She met him this morning.'],
       ["I'd rather not go out today."],
       ['What time will you leave?'],
       ["Don't lose your temper."],
       ['Which is mine?'],
       ['We left at 2:30.'],
       ['She likes animals, you know?'],
       ["You haven't got much time."],
       ['Sooner or later, we all are going to die.'],
       ['Kwaku has next to nothing in his wallet.'],
       ["I'm Kwaku's boss."],
       ["It's their job."],
       ["That's your problem."],
       ['Kwaku recognized me.'],
       ['What language do they speak in Switzerland?'],
       ['Almost three thousand people died.'],
       ["We'll notify Kwaku."],
       ["I don't have time to be sick."],
       ["Her son's death broke Abena's heart."],
       ["That isn't Kwaku's bicyc

In [10]:
# Every element of `rows` is a one-item array holding a sentence; joining its
# items with a space collapses it into a plain Python string.
cleaned_entry = [' '.join(sentence) for sentence in rows]

# Write the flattened strings back into the source column and show the result.
no_nan_one_to_many_df['English (Source, One)'] = cleaned_entry
no_nan_one_to_many_df

Unnamed: 0,"English (Source, One)"
0,I belong here.
3,Kwaku sings quite well.
5,He calls her up every night.
7,I don't have any money at all.
9,I love your dress.
...,...
677,Kwame isn't happy about what's happened.
680,I'm leaving.
683,Don't mess with me.
686,What's it like?


In [11]:
# Show the flattened list of English source sentences.
cleaned_entry

['I belong here.',
 'Kwaku sings quite well.',
 'He calls her up every night.',
 "I don't have any money at all.",
 'I love your dress.',
 'This is what I can do for you.',
 'She met him this morning.',
 "I'd rather not go out today.",
 'What time will you leave?',
 "Don't lose your temper.",
 'Which is mine?',
 'We left at 2:30.',
 'She likes animals, you know?',
 "You haven't got much time.",
 'Sooner or later, we all are going to die.',
 'Kwaku has next to nothing in his wallet.',
 "I'm Kwaku's boss.",
 "It's their job.",
 "That's your problem.",
 'Kwaku recognized me.',
 'What language do they speak in Switzerland?',
 'Almost three thousand people died.',
 "We'll notify Kwaku.",
 "I don't have time to be sick.",
 "Her son's death broke Abena's heart.",
 "That isn't Kwaku's bicycle.",
 'No one showed up.',
 'She made friends with Asamoah at the party.',
 'Cain was evil.',
 "I've been sick for a very long time.",
 'Who told you the story?',
 "We'll lose everything.",
 'Shall I go on?

In [12]:
# Build the pivot view through the project helper: the source sentence column
# is passed on its own, followed by the list of columns to include.
# NOTE(review): the helper's exact parameter meaning is not visible here — confirm.
source_col = 'English (Source, One)'
pivot_cols = ["English (Source, One)", "Akan (Target, Many)"]
base_one_to_many_pt = DataProcessing.df_to_pivot_table(
    base_one_to_many_df,
    source_col,
    pivot_cols,
)
base_one_to_many_pt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_to_update] = DataProcessing.fill_na_with_value(df, col_to_update)


"English (Source, One)","Akan (Target, Many)"
Act like a man.,Di wo dwuma sɛ ɔbarima
Act like a man.,Ma mmarisɛm mmra wo mu
Act like a man.,Yɛ w'ade te sɛ onipa.
Act like a man.,Yɛ ɔkokoɔdurufoɔ
Almost three thousand people died.,Nnipa a wɔwui no bɛduru mpem mmiɛnsa
...,...
You haven't got much time.,Wo mmerɛ a aka sua
You haven't got much time.,Wonni mmre
You taught us that.,Nea wokyerɛɛ yɛn ara no no
You taught us that.,Saa ara na wokyerɛɛ yɛn


In [13]:
# Ask the project helper to drop NaN entries from the pivot table.
# NOTE(review): whether this mutates `base_one_to_many_pt` or only returns the
# cleaned table for display is not visible here — confirm against DataProcessing.
DataProcessing.drop_data_from_df(base_one_to_many_pt, dropna=True)

"English (Source, One)","Akan (Target, Many)"
Act like a man.,Di wo dwuma sɛ ɔbarima
Act like a man.,Ma mmarisɛm mmra wo mu
Act like a man.,Yɛ w'ade te sɛ onipa.
Act like a man.,Yɛ ɔkokoɔdurufoɔ
Almost three thousand people died.,Nnipa a wɔwui no bɛduru mpem mmiɛnsa
...,...
You haven't got much time.,Wo mmerɛ a aka sua
You haven't got much time.,Wonni mmre
You taught us that.,Nea wokyerɛɛ yɛn ara no no
You taught us that.,Saa ara na wokyerɛɛ yɛn


### NaN exists in Akan (Target, Many)

In [None]:
# Column at position 1 (the Akan target column) as a one-column frame, NaN rows included.
base_one_to_many_df.iloc[: , [1]]

In [None]:
# Same column at position 1, with the NaN rows dropped.
base_one_to_many_df.iloc[: , [1]].dropna()

### No NaN in English (Source, One)

In [None]:
# Column at position 0 (the English source column) as a one-column frame, NaN rows included.
base_one_to_many_df.iloc[: , [0]]

In [None]:
# Same column at position 0, with the NaN rows dropped.
base_one_to_many_df.iloc[: , [0]].dropna()

Word cloud

In [None]:
# Row counts for the two mapping columns, with and without NaN rows.
# Column 0 is "English (Source, One)" and column 1 is "Akan (Target, Many)";
# every key below names the column it actually measures.
stats = {}

# "--- N --- NaN": total number of rows, NaN entries included.
stats['English (Source, One) --- N --- NaN'] = len(base_one_to_many_df.iloc[:, [0]])
stats['Akan (Target, Many) --- N --- NaN'] = len(base_one_to_many_df.iloc[:, [1]])

# "--- N": number of rows left once NaN entries are dropped.
# (These two keys previously read "Akan (Source, One)" and
# "English (Target, Many)", which mislabelled the columns being counted.)
stats['English (Source, One) --- N'] = len(base_one_to_many_df.iloc[:, [0]].dropna())
stats['Akan (Target, Many) --- N'] = len(base_one_to_many_df.iloc[:, [1]].dropna())
stats

In [None]:
# Requires the `transformers` package:
# pip install -q transformers
from transformers import pipeline
# No model is named here, so whatever default the library selects for the
# "sentiment-analysis" task is used.
sentiment_pipeline = pipeline("sentiment-analysis")
# Example input shape: data = ["I love you", "I hate you"]
# Run the pipeline over every cleaned English source sentence in one call.
sentiments = sentiment_pipeline(cleaned_entry)
sentiments

In [None]:
# Attach the pipeline results as a new 'Sentiments' column. `sentiments` was
# computed from `cleaned_entry`, which was itself written into this frame's
# source column, so the entries are in row order.
no_nan_one_to_many_df['Sentiments'] = sentiments
no_nan_one_to_many_df

## POS + NER

In [None]:
import spacy

# Load the spaCy pipeline and run it on a fixed example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Print one line per token: text, lemma, coarse POS, fine-grained tag,
# dependency label, shape, and the is_alpha / is_stop flags.
for token in doc:
    token_attrs = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*token_attrs)

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Collect the POS tags sentence by sentence: `pos` gets exactly one entry per
# row of the frame, and that entry is the list of coarse POS tags of the row's
# sentence in token order. (Appending one entry per token instead would make
# `pos` longer than the frame, so it could not be stored as a column.)
pos = []
for sentence in no_nan_one_to_many_df['English (Source, One)']:
    doc = nlp(sentence)
    pos.append([token.pos_ for token in doc])

In [None]:
# Number of entries collected in `pos`.
len(pos)

In [None]:
# Store the collected POS data as a new 'POS' column.
# NOTE(review): assigning a plain list as a column needs exactly one entry per
# row of the frame — confirm len(pos) == len(no_nan_one_to_many_df).
no_nan_one_to_many_df['POS'] = pos
no_nan_one_to_many_df