# Data Analysis

In [1]:
import os
import sys

import pandas as pd

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../utils'))

from data_processing import DataProcessing

In [2]:
pd.set_option('max_colwidth', 800)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
collect_stats = {}

In [4]:
file_name = "akuapem_dataset - gold_standard.csv"
path = os.path.join("../data/", file_name)

df = DataProcessing.load_data(path)
df.head(7)

Unnamed: 0.1,Unnamed: 0,Akuapem,Unnamed: 2,Akuapem.1,English
0,I belong here.,Me na mewɔ ha.,,Anadwo biara ɔfrɛ no,He calls her every night
1,,Me fata sɛ mewɔ ha,,,He calls him every night
2,,ha na me wɔ,,,She calls her every night
3,Kwaku sings quite well.,Kwaku to dwom yiye.,,,He calls her every night
4,,Kwaku nim nwom to,,"Anɔpa yi, ohyiaa no.",She met him this morning
5,He calls her up every night.,Anadwo biara ɔfrɛ no,,,She met her this morning
6,,ɔfrɛ no anadwo biara,,,He met her this morning


## Drop Columns + Rename Columns

In [5]:
cols_to_drop = ['Unnamed: 2']
DataProcessing.drop_data_from_df(df, cols_to_drop)

Unnamed: 0.1,Unnamed: 0,Akuapem,Akuapem.1,English
0,I belong here.,Me na mewɔ ha.,Anadwo biara ɔfrɛ no,He calls her every night
1,,Me fata sɛ mewɔ ha,,He calls him every night
2,,ha na me wɔ,,She calls her every night
3,Kwaku sings quite well.,Kwaku to dwom yiye.,,He calls her every night
4,,Kwaku nim nwom to,"Anɔpa yi, ohyiaa no.",She met him this morning
...,...,...,...,...
687,,S[n na [te,,
688,,Ne su te s[n?,,
689,I ran out of ideas.,Ná minhu nea menyɛ.,,
690,,Me nsusui[ asa,,


In [6]:
cols_to_rename = {"Unnamed: 0": "English (Source, One)", 
                  "Akuapem": "Akan (Target, Many)"
                  }
one_to_many_cols = [0, 1]
base_one_to_many_df = DataProcessing.split_df_mappings(df, one_to_many_cols)

DataProcessing.rename_df_cols(base_one_to_many_df, cols_to_rename)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=cols, inplace=True)


Unnamed: 0,"English (Source, One)","Akan (Target, Many)"
0,I belong here.,Me na mewɔ ha.
1,,Me fata sɛ mewɔ ha
2,,ha na me wɔ
3,Kwaku sings quite well.,Kwaku to dwom yiye.
4,,Kwaku nim nwom to
...,...,...
687,,S[n na [te
688,,Ne su te s[n?
689,I ran out of ideas.,Ná minhu nea menyɛ.
690,,Me nsusui[ asa


In [14]:
from transformers import pipeline
import pandas as pd

# Load sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

def get_sentiment(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    non_na_df = df[[col_name]].dropna().copy()
    sentences = non_na_df[col_name].tolist()
    
    sentiments = []
    for sentence in sentences:
        result = sentiment_pipeline(sentence)[0]
        sentiments.append(result['label'])
    
    result_df = pd.DataFrame({
        'sentence': sentences,
        'sentiment': sentiments
    })
    
    return result_df

# Example usage
sentiment_df = get_sentiment(base_one_to_many_df, 'English (Source, One)')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [15]:
sentiment_df

Unnamed: 0,sentence,sentiment
0,I belong here.,NEGATIVE
1,Kwaku sings quite well.,POSITIVE
2,He calls her up every night.,POSITIVE
3,I don't have any money at all.,NEGATIVE
4,I love your dress.,POSITIVE
...,...,...
120,Kwame isn't happy about what's happened.,NEGATIVE
121,I'm leaving.,POSITIVE
122,Don't mess with me.,POSITIVE
123,What's it like?,POSITIVE


In [None]:
# no_nan_one_to_many_df.mean()

## Counts for English (Source, One)

In [None]:
collect_stats[f'{no_nan_one_to_many_df.columns[0]}'] = len(no_nan_one_to_many_df)
collect_stats

In [None]:
rows = no_nan_one_to_many_df.values
rows

In [None]:
cleaned_entry = []
for rows_idx in range(len(rows)):
    sentence = rows[rows_idx]
    # print(type(sentence), sentence)
    updated_sentence = ' '.join(sentence)
    # print(type(updated_sentence), updated_sentence)
    cleaned_entry.append(updated_sentence)

no_nan_one_to_many_df['English (Source, One)'] = cleaned_entry
no_nan_one_to_many_df

In [None]:
cleaned_entry

In [None]:
base_one_to_many_pt = DataProcessing.df_to_pivot_table(base_one_to_many_df, 'English (Source, One)', ["English (Source, One)", "Akan (Target, Many)"])
base_one_to_many_pt

In [None]:
DataProcessing.drop_data_from_df(base_one_to_many_pt, dropna=True)

### NaN exists in Akan (Target, Many)

In [None]:
base_one_to_many_df.iloc[: , [1]]

In [None]:
base_one_to_many_df.iloc[: , [1]].dropna()

### No NaN in English (Source, One)

In [None]:
base_one_to_many_df.iloc[: , [0]]

In [None]:
base_one_to_many_df.iloc[: , [0]].dropna()

Word cloud

In [None]:
stats = {} 
stats['English (Source, One) --- N --- NaN'] = len(base_one_to_many_df.iloc[: , [0]])
stats['Akan (Target, Many) --- N --- NaN'] = len(base_one_to_many_df.iloc[: , [1]])
stats['Akan (Source, One) --- N'] = len(base_one_to_many_df.iloc[: , [0]].dropna())
stats['English (Target, Many) --- N'] = len(base_one_to_many_df.iloc[: , [1]].dropna())
stats

In [None]:
base_one_to_many_df.iloc[: , [0]].dropna()

In [None]:
def get_sentiment(df: pd.DataFrame, col_name: str):
    df.loc[: , col_name].dropna()

    for idx, row in df.iterrows():
        row[col_name]
        print(idx, row[col_name])

get_sentiment(base_one_to_many_df, 'English (Source, One)')

In [None]:
# pip install -q transformers
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")
# data = ["I love you", "I hate you"]
sentiments = sentiment_pipeline(base_one_to_many_df.iloc[: , [0]])
sentiments

In [None]:
no_nan_one_to_many_df['Sentiments'] = sentiments
no_nan_one_to_many_df

## POS + NER

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

In [None]:
pos = []
import spacy

nlp = spacy.load("en_core_web_sm")

for idx, row in no_nan_one_to_many_df.iterrows():
    sentence = row['English (Source, One)']
    # print(sentence)
    doc = nlp(sentence)
    # print(doc)

    for token in doc:
        # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                # token.shape_, token.is_alpha, token.is_stop)
        print(sentence, token.pos_)
        pos.append(token.pos_)

In [None]:
len(pos)

In [None]:
no_nan_one_to_many_df['POS'] = pos
no_nan_one_to_many_df