Step 1: import libraries

In [None]:

import pandas as pd
from collections import Counter
import nltk
from nltk.corpus import words as nltk_words
from googletrans import Translator
from lib.pandas_database_functions import queryCSV,createReplaceCSV,jsonToDict # Import the specific function you need
from lib.nlp import translate_word,translate_words
# Download the NLTK 'words' corpus so that nltk_words (imported above) can be read.
nltk.download('words')

Step 2: import and clean dataset

In [None]:
# Load the raw message export and show it before any cleaning.
uncleaned_df = queryCSV("datasets/raw_messages.csv")
print(uncleaned_df)
# Drop every row that has a missing value. dropna() keeps the surviving rows'
# original labels, so rebuild a contiguous 0..n-1 index on the cleaned frame.
cleaned_df = uncleaned_df.dropna().reset_index(drop=True)

Step 3: extract all words from the conversation. A word is defined as:
1. a run of characters that starts after a space and ends before the next space or the end of the line
2. an entry in the English dictionary (the NLTK `words` corpus)
A corpus is a collection of machine-readable authentic texts.

Caution: the second step takes a long time to complete. You do not need to run it again unless you do not have an nlp_processed.csv file.

In [None]:
def extractWordsFromText(arr,cleaned_df):
    """Append the whitespace-separated tokens of selected messages to ``arr``.

    Only rows whose ``user_id`` equals ``sender_id`` are read. Each value of
    their ``text`` column is split on whitespace and the pieces are added to
    ``arr`` in order. ``arr`` is modified in place; nothing is returned.
    """
    same_sender = cleaned_df['user_id'] == cleaned_df['sender_id']
    selected_texts = cleaned_df.loc[same_sender, 'text']
    for message in selected_texts:
        arr.extend(message.split())
# Lazily-built set of the NLTK word list; None until the first lookup.
_ENGLISH_VOCABULARY = None


def _english_vocabulary():
    """Return the NLTK word list as a set, building it on first use only."""
    global _ENGLISH_VOCABULARY
    if _ENGLISH_VOCABULARY is None:
        _ENGLISH_VOCABULARY = set(nltk_words.words())
    return _ENGLISH_VOCABULARY


def is_english_word(word):
    """Return True when the lower-cased ``word`` is in the NLTK word list.

    Strings made up solely of the letters 'h' and 'a' (for example "haha",
    but also "a", "ha" and the empty string) are rejected before the lookup.

    The word list is converted to a set once and reused, instead of being
    rebuilt for every word that is checked.
    """
    lowered = word.lower()
    if set(lowered) <= {'h', 'a'}:
        return False
    return lowered in _english_vocabulary()

def extractWordsIntoDF(array):
    """Build a word-frequency table from a list of tokens.

    Tokens are lower-cased before counting. The returned DataFrame has one row
    per distinct token with the columns ``Word``, ``Count`` and ``isWord``
    (the result of ``is_english_word`` for that token), ordered by ``Count``
    from highest to lowest.
    """
    tally = Counter(token.lower() for token in array)
    frame = pd.DataFrame(list(tally.items()), columns=['Word', 'Count'])
    frame['isWord'] = frame['Word'].apply(is_english_word)
    return frame.sort_values(by='Count', ascending=False)


In [None]:

# Gather the tokens into a fresh list (filled in place by extractWordsFromText).
words_arr = []
extractWordsFromText(words_arr, cleaned_df)

# Tabulate the tokens, show the table, then write it to disk for the next step.
words_df = extractWordsIntoDF(words_arr)
print(words_df)
createReplaceCSV(words_df, "datasets/words.csv")

Step 4: convert each word to its text in another language, along with its pronunciation (sound via URL)

In [None]:
translator = Translator()
new_words_df = queryCSV("datasets/words.csv")
language_dict = jsonToDict("translator_registry/languages.json")

# Keep only the rows flagged as English words and renumber them from 0.
new_words_df = new_words_df[new_words_df['isWord'] == True].reset_index(drop=True)

# Work on an independent copy of the first 50 rows, so that the columns added
# below are written to this frame and not to a view of new_words_df.
truncated_words_df = new_words_df.head(50).copy()

# Target language for the translation.
lang_code = 'ko'

# translate_word is called once per word.
# NOTE(review): each result is presumably a (translation, pronunciation) pair,
# since it is unpacked into two columns below - confirm against lib.nlp.
translations = truncated_words_df['Word'].apply(
    lambda word: translate_word(word, language_code=lang_code, translator=translator)
)

# Spread the per-word results over two new columns, aligned on the row index.
translation_columns = pd.DataFrame(translations.tolist(), index=truncated_words_df.index)
truncated_words_df[['Translation', 'Pronunciation']] = translation_columns
print(truncated_words_df)

# Tag every row with the language code and the name registered for it
# ('Unknown' when the code has no entry in the registry).
truncated_words_df['language_code'] = lang_code
truncated_words_df['language'] = truncated_words_df['language_code'].apply(
    lambda code: language_dict.get(code, 'Unknown')
)
createReplaceCSV(truncated_words_df, "datasets/translated_words.csv")


In [None]:
# Disabled alternative to the per-word translation above. It is kept inside a
# string literal, so none of it executes. It would call translate_words once for
# the whole 'Word' column and join the result to truncated_words_df column-wise.
# NOTE(review): pd.concat(axis=1) aligns rows on the index, so this presumably
# relies on truncated_words_df having a 0..n-1 index - confirm before enabling.
''' ALTERNATIVE METHOD:
# Set language_code
arrayOfTranslations = translate_words(truncated_words_df['Word'],lang_code,translator)
# Create a DataFrame from the translation array
translation_df = pd.DataFrame(arrayOfTranslations, columns=['Translation', 'Pronunciation'])
# Concatenate the existing DataFrame and the translation DataFrame column-wise
result_df = pd.concat([truncated_words_df, translation_df],axis =1)
result_df['language_code'] = lang_code
result_df['language'] = result_df['language_code'].apply(lambda x: language_dict.get(x, 'Unknown'))


createReplaceCSV(result_df,"datasets/translated_words.csv")
'''