In [51]:
import pandas as pd
import re

# Load the Sinhala comments DataFrame from the CSV file
loaded_sinhala_comments_df = pd.read_csv('../../data/TKCOMMENT_7210499881536916737_3970_comments.csv')

# Define a regular expression pattern to match English words
english_word_pattern = re.compile(r'\b[a-zA-Z]+\b')

# Function to extract English words from a comment
def extract_english_words(comment):
    english_words = re.findall(english_word_pattern, comment)
    return english_words

# Apply the function to the 'Comment' column
loaded_sinhala_comments_df['English_Words'] = loaded_sinhala_comments_df['Comment'].apply(extract_english_words)

# Flatten the list of English words
english_word_list = [word for sublist in loaded_sinhala_comments_df['English_Words'] for word in sublist]

# Remove duplicates
english_word_list = list(set(english_word_list))

# Display the list of English words
print(english_word_list)

['deka', 'inna', 'Ape', 'oka', 'asawat', 'poor', 'patau', 'thanikarama', 'rekkek', 'rejister', 'aiyla', 'hemathenama', 'ghanna', 'kon', 'wereddak', 'kohomath', 'gati', 'unm', 'watinawa', 'higa', 'geniyanda', 'fake', 'thana', 'piruna', 'policy', 'higannunta', 'karanawa', 'tani', 'Hukawanna', 'nodi', 'ahanna', 'eth', 'damme', 'mey', 'come', 'suwa', 'wla', 'lowth', 'media', 'therum', 'kalaknni', 'rmb', 'krna', 'baike', 'ekektawath', 'yunipom', 'idimila', 'vage', 'henayayak', 'ban', 'lilanne', 'hri', 'anawa', 'sri', 'jelak', 'naginna', 'geeniyek', 'cariyo', 'pakekda', 'weaige', 'Kama', 'anapn', 'basith', 'Parssaman', 'watenna', 'vede', 'Aa', 'wdak', 'sakkili', 'asawk', 'hukala', 'photo', 'neh', 'gothayage', 'Sanju', 'daahan', 'gananwala', 'hadanawa', 'meeekata', 'ekkuth', 'hutthe', 'BadaLa', 'ehek', 'duty', 'namawath', 'BANDARAWELA', 'httige', 'ammage', 'asave', 'ube', 'wede', 'kari', 'botu', 'enne', 'polices', 'mawa', 'keri', 'onna', 'ekkenata', 'it', 'bari', 'da', 'thawas', 'lesi', 'amma

In [52]:
translation_dict = {
    'thama': '‡∂≠‡∑è‡∂∏',
    'moda' : '‡∂∏‡∑ú‡∂©',
    'police' : '‡∂¥‡∑ú‡∂Ω‡∑í‡∑É‡∑ä',
    'gothyek' : '‡∂ú‡∑ú‡∂≠‡∂∫‡∑ô‡∂ö‡∑ä',
    'padi': '‡∂¥‡∂Ø‡∑í',
    'me': '‡∂∏‡∑ö',
    'issues': '‡∂ú‡∑ê‡∂ß‡∑Ö‡∑î',
    'nam': '‡∂±‡∂∏‡∑ä',
    'lei': '‡∂Ω‡∑ô‡∑É',
    'badaginne': '‡∂∂‡∂Ø‡∂ú‡∑í‡∂±‡∑ä‡∂±‡∑ö',
    'hutta': '‡∑Ñ‡∑î‡∂ß‡∑ä‡∂ß',
    'un': '‡∂ã‡∂±‡∑ä',
    'kawruth': '‡∂ö‡∑Ä‡∑î‡∂ª‡∑î‡∂≠‡∑ä',
    'ahuneme': '‡∂Ö‡∑Ñ‡∑î‡∂±‡∑ô‡∂∏‡∑ö',
    'Skill': '‡∂ö‡∑ä‡∑Ç‡∂ö‡∂≠‡∑ä‡∑Ä‡∂∫',
    'Boy': '‡∂∂‡∑ú‡∂∫‡∑í',
    'bari': '‡∂∂‡∂ª‡∑í',
    'bath': '‡∂∂‡∂≠‡∑ä',
    'Save': '‡∑É‡∑î‡∂ª‡∂ö‡∑í‡∂±‡∑ä‡∂±',
    'pixel': '‡∂¥‡∑í‡∂ö‡∑ä‡∑Å‡∂Ω‡∑ä',
    'display': '‡∂¥‡∑ä‡∂ª‡∂Ø‡∂ª‡∑ä‡∑Å‡∂±‡∂∫',
    'Samsung': '‡∑É‡∑ê‡∂∏‡∑ä‡∑É‡∂±‡∑ä‡∂ú‡∑ä',
    'Ruchira': '‡∂ª‡∑î‡∂†‡∑í‡∂ª',
    'Mata': '‡∂∏‡∂ß',
    'S': '‡∂ë‡∑É‡∑ä',
    'chaina': '‡∂†‡∑ì‡∂±',
    'recommend': '‡∂±‡∑í‡∑Ä‡∑ê‡∂ª‡∂Ø‡∑í ‡∂ö‡∂ª‡∂±‡∑Ä‡∑è',
    'ehat': '‡∂ë‡∑Ñ‡∑è‡∂ß',
    'ai': '‡∂í‡∂∫‡∑í',
    'Apple': '‡∂á‡∂¥‡∂Ω‡∑ä',
    'ipone': '‡∂∫‡∂∫‡∑í‡∂¥‡∑ù‡∂±‡∑ä',
    'Chamod': '‡∂†‡∑è‡∂∏‡∑ú‡∂©‡∑ä',
    'nama': '‡∂±‡∂∏',
    'samsung': '‡∑É‡∑ê‡∂∏‡∑ä‡∑É‡∂±‡∑ä‡∂ú‡∑ä',
    'A': '‡∂í',
    'ma': '‡∂∏',
    'katat': '‡∂ö‡∂ß‡∂ß‡∑ä',
    'siravatama': '‡∑É‡∑í‡∂ª‡∑Ä‡∂ß‡∂∏',
    'k': '‡∂ö‡∑ä',
    'line': '‡∑Ä‡∑í‡∑É‡∑ä‡∑Ä‡∑è‡∑É‡∑ì',
    'Technical': '‡∂≠‡∑è‡∂ö‡∑ä‡∑Ç‡∂´‡∑Ä‡∑è‡∂Ø‡∂∫',
    'Sagara': '‡∑É‡∑è‡∂ú‡∂ª',
    'eka': '‡∂ë‡∂ö',
    'kiyanna': '‡∂ö‡∑í‡∂∫‡∂±‡∑ä‡∂±',
    'iphone': '‡∂Ö‡∂∫‡∑í‡∑Ü‡∑ù‡∂±‡∑ä',
    'enne': '‡∂ë‡∂±‡∑ä‡∂±‡∑ö',
    'England': '‡∂ë‡∂Ç‡∂ú‡∂Ω‡∂±‡∑ä‡∂≠‡∂∫',
    'tv': '‡∂ß‡∑ì‡∑Ä‡∑ì',
    's': '‡∑É‡∑ì',
    'flagship': '‡∂¥‡∑ä‡∂Ω‡∑ê‡∂ú‡∑ä‡∑Ç‡∑í‡∂¥‡∑ä',
    'karana': '‡∂ö‡∂ª‡∂±‡∑Ä‡∑è',
    'wenas': '‡∑Ä‡∑ô‡∂±‡∑É‡∑ä',
    'green': '‡∂ö‡∑ú‡∑Ö',
    'ganan': '‡∂ú‡∂±‡∂±‡∑ä',
    'High': '‡∑Ñ‡∂∫‡∑í',
    'wadak': '‡∑Ä‡∂©‡∂ö‡∑ä',
    'Update': '‡∂∫‡∑è‡∑Ä‡∂≠‡∑ä‡∂ö‡∑è‡∂Ω‡∑ì',
    'Samsumg': '‡∑É‡∑ê‡∂∏‡∑ä‡∑É‡∂±‡∑ä‡∂ú‡∑ä',
    'use': '‡∂∑‡∑è‡∑Ä‡∑í‡∂≠‡∑è',
    'apple': '‡∂á‡∂¥‡∂Ω‡∑ä',
    'serios': '‡∑É‡∑ì‡∂ª‡∑í‡∂∫‡∑É‡∑ä',
    'ane': '‡∂á‡∂±‡∑ö',
    'screen': '‡∑É‡∑ä‡∂ö‡∑ä‚Äç‡∂ª‡∑ì‡∂±‡∑ä',
    'Dilshan': '‡∂Ø‡∑í‡∂Ω‡∑ä‡∑Å‡∂±‡∑ä',
    'poddk': '‡∂¥‡∑ú‡∂©‡∑ä‡∂ö‡∑ä',
    'hil': '‡∑Ñ‡∑í‡∂Ω‡∑ä',
    'Baik':'‡∂∂‡∂∫‡∑í‡∂ö‡∑ä',
    'phone': '‡∂Ø‡∑î‡∂ª‡∂ö‡∂Æ‡∂±‡∂∫',
    'Madushan': '‡∂∏‡∂∞‡∑ñ‡∑Ç‡∂±‡∑ä',
    'eke': '‡∂ë‡∂ö‡∑ö',
    'series': '‡∑É‡∑ì‡∂ª‡∑ì‡∑É‡∑ä',
    'L': '‡∂ë‡∂Ω‡∑ä',
    'photo': '‡∂°‡∑è‡∂∫‡∑è‡∂ª‡∑ñ‡∂¥',
    'na': '‡∂±‡∑ë',
    'weda': '‡∑Ä‡∑ô‡∂©',
    'kiyala': '‡∂ö‡∑í‡∂∫‡∂Ω‡∑è',
    'hodai': '‡∑Ñ‡∑ú‡∂≥‡∂∫‡∑í',
    'speed': '‡∑Ä‡∑ö‡∂ú‡∂∫',
    'ge': '‡∂ú‡∑ö',
    'ledak': '‡∂Ω‡∑ö‡∂õ',
     'wada': '‡∑Ä‡∑è‡∂©',
    'Review': '‡∂Ö‡∂Ø‡∑Ñ‡∑É‡∑ä',
    'ganak': '‡∂ú‡∂±‡∂ö‡∑ä',
    'dan': '‡∂Ø‡∂±‡∑ä',
    'ban' : '‡∂∂‡∂±‡∑ä',
    'Nirosh': '‡∂±‡∑í‡∂ª‡∑ù‡∑Ç‡∑ä',
}

print(translation_dict)


{'thama': '‡∂≠‡∑è‡∂∏', 'moda': '‡∂∏‡∑ú‡∂©', 'police': '‡∂¥‡∑ú‡∂Ω‡∑í‡∑É‡∑ä', 'gothyek': '‡∂ú‡∑ú‡∂≠‡∂∫‡∑ô‡∂ö‡∑ä', 'padi': '‡∂¥‡∂Ø‡∑í', 'me': '‡∂∏‡∑ö', 'issues': '‡∂ú‡∑ê‡∂ß‡∑Ö‡∑î', 'nam': '‡∂±‡∂∏‡∑ä', 'lei': '‡∂Ω‡∑ô‡∑É', 'badaginne': '‡∂∂‡∂Ø‡∂ú‡∑í‡∂±‡∑ä‡∂±‡∑ö', 'hutta': '‡∑Ñ‡∑î‡∂ß‡∑ä‡∂ß', 'un': '‡∂ã‡∂±‡∑ä', 'kawruth': '‡∂ö‡∑Ä‡∑î‡∂ª‡∑î‡∂≠‡∑ä', 'ahuneme': '‡∂Ö‡∑Ñ‡∑î‡∂±‡∑ô‡∂∏‡∑ö', 'Skill': '‡∂ö‡∑ä‡∑Ç‡∂ö‡∂≠‡∑ä‡∑Ä‡∂∫', 'Boy': '‡∂∂‡∑ú‡∂∫‡∑í', 'bari': '‡∂∂‡∂ª‡∑í', 'bath': '‡∂∂‡∂≠‡∑ä', 'Save': '‡∑É‡∑î‡∂ª‡∂ö‡∑í‡∂±‡∑ä‡∂±', 'pixel': '‡∂¥‡∑í‡∂ö‡∑ä‡∑Å‡∂Ω‡∑ä', 'display': '‡∂¥‡∑ä‡∂ª‡∂Ø‡∂ª‡∑ä‡∑Å‡∂±‡∂∫', 'Samsung': '‡∑É‡∑ê‡∂∏‡∑ä‡∑É‡∂±‡∑ä‡∂ú‡∑ä', 'Ruchira': '‡∂ª‡∑î‡∂†‡∑í‡∂ª', 'Mata': '‡∂∏‡∂ß', 'S': '‡∂ë‡∑É‡∑ä', 'chaina': '‡∂†‡∑ì‡∂±', 'recommend': '‡∂±‡∑í‡∑Ä‡∑ê‡∂ª‡∂Ø‡∑í ‡∂ö‡∂ª‡∂±‡∑Ä‡∑è', 'ehat': '‡∂ë‡∑Ñ‡∑è‡∂ß', 'ai': '‡∂í‡∂∫‡∑í', 'Apple': '‡∂á‡∂¥‡∂Ω‡∑ä', 'ipone': '‡∂∫‡∂∫‡∑í‡∂¥‡∑ù‡∂±‡∑ä', 'Chamod': '‡∂†‡∑è‡∂∏‡∑ú‡∂©‡∑ä', 'nama': '‡∂±‡∂∏', 'samsung': '‡∑É‡∑ê‡∂∏‡∑ä‡∑É‡∂±‡∑ä‡∂ú‡∑ä', 'A': '‡∂í', 'ma': '‡∂∏', 'kat

In [53]:
# Replace English words with their Sinhala translations in the 'Comment' column
def replace_english_with_sinhala(comment):
    words = comment.split()
    translated_words = [translation_dict.get(word, word) for word in words]
    return ' '.join(translated_words)

# Apply the replacement function to the 'Comment' column
loaded_sinhala_comments_df['Comment'] = loaded_sinhala_comments_df['Comment'].apply(replace_english_with_sinhala)

# Display the DataFrame with replaced comments
print(loaded_sinhala_comments_df[['Comment', 'Language']])

                                                Comment Language
0                                         2000 ‡∂ë‡∂ö‡∑ô‡∂ö‡∑ä‡∂Ø üòÇ       si
1                                   ‡∂∏‡∑ú‡∂© ‡∂¥‡∑ú‡∂Ω‡∑í‡∑É‡∑ä gothyeküòÇ       en
2                                         ‡∑Ñ‡∑ú‡∂Ø ‡∑Ä‡∑ê‡∂©‡∑ö..üòÇüòÇüòÇ       si
3                            ‡∂ö‡∑ú‡∂Ω‡∑ä‡∂Ω‡∂ß bike ‡∂ë‡∂ö‡∂ß ‡∂Ü‡∑É ‡∑Ñ‡∑í‡∂≠‡∑í‡∂ΩüòÖüòÖ       si
4                                              ‡∂ú‡∑Ñ‡∂¥‡∑í‡∂∫ üòÇüíî       si
...                                                 ...      ...
3965                                                  üòÇ      NaN
3966  ‡∑É‡∑í‡∂ª‡∑Ä‡∂ß‡∂∏ ‡∂∂‡∂±‡∑ä apitath ‡∂¥‡∑ú‡∂Ω‡∑í‡∑É‡∑ä ponnayo nadu danna t...       en
3967  ‡∂ö‡∑ú‡∂Ω‡∑ä‡∂Ω‡∂±‡∑ä‡∂ú‡∑ö ‡∑Ñ‡∑ì‡∂± ‡∑Ä‡∂Ω‡∂ß ‡∂á‡∑Ä‡∑í‡∂Ω‡∑ä‡∂Ω‡∑è ‡∂Ö‡∂∏‡∑ä‡∂∏‡∂ß ‡∑Ñ‡∑ê‡∂∏‡∑í‡∂±‡∑ô‡∂± ‡∑Ñ‡∑î‡∂±‡∑ä ‡∂≠...       si
3968    oya wage balu ‡∑Ä‡∑è‡∂© karama minissu iwasanne ‡∂±‡∑ë ne       ja
3969                            ‡∑É‡∂∫‡∑í‡∂Ω‡∑è‡∂±‡

In [54]:
# Save the DataFrame with replaced comments to a CSV file
loaded_sinhala_comments_df.to_csv('../../data/replaced_english_to_sinhala.csv', index=False)