### Remove:
* Null values, new_line("\n"), hashtags("#*"), emojis, other characters
### Replace:
* ['ሐ', 'ሑ', 'ሒ', 'ሓ', 'ሔ', 'ሖ'] with ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']
* ['ኀ', 'ኁ', 'ኂ', 'ኃ', 'ኄ', 'ኅ', 'ኆ'] with ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']
* ['ሠ', 'ሡ', 'ሢ', 'ሣ', 'ሤ', 'ሦ', 'ሦ', 'ሧ'] with ['ሰ, 'ሱ', 'ሲ', 'ሳ', 'ሴ', 'ስ', 'ሶ', 'ሷ']
* ['ዐ', 'ዑ', 'ዒ', 'ዓ', 'ዔ', 'ዕ', 'ዖ'] with ['አ', 'ኡ', 'ኢ', 'ኣ', 'ኤ', 'እ', 'ኦ']
* ['ጸ', 'ጹ', 'ጺ', 'ጻ', 'ጼ', 'ጽ', 'ጾ'] with ['ፀ', 'ፁ', 'ፂ', 'ፃ', 'ፄ', 'ፅ', 'ፆ']


#### Import modules

In [1]:
import sys, os, json, re
import pandas as pd
# sys.path.append(os.path.abspath(os.path.join('../scripts')))

In [None]:
class Util():
    def __init__(self) -> None:
        self.emoji_pattern = re.compile("["
                                        u"\U0001F600-\U0001F64F"  # Emoticons
                                        u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
                                        u"\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                                        u"\U0001F700-\U0001F77F"  # Alchemical Symbols
                                        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                                        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                                        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                                        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                                        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                                        u"\u2600-\u26FF"  # Miscellaneous Symbols
                                        u"\u2700-\u27BF"  # Dingbats
                                        u"\u2B50"  # Star
                                        u"\U0001F1E6-\U0001F1FF"  # Country Flags
                                        "]+", flags=re.UNICODE)
        self.symbols = re.compile("["
                                  "\""
                                  "\“"
                                  "\""
                                  "\'"
                                  "\-"
                                  "\*"
                                  "\•"
                                  "\ℹ"
                                  "\﻿"
                                  "\_"
                                  "]+")
        self.url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        self.mention_pattern = r'@(\w+)'

    def read_file(self, file_path: str) -> dict:
        # Open the file for reading
        with open(file_path, 'r') as file:
            # Load the JSON data from the file
            data = json.load(file)
            return data

    def write_file(self, file_path: str, data: dict) -> None:
        # Open the file for writing
        with open(file_path, 'w') as file:
            # Dump the JSON data to the file
            json.dump(data, file, indent=2)

    def parse_text(self, text: any) -> str:
        if isinstance(text, str):
            return text
        elif isinstance(text, list):
            contents = []
            for item in text:
                if isinstance(item, str):
                    contents.append(item)
                elif isinstance(item, dict):
                    contents.append(item['text'])
            return "".join(contents)
        else:
            return ""

    def parse_messages(self, messages: list) -> dict:
        parsed_messages = {
            'id': [],
            'text': [],
            'date': []
        }
        for message in messages:
            if message['type'] != 'message' or len(message['text']) == 0:
                continue
            parsed_messages['id'].append(message['id'])
            message_content = self.parse_text(message['text'])
            parsed_messages['text'].append(message_content)
            parsed_messages['date'].append(message['date'])
        return parsed_messages

    def extract_hashtags(self, text: str) -> list:
        return [word for word in text.split() if word.startswith('#')]

    def extract_emojis(self, text):
        return ''.join(self.emoji_pattern.findall(text))

    def remove_emojis(self, text):
        return self.emoji_pattern.sub('', text)

    def extract_symbols(self, text):
        return ''.join(self.symbols.findall(text))

    def remove_symbols(self, text):
        return self.symbols.sub(' ', text)

    def extract_urls(self, text):
        return re.findall(self.url_pattern, text)

    def extract_mentions(self, text):
        return re.findall(self.mention_pattern, text)


In [None]:
def file_reader(path: str, ) -> str:
    fname = os.path.join(path)
    with open(fname, 'r') as f:
        system_message = f.read()
    return system_message

#### Init Variables

In [2]:
parsed_dir = "../data/parsed"
cleaned_dir = "../data/cleaned"
file_name = "TIKVAH"
util = Util()

#### Read parsed data

In [3]:
df = pd.read_csv(f"{parsed_dir}/{file_name}.csv", index_col='id')
df.head()

Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
12,ታሪክ የሌለው ህዝብ ታሪካዊ ስራ ለመስራት አይጓጓም። ነፃነትም የማያውቅ ...,2017-07-27T11:37:49
14,ኢትዮጵያ ኣደይ !\n\n@tikvahethiopia,2017-07-27T11:52:34
15,አደራ ልጄ \n\nገንዘብ ውርሴን አምጪ አትበይኝ አደራ\nመኪና ቪላ ፎቅ ...,2017-07-27T12:13:05
17,ኢትዮጵያ ሀገሬ መመኪያ ነሽ ክብሬ !\n\n@tikvahethiopia,2017-07-27T13:15:01
18,ኢትዮጵያዊነት\n\nኢትዮጵያዊነት ብዙ የተለያዩ ኅብረተሰቦች የተዋኅዱበት ...,2017-07-27T13:37:46


In [4]:
df.shape

(39754, 2)

#### Remove null values

In [5]:
df = df.dropna()
df.head()

Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
12,ታሪክ የሌለው ህዝብ ታሪካዊ ስራ ለመስራት አይጓጓም። ነፃነትም የማያውቅ ...,2017-07-27T11:37:49
14,ኢትዮጵያ ኣደይ !\n\n@tikvahethiopia,2017-07-27T11:52:34
15,አደራ ልጄ \n\nገንዘብ ውርሴን አምጪ አትበይኝ አደራ\nመኪና ቪላ ፎቅ ...,2017-07-27T12:13:05
17,ኢትዮጵያ ሀገሬ መመኪያ ነሽ ክብሬ !\n\n@tikvahethiopia,2017-07-27T13:15:01
18,ኢትዮጵያዊነት\n\nኢትዮጵያዊነት ብዙ የተለያዩ ኅብረተሰቦች የተዋኅዱበት ...,2017-07-27T13:37:46


In [6]:
df.shape

(39754, 2)

#### Remove new line

In [7]:
df = df.replace('\n', ' ', regex=True)
df.head()

Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
12,ታሪክ የሌለው ህዝብ ታሪካዊ ስራ ለመስራት አይጓጓም። ነፃነትም የማያውቅ ...,2017-07-27T11:37:49
14,ኢትዮጵያ ኣደይ ! @tikvahethiopia,2017-07-27T11:52:34
15,አደራ ልጄ ገንዘብ ውርሴን አምጪ አትበይኝ አደራ መኪና ቪላ ፎቅ አትበ...,2017-07-27T12:13:05
17,ኢትዮጵያ ሀገሬ መመኪያ ነሽ ክብሬ ! @tikvahethiopia,2017-07-27T13:15:01
18,ኢትዮጵያዊነት ኢትዮጵያዊነት ብዙ የተለያዩ ኅብረተሰቦች የተዋኅዱበት አካ...,2017-07-27T13:37:46


#### Extract and remove hasthags

In [8]:
# Extract hashtags
df['hashtags'] = df['text'].apply(lambda x: util.extract_hashtags(x))
df.head()

Unnamed: 0_level_0,text,date,hashtags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12,ታሪክ የሌለው ህዝብ ታሪካዊ ስራ ለመስራት አይጓጓም። ነፃነትም የማያውቅ ...,2017-07-27T11:37:49,[]
14,ኢትዮጵያ ኣደይ ! @tikvahethiopia,2017-07-27T11:52:34,[]
15,አደራ ልጄ ገንዘብ ውርሴን አምጪ አትበይኝ አደራ መኪና ቪላ ፎቅ አትበ...,2017-07-27T12:13:05,[]
17,ኢትዮጵያ ሀገሬ መመኪያ ነሽ ክብሬ ! @tikvahethiopia,2017-07-27T13:15:01,[]
18,ኢትዮጵያዊነት ኢትዮጵያዊነት ብዙ የተለያዩ ኅብረተሰቦች የተዋኅዱበት አካ...,2017-07-27T13:37:46,[]


In [9]:
# Remove hashtags from text
df['text'] = df['text'].str.replace(r'\#\w+', '', regex=True)
df.head()

Unnamed: 0_level_0,text,date,hashtags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12,ታሪክ የሌለው ህዝብ ታሪካዊ ስራ ለመስራት አይጓጓም። ነፃነትም የማያውቅ ...,2017-07-27T11:37:49,[]
14,ኢትዮጵያ ኣደይ ! @tikvahethiopia,2017-07-27T11:52:34,[]
15,አደራ ልጄ ገንዘብ ውርሴን አምጪ አትበይኝ አደራ መኪና ቪላ ፎቅ አትበ...,2017-07-27T12:13:05,[]
17,ኢትዮጵያ ሀገሬ መመኪያ ነሽ ክብሬ ! @tikvahethiopia,2017-07-27T13:15:01,[]
18,ኢትዮጵያዊነት ኢትዮጵያዊነት ብዙ የተለያዩ ኅብረተሰቦች የተዋኅዱበት አካ...,2017-07-27T13:37:46,[]


#### Extract emojis

In [10]:
# Extract emojis using regex
df['emojis'] = df['text'].apply(util.extract_emojis)
df.tail()

Unnamed: 0_level_0,text,date,hashtags,emojis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
84203,“ በእኛ የህግ አማካሪ በኩል ጥፋት ነበረ ” - አቶ አበባው አያሌው ...,2024-01-11T21:34:06,"[#AddisAbaba, #ጥፋት, #‘ታስሮ, #መመሪያውም]",
84205,“ ቀብሩ ዛሬ ተፈጽሟል በቢሾፍቱ ቃጂማ ጊዮርጊስ ቤተክርስቲያን። የ8 ዓመ...,2024-01-11T21:59:26,[],
84207,ℹ የግብፁ መሪ የኤርትራው ፕሬዜዳንት ኢሳያስ አፈወርቂ ግብፅን እንዲጎበኙ...,2024-01-12T00:16:19,[],
84209,የጠቅላይ ሚኒስትሩ የብሔራዊ ደህንነት አማካሪ አምባሳደር ሬድዋን ሁሴን...,2024-01-12T00:19:12,"[#Ethiopia, #ጫና]",
84217,""" በምጥ የተያዘችን እናት ሊያመጣ ሲሄድ በተተኮሰበት ጥይት ተመቶ ህይወቱ...",2024-01-12T00:54:13,[#ተገደለ።],


In [11]:
df_83826 = df.loc[83826]
df_83826

text        @samcomptech ⭐ አሁንም አዳዲስ  ላፕቶፕች ገብተዋል!!!!  ብዛት...
date                                      2023-12-28T11:13:14
hashtags                                                   []
emojis                                                    ⭐👉✅
Name: 83826, dtype: object

#### Remove emojis from text

In [12]:
df['text'] = df['text'].apply(util.remove_emojis)

In [13]:
df_83826 = df.loc[83826]
df_83826

text        @samcomptech  አሁንም አዳዲስ  ላፕቶፕች ገብተዋል!!!!  ብዛትም...
date                                      2023-12-28T11:13:14
hashtags                                                   []
emojis                                                    ⭐👉✅
Name: 83826, dtype: object

#### Replace letters:
* ['ሐ', 'ሑ', 'ሒ', 'ሓ', 'ሔ', 'ሖ'] with ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']
* ['ኀ', 'ኁ', 'ኂ', 'ኃ', 'ኄ', 'ኅ', 'ኆ'] with ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']
* ['ሠ', 'ሡ', 'ሢ', 'ሣ', 'ሤ', 'ሦ', 'ሦ', 'ሧ'] with ['ሰ, 'ሱ', 'ሲ', 'ሳ', 'ሴ', 'ስ', 'ሶ', 'ሷ']
* ['ዐ', 'ዑ', 'ዒ', 'ዓ', 'ዔ', 'ዕ', 'ዖ'] with ['አ', 'ኡ', 'ኢ', 'ኣ', 'ኤ', 'እ', 'ኦ']
* ['ጸ', 'ጹ', 'ጺ', 'ጻ', 'ጼ', 'ጽ', 'ጾ'] with ['ፀ', 'ፁ', 'ፂ', 'ፃ', 'ፄ', 'ፅ', 'ፆ']

In [14]:
letters = [
  [['ሐ', 'ሑ', 'ሒ', 'ሓ', 'ሔ', 'ሖ'], ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']],
  [['ኀ', 'ኁ', 'ኂ', 'ኃ', 'ኄ', 'ኅ', 'ኆ'], ['ሀ', 'ሁ', 'ሂ', 'ሃ', 'ሄ', 'ህ', 'ሆ']],
  [['ሠ', 'ሡ', 'ሢ', 'ሣ', 'ሤ', 'ሦ', 'ሦ', 'ሧ'], ['ሰ', 'ሱ', 'ሲ', 'ሳ', 'ሴ', 'ስ', 'ሶ', 'ሷ']],
  [['ዐ', 'ዑ', 'ዒ', 'ዓ', 'ዔ', 'ዕ', 'ዖ'], ['አ', 'ኡ', 'ኢ', 'ኣ', 'ኤ', 'እ', 'ኦ']],
  [['ጸ', 'ጹ', 'ጺ', 'ጻ', 'ጼ', 'ጽ', 'ጾ'], ['ፀ', 'ፁ', 'ፂ', 'ፃ', 'ፄ', 'ፅ', 'ፆ']]
]
for letter in letters:
  for i in range(len(letter[0])):
    df['text'] = df['text'].str.replace(letter[0][i], letter[1][i])
    


#### Extract symbols

In [15]:
df['symbols'] = df['text'].apply(util.extract_symbols)
df.tail()

Unnamed: 0_level_0,text,date,hashtags,emojis,symbols
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
84203,“ በእኛ የህግ አማካሪ በኩል ጥፋት ነበረ ” - አቶ አበባው አያሌው ...,2024-01-11T21:34:06,"[#AddisAbaba, #ጥፋት, #‘ታስሮ, #መመሪያውም]",,“-““““““““
84205,“ ቀብሩ ዛሬ ተፈፅሟል በቢሾፍቱ ቃጂማ ጊዮርጊስ ቤተክርስቲያን። የ8 ኣመ...,2024-01-11T21:59:26,[],,“-““““
84207,ℹ የግብፁ መሪ የኤርትራው ፕሬዜዳንት ኢሳያስ አፈወርቂ ግብፅን እንዲጎበኙ...,2024-01-12T00:16:19,[],,ℹ
84209,የጠቅላይ ሚኒስትሩ የብሄራዊ ደህንነት አማካሪ አምባሳደር ሬድዋን ሁሴን...,2024-01-12T00:19:12,"[#Ethiopia, #ጫና]",,""""""""""
84217,""" በምጥ የተያዘችን እናት ሊያመጣ ሲሄድ በተተኮሰበት ጥይት ተመቶ ህይወቱ...",2024-01-12T00:54:13,[#ተገደለ።],,"""""-"


In [16]:
df['text'] = df['text'].apply(util.remove_symbols)
df.tail()

Unnamed: 0_level_0,text,date,hashtags,emojis,symbols
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
84203,በእኛ የህግ አማካሪ በኩል ጥፋት ነበረ ” አቶ አበባው አያሌው ...,2024-01-11T21:34:06,"[#AddisAbaba, #ጥፋት, #‘ታስሮ, #መመሪያውም]",,“-““““““““
84205,ቀብሩ ዛሬ ተፈፅሟል በቢሾፍቱ ቃጂማ ጊዮርጊስ ቤተክርስቲያን። የ8 ኣመ...,2024-01-11T21:59:26,[],,“-““““
84207,የግብፁ መሪ የኤርትራው ፕሬዜዳንት ኢሳያስ አፈወርቂ ግብፅን እንዲጎበኙ...,2024-01-12T00:16:19,[],,ℹ
84209,የጠቅላይ ሚኒስትሩ የብሄራዊ ደህንነት አማካሪ አምባሳደር ሬድዋን ሁሴን...,2024-01-12T00:19:12,"[#Ethiopia, #ጫና]",,""""""""""
84217,በምጥ የተያዘችን እናት ሊያመጣ ሲሄድ በተተኮሰበት ጥይት ተመቶ ህይወቱ...,2024-01-12T00:54:13,[#ተገደለ።],,"""""-"


#### Extract Links

In [17]:
df['links'] = df['text'].apply(util.extract_urls)
df.tail()

Unnamed: 0_level_0,text,date,hashtags,emojis,symbols,links
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
84203,በእኛ የህግ አማካሪ በኩል ጥፋት ነበረ ” አቶ አበባው አያሌው ...,2024-01-11T21:34:06,"[#AddisAbaba, #ጥፋት, #‘ታስሮ, #መመሪያውም]",,“-““““““““,[]
84205,ቀብሩ ዛሬ ተፈፅሟል በቢሾፍቱ ቃጂማ ጊዮርጊስ ቤተክርስቲያን። የ8 ኣመ...,2024-01-11T21:59:26,[],,“-““““,[]
84207,የግብፁ መሪ የኤርትራው ፕሬዜዳንት ኢሳያስ አፈወርቂ ግብፅን እንዲጎበኙ...,2024-01-12T00:16:19,[],,ℹ,[]
84209,የጠቅላይ ሚኒስትሩ የብሄራዊ ደህንነት አማካሪ አምባሳደር ሬድዋን ሁሴን...,2024-01-12T00:19:12,"[#Ethiopia, #ጫና]",,"""""""""",[]
84217,በምጥ የተያዘችን እናት ሊያመጣ ሲሄድ በተተኮሰበት ጥይት ተመቶ ህይወቱ...,2024-01-12T00:54:13,[#ተገደለ።],,"""""-",[]


#### Remove links 

In [18]:
df['text'] = df['text'].str.replace(util.url_pattern, '', regex=True).str.strip()
df.tail()

Unnamed: 0_level_0,text,date,hashtags,emojis,symbols,links
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
84203,በእኛ የህግ አማካሪ በኩል ጥፋት ነበረ ” አቶ አበባው አያሌው በአዲ...,2024-01-11T21:34:06,"[#AddisAbaba, #ጥፋት, #‘ታስሮ, #መመሪያውም]",,“-““““““““,[]
84205,ቀብሩ ዛሬ ተፈፅሟል በቢሾፍቱ ቃጂማ ጊዮርጊስ ቤተክርስቲያን። የ8 ኣመት ...,2024-01-11T21:59:26,[],,“-““““,[]
84207,የግብፁ መሪ የኤርትራው ፕሬዜዳንት ኢሳያስ አፈወርቂ ግብፅን እንዲጎበኙ ግ...,2024-01-12T00:16:19,[],,ℹ,[]
84209,የጠቅላይ ሚኒስትሩ የብሄራዊ ደህንነት አማካሪ አምባሳደር ሬድዋን ሁሴን ፤...,2024-01-12T00:19:12,"[#Ethiopia, #ጫና]",,"""""""""",[]
84217,በምጥ የተያዘችን እናት ሊያመጣ ሲሄድ በተተኮሰበት ጥይት ተመቶ ህይወቱ አ...,2024-01-12T00:54:13,[#ተገደለ።],,"""""-",[]


#### Extract mentions

In [19]:
df['mentions'] = df['text'].apply(util.extract_mentions)
df.tail()

Unnamed: 0_level_0,text,date,hashtags,emojis,symbols,links,mentions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
84203,በእኛ የህግ አማካሪ በኩል ጥፋት ነበረ ” አቶ አበባው አያሌው በአዲ...,2024-01-11T21:34:06,"[#AddisAbaba, #ጥፋት, #‘ታስሮ, #መመሪያውም]",,“-““““““““,[],[tikvahethiopia]
84205,ቀብሩ ዛሬ ተፈፅሟል በቢሾፍቱ ቃጂማ ጊዮርጊስ ቤተክርስቲያን። የ8 ኣመት ...,2024-01-11T21:59:26,[],,“-““““,[],[tikvahethiopia]
84207,የግብፁ መሪ የኤርትራው ፕሬዜዳንት ኢሳያስ አፈወርቂ ግብፅን እንዲጎበኙ ግ...,2024-01-12T00:16:19,[],,ℹ,[],[tikvahethiopia]
84209,የጠቅላይ ሚኒስትሩ የብሄራዊ ደህንነት አማካሪ አምባሳደር ሬድዋን ሁሴን ፤...,2024-01-12T00:19:12,"[#Ethiopia, #ጫና]",,"""""""""",[],[tikvahethiopia]
84217,በምጥ የተያዘችን እናት ሊያመጣ ሲሄድ በተተኮሰበት ጥይት ተመቶ ህይወቱ አ...,2024-01-12T00:54:13,[#ተገደለ።],,"""""-",[],[tikvahethiopia]


In [20]:
df['text'] = df['text'].str.replace(util.mention_pattern, '', regex=True).str.strip()
df.tail()

Unnamed: 0_level_0,text,date,hashtags,emojis,symbols,links,mentions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
84203,በእኛ የህግ አማካሪ በኩል ጥፋት ነበረ ” አቶ አበባው አያሌው በአዲ...,2024-01-11T21:34:06,"[#AddisAbaba, #ጥፋት, #‘ታስሮ, #መመሪያውም]",,“-““““““““,[],[tikvahethiopia]
84205,ቀብሩ ዛሬ ተፈፅሟል በቢሾፍቱ ቃጂማ ጊዮርጊስ ቤተክርስቲያን። የ8 ኣመት ...,2024-01-11T21:59:26,[],,“-““““,[],[tikvahethiopia]
84207,የግብፁ መሪ የኤርትራው ፕሬዜዳንት ኢሳያስ አፈወርቂ ግብፅን እንዲጎበኙ ግ...,2024-01-12T00:16:19,[],,ℹ,[],[tikvahethiopia]
84209,የጠቅላይ ሚኒስትሩ የብሄራዊ ደህንነት አማካሪ አምባሳደር ሬድዋን ሁሴን ፤...,2024-01-12T00:19:12,"[#Ethiopia, #ጫና]",,"""""""""",[],[tikvahethiopia]
84217,በምጥ የተያዘችን እናት ሊያመጣ ሲሄድ በተተኮሰበት ጥይት ተመቶ ህይወቱ አ...,2024-01-12T00:54:13,[#ተገደለ።],,"""""-",[],[tikvahethiopia]


#### Remove extra spaces

In [21]:
df['text'] = df['text'].str.replace('\s+', ' ', regex=True).str.strip()

In [22]:
df['text'] = df['text'].replace(r'!+', '!', regex=True)
df['text'] = df['text'].replace(r'\.+', '', regex=True)

In [23]:
df.tail()

Unnamed: 0_level_0,text,date,hashtags,emojis,symbols,links,mentions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
84203,በእኛ የህግ አማካሪ በኩል ጥፋት ነበረ ” አቶ አበባው አያሌው በአዲስ አ...,2024-01-11T21:34:06,"[#AddisAbaba, #ጥፋት, #‘ታስሮ, #መመሪያውም]",,“-““““““““,[],[tikvahethiopia]
84205,ቀብሩ ዛሬ ተፈፅሟል በቢሾፍቱ ቃጂማ ጊዮርጊስ ቤተክርስቲያን። የ8 ኣመት ...,2024-01-11T21:59:26,[],,“-““““,[],[tikvahethiopia]
84207,የግብፁ መሪ የኤርትራው ፕሬዜዳንት ኢሳያስ አፈወርቂ ግብፅን እንዲጎበኙ ግ...,2024-01-12T00:16:19,[],,ℹ,[],[tikvahethiopia]
84209,የጠቅላይ ሚኒስትሩ የብሄራዊ ደህንነት አማካሪ አምባሳደር ሬድዋን ሁሴን ፤...,2024-01-12T00:19:12,"[#Ethiopia, #ጫና]",,"""""""""",[],[tikvahethiopia]
84217,በምጥ የተያዘችን እናት ሊያመጣ ሲሄድ በተተኮሰበት ጥይት ተመቶ ህይወቱ አ...,2024-01-12T00:54:13,[#ተገደለ።],,"""""-",[],[tikvahethiopia]


#### Save cleaned dataframe

In [24]:
df.to_csv(f"{cleaned_dir}/{file_name}.csv")

#### Save to .txt file

In [25]:
df['text'].to_csv(f"{cleaned_dir}/{file_name}.txt", index=False, header=False)