In [118]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [119]:
# Add the parent directory (resolved against the current working directory)
# to the module search path so that sibling packages can be imported.
import os
import sys

PARENT_DIR = os.path.abspath('..')
# Guard the append so that re-running this cell does not stack duplicate
# entries onto sys.path.
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)

In [120]:
# importing modules from scripts
import scripts.data_preprocessing as p

In [121]:
# Load the MerttEka Telegram channel data into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/telegram_data.csv')

In [122]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,ምርጥ ዕቃ,@MerttEka,6805,📣 Car Aromatherapy Solar Vortex\n\n📎 ይሄንን👉 t.m...,2024-09-26 16:38:22+00:00,
1,ምርጥ ዕቃ,@MerttEka,6804,,2024-09-26 16:16:51+00:00,photos\@MerttEka_6804.jpg
2,ምርጥ ዕቃ,@MerttEka,6803,,2024-09-26 16:16:51+00:00,photos\@MerttEka_6803.jpg
3,ምርጥ ዕቃ,@MerttEka,6802,📣 Car Aromatherapy Solar Vortex\n\n✔️ የመኪና መዓዛ...,2024-09-26 16:16:51+00:00,photos\@MerttEka_6802.jpg
4,ምርጥ ዕቃ,@MerttEka,6801,🔠🔠🔠🔠 🔠🔠🔠🔠🔠\n📣 GW HAIR DRYER/Blower\n\n✔️ የፀጉር ...,2024-09-26 12:31:45+00:00,photos\@MerttEka_6801.jpg


In [123]:
# Show the column labels available in the dataset
column_labels = df.columns
print(column_labels)

Index(['Channel Title', 'Channel Username', 'ID', 'Message', 'Date',
       'Media Path'],
      dtype='object')


In [124]:
# Data type (dtype) of each column
df.dtypes

Channel Title       object
Channel Username    object
ID                   int64
Message             object
Date                object
Media Path          object
dtype: object

### Data Quality Assessment

In [125]:
# Per-column missing-value summary from the project helper. Judging by the
# recorded output, the helper returns a table (missing count, % of total and
# dtype per column) rather than a single number, so it is printed under its
# own heading instead of being interpolated into a sentence.
nan_counts = p.check_missing_values(df)
print("Missing values per column:")
print(nan_counts)

Numer of nan values in the df is                   Missing Values  % of Total Values Data Types
Message                     1552              27.19     object
Media Path                  1183              20.72     object
Channel Username               0               0.00     object
Channel Title                  0               0.00     object
ID                             0               0.00      int64
Date                           0               0.00     object


In [126]:
# Overall percentage of missing values in the DataFrame, reported by the
# project helper from scripts.data_preprocessing (imported as `p`).
p.caculate_missing_percentage(df)

The dataset has 7.98% missing values.


In [127]:
# Count rows that exactly repeat an earlier row across every column
duplicate_rows = df.duplicated().sum()
print(f"Numer of duplicate rows: {duplicate_rows}")

Numer of duplicate rows: 0


### Handling Missing Values
* For this analysis we focus on the `Message` column.
* Rows whose `Message` value is missing are dropped.

In [128]:
# Keep only the rows that have message text; rows whose Message is NaN are
# discarded (axis=0 and how='any' are the pandas defaults, spelled out here).
df = df.dropna(axis=0, how='any', subset=['Message'])

In [129]:
# Sanity check: count the missing values still present in the Message column
message_cleaned = df['Message'].isna().sum()
print(f"Numer of NAN values in Message columns is {message_cleaned}")

Numer of NAN values in Message columns is 0


In [130]:
# Report the (rows, columns) shape of the frame at this point
remaining_shape = df.shape
print(f"Data shape after droping the null values in the message column is {remaining_shape}")

Data shape after droping the null values in the message column is (4157, 6)


In [131]:
# Preview the first five rows of the current frame
df.head(n=5)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,ምርጥ ዕቃ,@MerttEka,6805,📣 Car Aromatherapy Solar Vortex\n\n📎 ይሄንን👉 t.m...,2024-09-26 16:38:22+00:00,
3,ምርጥ ዕቃ,@MerttEka,6802,📣 Car Aromatherapy Solar Vortex\n\n✔️ የመኪና መዓዛ...,2024-09-26 16:16:51+00:00,photos\@MerttEka_6802.jpg
4,ምርጥ ዕቃ,@MerttEka,6801,🔠🔠🔠🔠 🔠🔠🔠🔠🔠\n📣 GW HAIR DRYER/Blower\n\n✔️ የፀጉር ...,2024-09-26 12:31:45+00:00,photos\@MerttEka_6801.jpg
5,ምርጥ ዕቃ,@MerttEka,6800,📣 2 in 1 Porcelain Dessert Bowel\n\n✔️ የሰላጣ እ...,2024-09-26 09:22:02+00:00,photos\@MerttEka_6800.jpg
10,ምርጥ ዕቃ,@MerttEka,6795,📣 Plastic And Metal Cubic Cloth Cabinet\n\n✔️ ...,2024-09-26 08:31:44+00:00,photos\@MerttEka_6795.jpg


Preprocess text data by tokenizing, normalizing, and handling Amharic-specific linguistic features.

In [132]:
# Clean the message text in two passes using the project helpers:
# first p.remove_emojis, then p.remove_english_words on that result.
emoji_free = df['Message'].apply(p.remove_emojis)
df['Message'] = emoji_free.apply(p.remove_english_words)

# Show the first rows of the frame with the cleaned Message column
print(df.head())

   Channel Title Channel Username    ID  \
0         ምርጥ ዕቃ        @MerttEka  6805   
3         ምርጥ ዕቃ        @MerttEka  6802   
4         ምርጥ ዕቃ        @MerttEka  6801   
5         ምርጥ ዕቃ        @MerttEka  6800   
10        ምርጥ ዕቃ        @MerttEka  6795   

                                              Message  \
0                 \n\n ይሄንን ./  ተጭነው   ያድርጉ፣ ቤተሰብ ይሁኑ   
3       \n\n የመኪና መዓዛ \n በፀሃይ ብርሃን ስለሚሰራ ቻርጅ ማድረግ ...   
4    \n   /\n\n የፀጉር ማድረቂያ ፎን\n6000watt በጣም ፈጣን\nማ...   
5    2  1    \n\n የሰላጣ እና የቺፕስ ማቅረቢያ\n\n 2400 ብር\n...   
10        \n\n ዘመናዊ የልብስ ቁም ሳጥን\n በቀላሉ የሚገጣጠም የሚነቃ...   

                         Date                 Media Path  
0   2024-09-26 16:38:22+00:00                        NaN  
3   2024-09-26 16:16:51+00:00  photos\@MerttEka_6802.jpg  
4   2024-09-26 12:31:45+00:00  photos\@MerttEka_6801.jpg  
5   2024-09-26 09:22:02+00:00  photos\@MerttEka_6800.jpg  
10  2024-09-26 08:31:44+00:00  photos\@MerttEka_6795.jpg  


In [133]:
# Persist the cleaned frame to disk.
# NOTE(review): index=True is the pandas default, so the row index is written
# as an unnamed first column; consider index=False if no downstream reader
# relies on that column — confirm before changing.
df.to_csv(path_or_buf='../data/cleaned_data.csv', index=True)

In [134]:
# Display the cleaned DataFrame (the notebook renders a truncated preview)
df

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
0,ምርጥ ዕቃ,@MerttEka,6805,\n\n ይሄንን ./ ተጭነው ያድርጉ፣ ቤተሰብ ይሁኑ,2024-09-26 16:38:22+00:00,
3,ምርጥ ዕቃ,@MerttEka,6802,\n\n የመኪና መዓዛ \n በፀሃይ ብርሃን ስለሚሰራ ቻርጅ ማድረግ ...,2024-09-26 16:16:51+00:00,photos\@MerttEka_6802.jpg
4,ምርጥ ዕቃ,@MerttEka,6801,\n /\n\n የፀጉር ማድረቂያ ፎን\n6000watt በጣም ፈጣን\nማ...,2024-09-26 12:31:45+00:00,photos\@MerttEka_6801.jpg
5,ምርጥ ዕቃ,@MerttEka,6800,2 1 \n\n የሰላጣ እና የቺፕስ ማቅረቢያ\n\n 2400 ብር\n...,2024-09-26 09:22:02+00:00,photos\@MerttEka_6800.jpg
10,ምርጥ ዕቃ,@MerttEka,6795,\n\n ዘመናዊ የልብስ ቁም ሳጥን\n በቀላሉ የሚገጣጠም የሚነቃ...,2024-09-26 08:31:44+00:00,photos\@MerttEka_6795.jpg
...,...,...,...,...,...,...
5685,ምርጥ ዕቃ,@MerttEka,44,2500,2019-11-13 10:01:12+00:00,
5686,ምርጥ ዕቃ,@MerttEka,43,ዋጋ 2500\n0983063957,2019-11-13 10:00:40+00:00,
5690,ምርጥ ዕቃ,@MerttEka,32,የሙያ ባለቤት መሆን መሠልጠን ነው።\nቀለም ቀቢ ሳያስፈልግዎ ቤትዎን ወይ...,2019-11-09 04:44:54+00:00,
5692,ምርጥ ዕቃ,@MerttEka,30,ቤትና ግቢዎን እንዲሁም የብረት እና የእንጨት ቁሳቁስዎን ቀለም ቀቢ ሳያስ...,2019-10-31 07:06:25+00:00,


In [135]:
# Peek at the first five entries of the Message column
df['Message'].head(n=5)

0                   \n\n ይሄንን ./  ተጭነው   ያድርጉ፣ ቤተሰብ ይሁኑ
3         \n\n የመኪና መዓዛ \n በፀሃይ ብርሃን ስለሚሰራ ቻርጅ ማድረግ ...
4      \n   /\n\n የፀጉር ማድረቂያ ፎን\n6000watt በጣም ፈጣን\nማ...
5      2  1    \n\n የሰላጣ እና የቺፕስ ማቅረቢያ\n\n 2400 ብር\n...
10          \n\n ዘመናዊ የልብስ ቁም ሳጥን\n በቀላሉ የሚገጣጠም የሚነቃ...
Name: Message, dtype: object