# Data Cleaning and Transformation

Import Necessarry libraries

In [34]:
# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns 
from wordcloud import WordCloud

# Text Processing 
import re 
import string

In [35]:
# Find main working directories
import sys
import os
sys.path.append(os.path.join(os.path.abspath('..')))

In [36]:
import scripts.data_preprocessing as dp

### Data Loading

In [37]:
df = pd.read_csv('../data/raw/lobelia4cosmetics_data.csv')

In [38]:
df.sample()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path
1503,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,10883,Wellbaby multivitamin drop \r\nPrice 3000 birr...,2024-08-07 17:01:00+00:00,photos\@lobelia4cosmetics_10883.jpg


#### 1. Data Cleaning

#### Explore the Data

In [10]:
# Structure of the data
df.shape

(2332, 6)

In [11]:
# Data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2332 entries, 0 to 2331
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel Title     2332 non-null   object
 1   Channel Username  2332 non-null   object
 2   ID                2332 non-null   int64 
 3   Message           2243 non-null   object
 4   Date              2332 non-null   object
 5   Media Path        2330 non-null   object
dtypes: int64(1), object(5)
memory usage: 109.4+ KB


##### 1.1 Removing Duplicates

In [12]:
# Check Duplicated
df.duplicated().sum()

np.int64(0)

##### 1.2 Handling Missing Values
We can use the pandas library to check for missing values in the data.

In [13]:
# Check for missing values
print(df.isnull().sum())

Channel Title        0
Channel Username     0
ID                   0
Message             89
Date                 0
Media Path           2
dtype: int64


In [14]:
# Drop Missing values
df.dropna(inplace=True)

##### 1.3 Standardizing Formats
We can use the pandas library to standardize the formats of the data. For example, we can convert the date column to a standard date format.

In [15]:
# Convert the date column to a standard date format
df['Date'] = pd.to_datetime(df['Date'])

##### 1.4 Data Validation
We can use the pandas library to validate the data. For example, we can check if the ID column contains only unique values.

In [16]:
# Check if the ID column contains only unique values
print(df['ID'].nunique() == len(df))

True


##### 2. Data Transformation

In [17]:
# Lowecase the message column
df['Message'] = df['Message'].apply(lambda x: x.lower())

##### 2.1 Extracting Product Name:
 Use this concept when you need to extract the name of a product from a given message. This can be useful when you need to identify the product being sold or promoted.
##### 2.2 Extracting Weight: 
Use this concept when you need to extract the weight of a product from a given message. This can be useful when you need to calculate shipping costs or determine the quantity of a product.

In [18]:
# Apply the function and create new columns
df[['Product_Name', 'Weight']] = df['Message'].apply(lambda x: pd.Series(dp.extract_product_name(x)))

In [19]:
df.sample(3)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Product_Name,Weight
164,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,12257,nido usa 2.2kg\r\nprice 5900 birr \r\ntelegram...,2024-10-06 05:22:55+00:00,photos\@lobelia4cosmetics_12257.jpg,nido usa,2.2kg
930,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,11469,enfamil neuro 565gm \r\nprice 5500 birr \r\nte...,2024-08-26 06:41:31+00:00,photos\@lobelia4cosmetics_11469.jpg,enfamil neuro m,565g
892,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,11509,pregnacare breastfeeding 84 tablets \r\nprice ...,2024-08-28 09:49:35+00:00,photos\@lobelia4cosmetics_11509.jpg,pregnacare breastfeeding 84 tablets,


##### 2.3 Extracting Price: 
Use this concept when you need to extract the price of a product from a given message. This can be useful when you need to calculate the total cost of a purchase or compare prices between different sellers.

In [20]:
# Apply the extraction function to the 'Message' column
df['Price'] = df['Message'].apply(dp.extract_price)

In [21]:
df.sample(3)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Product_Name,Weight,Price
426,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,11981,neutrogena rapid tone repair \r\nprice 3000 bi...,2024-09-20 13:48:49+00:00,photos\@lobelia4cosmetics_11981.jpg,neutrogena rapid tone repair,,3000
330,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,12079,day and night quil /super c\r\nprice 2500 birr...,2024-09-25 06:13:13+00:00,photos\@lobelia4cosmetics_12079.jpg,day and night quil /super c,,2500
1793,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,10586,lindor 900 gm\r\nprice 5000 birr \r\ntelegram ...,2024-07-28 07:38:09+00:00,photos\@lobelia4cosmetics_10586.jpg,lindor 900 gm,,5000


##### 2.4 Telegram Address: 
We can use regular expressions to extract the Telegram address from the message column. For example, we can look for strings that start with "https://t.me/".

In [22]:
# Apply the extraction function to the 'Message' column
df['Telegram Address'] = df['Message'].apply(lambda x: pd.Series(dp.extract_telegram_address(x)))


In [23]:
df.sample()

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Product_Name,Weight,Price,Telegram Address
883,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,11518,thylanol 225 caplets\r\nprice 4000 birr \r\nte...,2024-08-28 09:49:36+00:00,photos\@lobelia4cosmetics_11518.jpg,thylanol 225 caplets,,4000,https://t.me/lobelia4cosmetics


##### 2.5 Extracting Address: 
Use this concept when you need to extract an address from a given message. This can be useful when you need to ship a product or provide directions to a physical location.

In [24]:
# Apply the extraction function to the 'Message' column
df['Address'] = df['Message'].apply(lambda x: pd.Series(dp.extract_address(x)))

In [25]:
df.sample(3)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Product_Name,Weight,Price,Telegram Address,Address
242,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,12173,day and night quil /super c\r\nprice 2500 birr...,2024-10-01 06:11:49+00:00,photos\@lobelia4cosmetics_12173.jpg,day and night quil /super c,,2500,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school
812,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,11589,mini drops immune plus \r\nprice 2800 birr \r\...,2024-08-31 06:28:00+00:00,photos\@lobelia4cosmetics_11589.jpg,mini drops immune plus,,2800,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school
948,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,11451,eucerin pigment control \r\nprice 4500 birr \r...,2024-08-26 06:24:44+00:00,photos\@lobelia4cosmetics_11451.jpg,eucerin pigment control,,4500,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school


#### 2.6 Extracting Phone Number: 
Use this concept when you need to extract a phone number from a given message. This can be useful when you need to contact a customer or seller.

In [26]:
# Apply the extraction function to the 'Message' column
df['Phone Number'] = df['Message'].apply(dp.extract_phone_number)

In [27]:
df.sample(3)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Product_Name,Weight,Price,Telegram Address,Address,Phone Number
1423,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,10965,chia seeds \r\nprice 3000 birr \r\ntelegram ht...,2024-08-09 14:12:45+00:00,photos\@lobelia4cosmetics_10965.jpg,chia seeds,,3000,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school,911562031
790,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,11611,glycolic acid \r\nprice 4000 birr \r\ntelegram...,2024-09-02 09:47:24+00:00,photos\@lobelia4cosmetics_11611.jpg,glycolic acid,,4000,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school,911562031
2154,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,10218,chocolate \r\nprice 300 birr each \r\ntelegram...,2024-07-18 13:34:41+00:00,photos\@lobelia4cosmetics_10218.jpg,chocolate,,300,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school,911562031


##### 2.7 Extracting Open Day and Time: 
Use this concept when you need to extract the days and hours of operation for a business. This can be useful when you need to plan a visit or contact a business during their operating hours.

In [28]:
# Apply the extraction function to the 'Message' column
df['Open_Day_and_Time'] = df['Message'].apply(dp.extract_open_day_and_time)

In [29]:
# Set display options
pd.set_option('display.max_rows', 3)
pd.set_option('display.max_columns', None)

# Sample 5 rows from the DataFrame
df_sample = df.sample(5)
df_sample

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Product_Name,Weight,Price,Telegram Address,Address,Phone Number,Open_Day_and_Time
192,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,12225,suave kids wash \r\nprice 3200 birr \r\ntelegr...,2024-10-04 13:16:12+00:00,photos\@lobelia4cosmetics_12225.jpg,suave kids wash,,3200,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school,0911562031,open monday - monday from 8am until midnight ከ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2055,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,10319,nido usa 2.2kg\r\nprice 5900 birr \r\ntelegram...,2024-07-20 13:36:07+00:00,photos\@lobelia4cosmetics_10319.jpg,nido usa,2.2kg,5900,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school,0911562031,open monday - monday from 8am until midnight ከ...


##### 2.8 Extracting Delivery Fee: 
Use this concept when you need to extract the delivery fee for a product from a given message. This can be useful when you need to calculate the total cost of a purchase or compare delivery fees between different sellers.

In [30]:
# Apply the extraction function to the 'Message' column
df['Delivery Fee'] = df['Message'].apply(lambda x: pd.Series(dp.extract_delivery_fee(x)))

In [31]:
df.sample(3)

Unnamed: 0,Channel Title,Channel Username,ID,Message,Date,Media Path,Product_Name,Weight,Price,Telegram Address,Address,Phone Number,Open_Day_and_Time,Delivery Fee
1424,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,10964,roc deep wrinkle \r\nprice 6000 birr \r\nteleg...,2024-08-09 14:12:45+00:00,photos\@lobelia4cosmetics_10964.jpg,roc deep wrinkle,,6000,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school,911562031,open monday - monday from 8am until midnight ከ...,70 - 200 birr
1017,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,11380,chia seeds \r\nprice 3000 birr \r\ntelegram ht...,2024-08-24 12:40:24+00:00,photos\@lobelia4cosmetics_11380.jpg,chia seeds,,3000,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school,911562031,open monday - monday from 8am until midnight ከ...,70 - 200 birr
1496,Lobelia pharmacy and cosmetics,@lobelia4cosmetics,10891,pregnacare breastfeeding 84 tablets \r\nprice ...,2024-08-08 12:32:37+00:00,photos\@lobelia4cosmetics_10891.jpg,pregnacare breastfeeding 84 tablets,,5000,https://t.me/lobelia4cosmetics,adress:- infront of bole medhanialem high school,911562031,open monday - monday from 8am until midnight ከ...,70 - 200 birr


##### 2.9 Remove Emoji From Message Data 

In [32]:
df['Message'] = df['Message'].apply(dp.remove_emojis)

In [33]:
# Save the preprocessed data to csv file
df.to_csv('../data/preprocessed/preprocessed.csv', index=False)