In [5]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:

# Load the dataset
data = pd.read_excel(r"D:\datasets\ArzEn-MultiGenre-version-1.xlsx")


In [6]:
def handle_whitespace_and_empty_strings(data):
    # Handle whitespace and empty strings
    data = data.apply(lambda x: x.strip() if isinstance(x, str) else x)
    print("Whitespace & Empty Strings Handled.")

In [7]:
def identify_missing_values(data):
    # Identify missing values
    missing_values = data.isnull().sum()
    print("Missing Values:")
    print(missing_values)
    
    # Check percentage of missing values
    missing_percentage = (missing_values / len(data)) * 100
    
    # Decide strategy for handling missing data
    # Example: Delete rows with a high percentage of missing values
    high_missing_percentage = missing_percentage[missing_percentage > 30]
    if not high_missing_percentage.empty:
        print("Deleting rows with high missing value percentage...")
        data.dropna(subset=high_missing_percentage.index, inplace=True)


In [8]:
def remove_duplicates(data):
    # Remove duplicates
    data.drop_duplicates(inplace=True)
    print("Duplicates Removed.")


In [9]:
def remove_repeated_characters(data):
    # Remove repeated characters
    data = data.apply(lambda x: re.sub(r'(.)\1+', r'\1', x) if isinstance(x, str) else x)
    print("Repeated Characters Removed.")

In [13]:
def remove_meaningless_words(data):
    # Remove stop words using NLTK
    stop_words = set(stopwords.words('english'))
    data['english_Text'] = data['english_Text'].apply(lambda x: ' '.join([word for word in word_tokenize(str(x)) if word.lower() not in stop_words]))
    print("Meaningless Words Removed.")


In [11]:
def handle_multilingual_sentences(data):
    # Handle sentences containing both languages
    # Example: Split sentences into separate columns for each language
    # Assuming the data is stored in a DataFrame with separate columns for English and Arabic
    english_regex = r'[a-zA-Z]'
    arabic_regex = r'[\u0600-\u06FF]'
    
    def split_multilingual(row):
        if re.search(english_regex, str(row['english_Text'])) and re.search(arabic_regex, str(row['egyption_Text'])):
            return pd.Series({'english_Text': re.sub(arabic_regex, '', str(row['english_Text'])), 
                              'egyption_Text': re.sub(english_regex, '', str(row['egyption_Text']))})
        else:
            return row
    
    data = data.apply(split_multilingual, axis=1)
    print("Multilingual Sentences Handled.")

In [14]:
# Task execution
identify_missing_values(data)
remove_duplicates(data)
handle_whitespace_and_empty_strings(data)
remove_repeated_characters(data)
remove_meaningless_words(data)
handle_multilingual_sentences(data)

Missing Values:
egyption_Text     84
english_Text     371
category           0
sub_category       0
dtype: int64
Duplicates Removed.
Whitespace & Empty Strings Handled.
Repeated Characters Removed.
Meaningless Words Removed.
Multilingual Sentences Handled.


In [15]:
print(data.columns)

Index(['egyption_Text', 'english_Text', 'category', 'sub_category'], dtype='object')


In [16]:
print(data.describe())

       egyption_Text english_Text   category  sub_category
count          24120        24204      24204         24204
unique         16674        19847          3           307
top             إيه؟          nan  Subtitles  the-stranger
freq              31          371      15924          1953


In [17]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24204 entries, 0 to 26073
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   egyption_Text  24120 non-null  object
 1   english_Text   24204 non-null  object
 2   category       24204 non-null  object
 3   sub_category   24204 non-null  object
dtypes: object(4)
memory usage: 945.5+ KB
None


In [18]:
# Output cleaned data to a new Excel file
data.to_excel("cleaned_data(Arzen)by MA2.xlsx", index=False)
print("\ncleaned_data(Arzen)by MA2.xlsx'.")


cleaned_data(Arzen)by MA2.xlsx'.


In [2]:
# Load the dataset
import pandas as pd
file_path = r"D:\datasets\cleaned_data(Arzen)by MA22.xlsx"
data = pd.read_excel(file_path)

In [3]:
# Display information about missing values before dropping
print("Missing Values (Before):")
print(data.isnull().sum())

Missing Values (Before):
egyption_Text     18
english_Text     111
category           0
sub_category       0
dtype: int64


In [4]:

# Drop rows containing NaN values
data.dropna(inplace=True)

In [5]:
# Display information about missing values after dropping
print("\nMissing Values (After):")
print(data.isnull().sum())



Missing Values (After):
egyption_Text    0
english_Text     0
category         0
sub_category     0
dtype: int64


In [6]:
# Save the preprocessed dataset to a new file
preprocessed_file_path = "preprocessed_data(Smsm).xlsx"
data.to_excel(preprocessed_file_path, index=False)
print("\nPreprocessed data saved to:", preprocessed_file_path)


Preprocessed data saved to: preprocessed_data(Smsm).xlsx
