In [1]:
import os
import pandas as pd
import re
# Directories


In [2]:

# --- Configuration ---
# IMPORTANT: This path MUST match the DATA_FOLDER used in your scraping script.
# It should point to the directory where your individual channel CSV files are saved.
# The default below is machine-specific; to run the notebook elsewhere without
# editing it, set the AMHARIC_ECOM_DATA_FOLDER environment variable before
# starting the kernel.
DATA_FOLDER = os.environ.get(
    'AMHARIC_ECOM_DATA_FOLDER',
    r'D:\kaimtenx\project\week4\Amharic-Ecommerce-Data-Extractor\data',
)


In [3]:
# --- Main Merging Logic ---

def merge_scraped_data(data_folder, output_filename='cleaned_data.csv'):
    """
    Merge every per-channel ``*_data.csv`` file in ``data_folder`` into one CSV.

    The individual files are concatenated, lightly preprocessed and written to
    ``<data_folder>/<output_filename>``. Preprocessing steps:

    * ``date`` is parsed to datetime (unparseable values become NaT).
    * ``sender_id`` is dropped when it is null in every merged row.
    * ``tokens`` is created as a whitespace-split list of each ``message``.
    * ``channel`` is cast to ``str``.

    Steps whose source column is absent are skipped with a printed note
    instead of raising ``KeyError``.

    Args:
        data_folder: Directory containing the per-channel CSV files.
        output_filename: Bare file name (no directory part) of the merged
            output. A file with this name inside ``data_folder`` is never
            used as an input.

    Returns:
        None. Progress and problems are reported with ``print``; the result
        is the CSV written to disk.
    """
    print(f"Starting data merging process from: {data_folder}")

    if not os.path.isdir(data_folder):
        print(f"Error: Data folder '{data_folder}' does not exist. Please ensure your scraping script ran successfully.")
        return

    # normcase makes the comparison case-insensitive on Windows only, matching
    # how the filesystem itself treats names.
    output_key = os.path.normcase(output_filename)
    all_scraped_dataframes = []

    # Sort case-insensitively so the row order of the merged file does not
    # depend on the order the filesystem happens to list entries in.
    for filename in sorted(os.listdir(data_folder), key=str.casefold):
        # Only individual channel data files are of interest.
        if not filename.endswith('_data.csv'):
            continue
        # The merged output ('cleaned_data.csv') also ends with '_data.csv'.
        # Without this guard every re-run would load the previous merge
        # result as if it were a channel file and duplicate all rows.
        if os.path.normcase(filename) == output_key:
            continue

        file_path = os.path.join(data_folder, filename)
        try:
            df = pd.read_csv(file_path, encoding='utf-8')
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole merge.
            print(f"Warning: Could not load '{filename}'. Error: {e}")
            continue
        all_scraped_dataframes.append(df)
        print(f"Loaded '{filename}' ({len(df)} rows)")

    if not all_scraped_dataframes:
        print("\nNo individual channel CSV files were found to merge. Please run the scraping script first.")
        return

    print("\n--- Concatenating all loaded dataframes ---")
    merged_df = pd.concat(all_scraped_dataframes, ignore_index=True)
    print(f"Initial merged dataframe size: {len(merged_df)} rows")

    # --- Apply Basic Preprocessing (similar to original script) ---
    print("Applying basic post-merge preprocessing...")

    # errors='coerce' turns unparseable dates into NaT instead of raising.
    if 'date' in merged_df.columns:
        merged_df['date'] = pd.to_datetime(merged_df['date'], errors='coerce')
        print("Converted 'date' column to datetime.")
    else:
        print("Note: no 'date' column found; skipping datetime conversion.")

    # Drop 'sender_id' only when it carries no information at all.
    if 'sender_id' in merged_df.columns:
        missing_sender_ids = merged_df['sender_id'].isnull().sum()
        if missing_sender_ids == len(merged_df):
            merged_df = merged_df.drop(columns=['sender_id'])
            print("Dropped 'sender_id' column as it was entirely empty in all merged data.")
        elif missing_sender_ids > 0:
            print(f"Note: 'sender_id' column has {missing_sender_ids} missing values but not all. Keeping column.")

    # Basic tokenization for the 'message' column.
    # str.split() with no arguments splits on any whitespace run (including
    # newlines) and never yields empty tokens. For robust Amharic NLP,
    # consider specialized libraries.
    if 'message' in merged_df.columns:
        merged_df['tokens'] = merged_df['message'].apply(
            lambda x: str(x).split() if pd.notna(x) else []
        )
        print("Basic tokenization applied to 'message' column, creating 'tokens'.")
    else:
        print("Note: no 'message' column found; skipping tokenization.")

    # Ensure 'channel' is string type (if it somehow became numeric from IDs).
    if 'channel' in merged_df.columns:
        merged_df['channel'] = merged_df['channel'].astype(str)

    merged_file_path = os.path.join(data_folder, output_filename)
    merged_df.to_csv(merged_file_path, index=False, encoding='utf-8')
    print(f"\nSuccessfully merged all data and saved to '{merged_file_path}'")
    print(f"Total rows in final merged file: {len(merged_df)}")

# Entry point: run the merge over the configured DATA_FOLDER. Inside a notebook
# kernel __name__ is '__main__', so this executes whenever the cell is run.
if __name__ == '__main__':
    merge_scraped_data(DATA_FOLDER)

Starting data merging process from: D:\kaimtenx\project\week4\Amharic-Ecommerce-Data-Extractor\data
Loaded 'Fashiontera_data.csv' (1975 rows)
Loaded 'meneshayeofficial_data.csv' (876 rows)
Loaded 'shageronlinestore_data.csv' (4061 rows)
Loaded 'Shewabrand_data.csv' (2702 rows)
Loaded 'ZemenExpress_data.csv' (4839 rows)

--- Concatenating all loaded dataframes ---
Initial merged dataframe size: 14453 rows
Applying basic post-merge preprocessing...
Converted 'date' column to datetime.
Basic tokenization applied to 'message' column, creating 'tokens'.

Successfully merged all data and saved to 'D:\kaimtenx\project\week4\Amharic-Ecommerce-Data-Extractor\data\cleaned_data.csv'
Total rows in final merged file: 14453


In [None]:
# Load cleaned data
