In [272]:
# ================================================================
# CELL 1: Imports & Setup
# ================================================================

import pandas as pd
import random
import re
from pathlib import Path
from sklearn.model_selection import train_test_split

print("Libraries imported successfully.")


Libraries imported successfully.


In [274]:
# ================================================================
# CELL 2: SimpleSMSAugmenter Class
# ================================================================

class SimpleSMSAugmenter:
    def __init__(self):
        pass
    
    def sms_specific_augmentation(self, text, label):
        augmented_texts = [text]
        
        if label == 0:  # ham
            variations = [
                text.lower(),
                self.add_typos(text),
                self.add_emojis(text),
                self.shorten_words(text),
                self.add_sms_abbreviations(text)
            ]
        else:  # smish
            variations = [
                text,
                self.vary_urgency_level(text),
                self.change_offer_amounts(text),
                self.modify_urls_numbers(text)
            ]
        
        augmented_texts.extend([v for v in variations if v != text and len(v) > 10])
        return list(set(augmented_texts))
    
    def add_typos(self, text):
        common_typos = {
            'the': 'teh', 'you': 'u', 'your': 'ur', 'are': 'r',
            'see': 'c', 'why': 'y', 'please': 'pls', 'thanks': 'thx',
            'before': 'b4', 'great': 'gr8', 'late': 'l8', 'message': 'msg'
        }
        words = text.split()
        if random.random() < 0.3:
            for i, word in enumerate(words):
                if word.lower() in common_typos and random.random() < 0.4:
                    words[i] = common_typos[word.lower()]
        return ' '.join(words)
    
    def add_emojis(self, text):
        emojis = ['üòä', 'üòÇ', 'üëç', '‚ù§Ô∏è', 'üôè', 'üòç', 'üòé', 'ü§î', 'üëã', 'üéâ']
        if random.random() < 0.4:
            return text + ' ' + random.choice(emojis)
        return text
    
    def shorten_words(self, text):
        abbreviations = {
            'because': 'cuz', 'tomorrow': 'tmrw', 'today': '2day',
            'tonight': '2nite', 'for you': '4u', 'see you': 'cu',
            'by the way': 'btw', 'oh my god': 'omg', 'laughing out loud': 'lol',
            'be right back': 'brb', 'talk to you later': 'ttyl'
        }
        
        lower_text = text.lower()
        for full, short in abbreviations.items():
            if full in lower_text and random.random() < 0.3:
                text = re.sub(full, short, text, flags=re.IGNORECASE)
        return text
    
    def add_sms_abbreviations(self, text):
        abbreviations = ['lol', 'brb', 'omg', 'tbh', 'idk', 'smh', 'imo', 'btw']
        if random.random() < 0.3:
            return text + ' ' + random.choice(abbreviations)
        return text
    
    def vary_urgency_level(self, text):
        urgency_phrases = [
            'URGENT!', 'IMPORTANT!', 'ACT NOW!', 'LAST CHANCE!',
            'FINAL NOTICE!', 'TIME SENSITIVE!', "DON'T MISS OUT!"
        ]
        if random.random() < 0.4:
            return random.choice(urgency_phrases) + ' ' + text
        return text
    
    def change_offer_amounts(self, text):
        text = re.sub(r'\$\d+', f'${random.randint(10, 1000)}', text)
        text = re.sub(r'\d+\s*pounds', f'{random.randint(50, 5000)} pounds', text)
        text = re.sub(r'\d+\s*OMR', f'{random.randint(1, 100)} OMR', text)
        return text
    
    def modify_urls_numbers(self, text):
        text = re.sub(r'\d{10,12}', ''.join([str(random.randint(0, 9)) for _ in range(10)]), text)
        text = re.sub(r'http://\S+|https://\S+', 'http://example.com', text)
        text = re.sub(r'www\.\S+', 'www.example.com', text)
        return text

    def augment_dataset(self, df, target_ham_count=6000, target_smish_count=3000):
        """
        df must have columns: 'text' and numeric 'label' (0=ham, 1=smish).
        If target_count < current_count, that class is not upsampled.
        """
        augmented_data = []
        
        ham_df = df[df['label'] == 0]
        smish_df = df[df['label'] == 1]
        
        print(f"Original - Ham: {len(ham_df)}, Smish: {len(smish_df)}")
        
        augmented_ham = self.augment_class(ham_df, target_ham_count, 'ham')
        augmented_smish = self.augment_class(smish_df, target_smish_count, 'smish')
        
        final_df = pd.DataFrame(augmented_ham + augmented_smish)
        print(f"Augmented - Ham: {len(augmented_ham)}, Smish: {len(augmented_smish)}")
        
        return final_df
    
    def augment_class(self, class_df, target_count, label_name):
        augmented_data = []
        label = 0 if label_name == 'ham' else 1
        
        # keep original examples
        for _, row in class_df.iterrows():
            augmented_data.append({'text': row['text'], 'label': label})
        
        needed = target_count - len(class_df)
        
        if needed > 0:
            print(f"Augmenting {label_name} class: generating {needed} additional samples")
            
            for _ in range(needed):
                base_sample = class_df.sample(1).iloc[0]
                base_text = base_sample['text']
                augmented_texts = self.sms_specific_augmentation(base_text, label)
                augmented_text = random.choice(augmented_texts)
                
                if augmented_text != base_text and len(augmented_text) > 10:
                    augmented_data.append({'text': augmented_text, 'label': label})
        
        return augmented_data


In [276]:
# ================================================================
# CELL 3: Generate Business Messages
# ================================================================

def generate_business_messages(num_messages=1500):
    business_templates = [
        "Your {service} subscription for {amount} will renew on {date}. Manage in app.",
        "Reminder: Your {service} plan expires on {date}. Update payment method.",
        "Your {service} billing cycle ends {date}. Next charge: {amount}.",
        "Your {bank} statement for {month} is ready. View online.",
        "Payment of {amount} processed. Available balance: {balance}.",
        "Security alert: New login to your {bank} account.",
        "Dear students, {assignment} deadline extended to {date}.",
        "Academic notice: {course} grades posted on student portal.",
        "University update: {event} scheduled for {date}.",
        "Order #{order_id} shipped! Track your delivery.",
        "Delivery update: Package arriving today {time_range}.",
        "Your return for order #{order_id} processed. Refund: {amount}.",
        "Appointment reminder: {appointment_type} on {date} at {time}.",
        "Prescription ready at {pharmacy}. Ref #: {ref_number}.",
        "Test results for {test_name} available in patient portal."
    ]
    
    services = ['Netflix', 'Spotify', 'Amazon Prime', 'YouTube Premium']
    banks = ['Chase', 'Bank of America', 'Wells Fargo', 'Citibank']
    amounts = ['$9.99', '$14.99', '$19.99', '$29.99']
    dates = ['Nov 25, 2024', 'Dec 1, 2024', 'Jan 15, 2025']
    
    generated = []
    
    for _ in range(num_messages):
        template = random.choice(business_templates)
        msg = template.format(
            service=random.choice(services),
            amount=random.choice(amounts),
            date=random.choice(dates),
            bank=random.choice(banks),
            month=random.choice(['October', 'November', 'December']),
            balance=random.choice(['$1,234.56', '$5,678.90']),
            assignment=random.choice(['midterm project', 'final paper']),
            course=random.choice(['Computer Science', 'Mathematics']),
            event=random.choice(['career fair', 'guest lecture']),
            order_id=f"{random.choice(['A','B','C'])}{random.randint(100000,999999)}",
            time_range=random.choice(['2-4 PM', '10 AM-12 PM']),
            appointment_type=random.choice(['dental cleaning', 'physical']),
            time=random.choice(['10:00 AM', '2:30 PM']),
            pharmacy=random.choice(['CVS', 'Walgreens']),
            ref_number=f"RX{random.randint(100000,999999)}",
            test_name=random.choice(['blood work', 'urinalysis'])
        )
        generated.append({'text': msg, 'label': 0})
    
    return pd.DataFrame(generated)

In [278]:
# ================================================================
# CELL 4: Load Cleaned Combined Dataset
# ================================================================

def load_cleaned_combined(path="combined_cleaned_original_datasets.csv"):
    """
    Load the pre-cleaned combined dataset.
    Ensures we end up with columns: 'text' and numeric 'label' (0,1).
    """
    p = Path(path)
    if not p.exists():
        print(f"‚ùå File not found: {p}")
        return None
    
    df = pd.read_csv(p)
    print(f"Loaded combined dataset from {p} with shape {df.shape}")
    print("Columns:", df.columns.tolist())
    
    # Prefer numeric column if present
    if "label_num" in df.columns:
        df["label"] = df["label_num"]
    
    # Keep only needed columns
    df = df[["text", "label"]].copy()
    
    # Safety: ensure labels are 0/1 ints
    df["label"] = df["label"].astype(int)
    
    print("\nLabel distribution in base cleaned dataset:")
    print(df["label"].value_counts().rename({0: "ham", 1: "smish"}))
    
    return df


In [280]:
# ================================================================
# CELL 5: Analyze URL Distribution in Dataset
# ================================================================

def analyze_urls_in_dataset(df):
    """Analyze URL patterns in your dataset (ham vs smish)."""
    
    url_pattern = re.compile(r'http://\S+|https://\S+|www\.\S+', re.IGNORECASE)
    
    ham_with_url = ham_without_url = 0
    smish_with_url = smish_without_url = 0
    
    ham_urls = []
    smish_urls = []
    
    for _, row in df.iterrows():
        urls = re.findall(url_pattern, row['text'])
        has_url = len(urls) > 0
        
        if row['label'] == 0:  # ham
            if has_url:
                ham_with_url += 1
                ham_urls.extend(urls)
            else:
                ham_without_url += 1
        else:  # smish
            if has_url:
                smish_with_url += 1
                smish_urls.extend(urls)
            else:
                smish_without_url += 1
    
    total_ham = ham_with_url + ham_without_url
    total_smish = smish_with_url + smish_without_url
    
    print("="*60)
    print("üìä URL DISTRIBUTION IN DATASET")
    print("="*60)
    if total_ham > 0:
        print(f"HAM with URL:      {ham_with_url:4d} ({ham_with_url/total_ham*100:.1f}%)")
        print(f"HAM without URL:   {ham_without_url:4d} ({ham_without_url/total_ham*100:.1f}%)")
    if total_smish > 0:
        print(f"SMISH with URL:    {smish_with_url:4d} ({smish_with_url/total_smish*100:.1f}%)")
        print(f"SMISH without URL: {smish_without_url:4d} ({smish_without_url/total_smish*100:.1f}%)")
    print("="*60)
    
    print("\nüìé Sample HAM URLs (first 5):")
    for url in list(set(ham_urls))[:5]:
        print(f"  - {url}")
    
    print("\nüö® Sample SMISHING URLs (first 5):")
    for url in list(set(smish_urls))[:5]:
        print(f"  - {url}")
    
    smish_url_percentage = smish_with_url / total_smish if total_smish > 0 else 0.0

    '''
    print("\n" + "="*60)
    if smish_url_percentage > 0.5:
        print("‚úÖ RECOMMENDATION: Dataset has plenty of smishing URLs (>50%)")
        print("üìå Focus on adding HAM-with-URL examples only")
    else:
        print("‚ö†Ô∏è RECOMMENDATION: Dataset has few smishing URLs (<50%)")
        print("üìå Consider adding both HAM-with-URL and SMISHING-with-URL")
    print("="*60)
    '''
    
    return {
        'ham_with_url': ham_with_url,
        'ham_without_url': ham_without_url,
        'smish_with_url': smish_with_url,
        'smish_without_url': smish_without_url,
        'smish_url_percentage': smish_url_percentage
    }


In [282]:
# ================================================================
# CELL 6: URL-removal augmentation for smish
# ================================================================

def apply_smish_url_removal(df):
    """
    For smish messages that contain URLs, create extra versions
    where the URL is removed, to get smish-without-URL examples.
    """
    url_pattern = re.compile(r'http://\S+|https://\S+|www\.\S+', re.IGNORECASE)
    
    new_rows = []
    count_source = 0
    
    for _, row in df.iterrows():
        if row["label"] != 1:
            continue
        
        text = row["text"]
        if re.search(url_pattern, text):
            count_source += 1
            no_url_text = re.sub(url_pattern, '', text).strip()
            no_url_text = re.sub(r'\s+', ' ', no_url_text)
            if len(no_url_text) > 10 and no_url_text != text:
                new_rows.append({"text": no_url_text, "label": 1})
    
    print(f"Created {len(new_rows)} extra SMISH messages with URLs removed "
          f"(from {count_source} original smish-with-URL messages).")
    
    if new_rows:
        df_out = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    else:
        df_out = df.copy()
    
    print("Shape after smish URL-removal augmentation:", df_out.shape)
    print(df_out["label"].value_counts().rename({0: "ham", 1: "smish"}))
    
    return df_out


In [284]:
# ================================================================
# CELL 7: Generate Balanced Examples (HAM-with-URL + SMISH-without-URL)
# ================================================================

def generate_balanced_examples(num_ham=50, num_smish=50, smish_url_percentage=0):
    """
    Generate balanced examples using templates.
    """
    examples = []
    
    # ----- HAM with real-looking URLs -----
    real_ham_domains = [
        'squ.edu.om',
        'omantel.om',
        'bankmuscat.com',
        'omanair.com',
        'amazon.com',
        'paypal.com',
        'google.com',
        'microsoft.com',
        'ups.com',
        'fedex.com',
        'dhl.com',
        'netflix.com',
        'spotify.com',
    ]
    
    ham_url_templates = [
        "Track your order: https://track.{domain}",
        "Your statement is ready: https://billing.{domain}",
        "Log in to your portal: https://portal.{domain}",
        "Verify your email: https://accounts.{domain}/verify",
        "View your invoice: https://secure.{domain}/invoice",
        "Manage your subscription: https://manage.{domain}",
        "Shipment update: https://tracking.{domain}",
        "Update your profile: https://profile.{domain}",
        "Your receipt is available: https://receipts.{domain}",
        "Check your schedule: https://calendar.{domain}",
    ]
    
    for _ in range(num_ham):
        domain = random.choice(real_ham_domains)
        template = random.choice(ham_url_templates)
        message = template.format(domain=domain)
        examples.append({'text': message, 'label': 0})
    
    # ----- SMISH without URL -----
    smish_templates = [
        "ALERT: Your {account} has been {threat}. {action} now to avoid {consequence}.",
        "We tried to deliver your {item} but failed. Reply with your {info} to reschedule.",
        "Your {service} has been pre-approved! Send your {sensitive_info} to finalize the process.",
        "FINAL WARNING: Your {service} will be {threat} unless you {action} now.",
        "You have an unpaid {payment}. Reply YES to see the amount and avoid {consequence}.",
        "Your account has been selected for a random {check}. Send your {info} to continue using the service.",
        "Congratulations, you are our lucky winner! Reply with your {info} to claim your {prize}.",
        "Due to suspicious activity, we need to verify your identity. Reply with your {sensitive_info}.",
        "Your {service} will be {threat} in 24 hours. Reply with your {info} to avoid disconnection.",
        "{urgency}: Your {account} requires immediate verification. {action} to prevent {consequence}.",
        "You've been selected for a {prize}. Text {keyword} to this number to collect your reward.",
    ]
    
    smish_accounts = ['bank account', 'credit card', 'PayPal account', 'account', 'mobile account']
    smish_threats = ['locked', 'suspended', 'blocked', 'frozen', 'deactivated', 'terminated']
    smish_actions = ['Call', 'Text', 'Reply', 'Confirm', 'Verify', 'Update']
    smish_consequences = ['closure', 'legal action', 'permanent suspension', 'account termination', 'service loss']
    smish_items = ['package', 'parcel', 'order', 'delivery', 'shipment']
    smish_info = ['full name and ID number', 'personal details', 'ID number', 'full name', 'date of birth']
    smish_services = ['loan', 'credit card', 'mobile number', 'SIM card', 'phone service']
    smish_sensitive_info = ['salary details', 'date of birth', 'national ID', 'bank details', 'PIN']
    smish_payments = ['fine', 'bill', 'invoice', 'charge', 'payment']
    smish_checks = ['security check', 'verification', 'audit', 'review']
    smish_prizes = ['prize', 'reward', 'gift card', 'cash prize', 'iPhone']
    smish_urgency = ['URGENT', 'ALERT', 'WARNING', 'IMMEDIATE ACTION REQUIRED', 'FINAL NOTICE']
    smish_keywords = ['CLAIM', 'WIN', 'YES', 'CONFIRM', 'VERIFY']
    
    for _ in range(num_smish):
        template = random.choice(smish_templates)
        message = template.format(
            account=random.choice(smish_accounts),
            threat=random.choice(smish_threats),
            action=random.choice(smish_actions),
            consequence=random.choice(smish_consequences),
            item=random.choice(smish_items),
            info=random.choice(smish_info),
            service=random.choice(smish_services),
            sensitive_info=random.choice(smish_sensitive_info),
            payment=random.choice(smish_payments),
            check=random.choice(smish_checks),
            prize=random.choice(smish_prizes),
            urgency=random.choice(smish_urgency),
            keyword=random.choice(smish_keywords)
        )
        examples.append({'text': message, 'label': 1})
    
    return pd.DataFrame(examples)


def add_balanced_examples(df: pd.DataFrame, url_stats: dict) -> pd.DataFrame:
    """
    Add balanced examples based on URL analysis.
    
    - Generate ham messages WITH URLs so that:
      ham_with_url_final ‚âà smish_with_url (from url_stats)
    - Also generate some extra smish-without-URL messages.
    """
    total_messages = len(df)
    
    # From analyze_urls_in_dataset()
    ham_with_url = url_stats['ham_with_url']
    smish_with_url = url_stats['smish_with_url']
    
    # üéØ Target: ham_with_url_final ‚âà smish_with_url
    target_ham_with_url = smish_with_url
    num_ham = max(0, target_ham_with_url - ham_with_url)
    
    # Still add some extra smish-without-URL (1‚Äì2% of dataset)
    num_smish = max(50, int(total_messages * 0.015))
    
    print(f"\n‚úÖ Generating {num_ham} ham-with-URL and {num_smish} smish-without-URL examples...")
    
    extra_df = generate_balanced_examples(
        num_ham=num_ham,
        num_smish=num_smish,
        smish_url_percentage=url_stats['smish_url_percentage']
    )
    
    print(f"‚úÖ Adding {len(extra_df)} balanced examples.")
    df_out = pd.concat([df, extra_df], ignore_index=True)
    print(f"üìä After adding balanced examples: {df_out.shape}")
    print(df_out['label'].value_counts().rename({0: 'ham', 1: 'smish'}))
    return df_out

In [290]:
# ================================================================
# CELL 8: Main Pipeline (UPDATED to use combined_cleaned_original_datasets.csv)
# ================================================================

def run_preprocessing():
    COMBINED_PATH = "combined_cleaned_original_datasets.csv"
    OUT_DIR = Path("prep_out")
    OUT_DIR.mkdir(exist_ok=True)
    
    # 1) Load base cleaned dataset
    print("\n" + "="*60)
    print("STEP 1: LOADING CLEANED COMBINED DATASET")
    print("="*60)
    df = load_cleaned_combined(COMBINED_PATH)
    if df is None:
        return
    print(f"\n‚úÖ Base cleaned dataset: {df.shape}")
    
    df.to_csv(OUT_DIR / "cleaned_base.csv", index=False)
    print(f"üíæ Saved base cleaned dataset to: {OUT_DIR / 'cleaned_base.csv'}")

    # --- Ratio for ORIGINAL CLEANED DATASET ---
    ham = (df['label'] == 0).sum()
    smish = (df['label'] == 1).sum()
    ratio = ham / smish
    
    print("\n--- BALANCE REPORT: ORIGINAL DATASET ---")
    print(f"HAM:   {ham}")
    print(f"SMISH: {smish}")
    print(f"Ratio (HAM : SMISH) = {ratio:.2f} : 1")
    print("Description: The original dataset is imbalanced, with ham messages more frequent than smishing messages.")

    
    # 2) Analyze URL distribution
    print("\n" + "="*60)
    print("STEP 2: ANALYZING URL DISTRIBUTION")
    print("="*60)
    url_stats = analyze_urls_in_dataset(df)
    
    # 3) Generate extra ham business messages
    print("\n" + "="*60)
    print("STEP 3: GENERATING BUSINESS MESSAGES")
    print("="*60)
    business_df = generate_business_messages(num_messages=1500)
    print(f"üì® Generated business ham messages: {business_df.shape}")
    
    df_base = pd.concat([df, business_df], ignore_index=True)
    df_base = df_base.drop_duplicates(subset=["text", "label"]).reset_index(drop=True)
    print(f"üîπ After adding business messages: {df_base.shape}")
    print(df_base["label"].value_counts().rename({0: "ham", 1: "smish"}))

    # --- Ratio AFTER BUSINESS MESSAGES ---
    ham = (df_base['label'] == 0).sum()
    smish = (df_base['label'] == 1).sum()
    ratio = ham / smish
    
    print("\n--- BALANCE REPORT: AFTER BUSINESS MESSAGES ---")
    print(f"HAM:   {ham}")
    print(f"SMISH: {smish}")
    print(f"Ratio (HAM : SMISH) = {ratio:.2f} : 1")
    print("Description: Business HAM messages added realistic ham-with-URL examples, increasing ham count and improving class diversity.")

    
    # 4) Apply SimpleSMSAugmenter
    print("\n" + "="*60)
    print("STEP 4: APPLYING SMS AUGMENTATION")
    print("="*60)
    augmenter = SimpleSMSAugmenter()
    augmented_df = augmenter.augment_dataset(
        df_base,
        target_ham_count=6000,   # ham already large, so mostly affects smish
        target_smish_count=3000
    )
    print(f"üîπ After SimpleSMSAugmenter: {augmented_df.shape}")
    print(augmented_df["label"].value_counts().rename({0: "ham", 1: "smish"}))

    # --- Ratio AFTER SMS AUGMENTER ---
    ham = (augmented_df['label'] == 0).sum()
    smish = (augmented_df['label'] == 1).sum()
    ratio = ham / smish
    
    print("\n--- BALANCE REPORT: AFTER SimpleSMSAugmenter ---")
    print(f"HAM:   {ham}")
    print(f"SMISH: {smish}")
    print(f"Ratio (HAM : SMISH) = {ratio:.2f} : 1")
    print("Description: SMSAugmenter increased both classes. Smish was upsampled more heavily to reduce imbalance.")

    
    # 5) URL-removal augmentation for smish
    print("\n" + "="*60)
    print("STEP 5: APPLYING URL REMOVAL FOR SMISHING")
    print("="*60)
    augmented_df = apply_smish_url_removal(augmented_df)

    # --- Ratio AFTER SMISH URL-REMOVAL AUGMENTATION ---
    ham = (augmented_df['label'] == 0).sum()
    smish = (augmented_df['label'] == 1).sum()
    ratio = ham / smish
    
    print("\n--- BALANCE REPORT: AFTER SMISH URL-REMOVAL ---")
    print(f"HAM:   {ham}")
    print(f"SMISH: {smish}")
    print(f"Ratio (HAM : SMISH) = {ratio:.2f} : 1")
    print("Description: URL-removal generated additional smish-without-URL messages, improving linguistic variety in smishing examples.")

    
    # 6) Add balanced examples (HAM-with-URL + SMISH-without-URL)
    print("\n" + "="*60)
    print("STEP 6: ADDING BALANCED EXAMPLES")
    print("="*60)
    final_df = add_balanced_examples(augmented_df, url_stats)

    # --- Ratio AFTER BALANCED EXAMPLES ---
    ham = (final_df['label'] == 0).sum()
    smish = (final_df['label'] == 1).sum()
    ratio = ham / smish
    
    print("\n--- BALANCE REPORT: AFTER BALANCED EXAMPLES ---")
    print(f"HAM:   {ham}")
    print(f"SMISH: {smish}")
    print(f"Ratio (HAM : SMISH) = {ratio:.2f} : 1")
    print("Description: Balanced examples added HAM-with-URL and SMISH-without-URL samples, stabilizing class distribution while maintaining realism.")

    
    # 7) Final dedup + shuffle
    print("\n" + "="*60)
    print("STEP 7: FINAL CLEANUP")
    print("="*60)
    final_df = final_df.drop_duplicates(subset=["text", "label"])
    final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"üîπ After deduplication and shuffle: {final_df.shape}")

    # üîç NEW: Analyze URL distribution AFTER all augmentations & cleanup
    print("\n" + "="*60)
    print("STEP 7B: ANALYZING URL DISTRIBUTION AFTER FINAL CLEANUP")
    print("="*60)
    final_url_stats = analyze_urls_in_dataset(final_df)
    
    # 8) Train/Val/Test split
    print("\n" + "="*60)
    print("STEP 8: SPLITTING INTO TRAIN/VAL/TEST")
    print("="*60)
    train_df, test_df = train_test_split(
        final_df,
        test_size=0.2,
        stratify=final_df["label"],
        random_state=42
    )
    
    train_df, val_df = train_test_split(
        train_df,
        test_size=0.1,
        stratify=train_df["label"],
        random_state=42
    )
    
    # 9) Save outputs
    final_df.to_csv(OUT_DIR / "augmented_full_dataset.csv", index=False)
    train_df.to_csv(OUT_DIR / "train.csv", index=False)
    val_df.to_csv(OUT_DIR / "val.csv", index=False)
    test_df.to_csv(OUT_DIR / "test.csv", index=False)
    
    print("\n" + "="*60)
    print("üéØ PREPROCESSING COMPLETED!")
    print("="*60)
    print(f"Total:  {len(final_df):5d} messages")
    print(f"Train:  {len(train_df):5d} messages")
    print(f"Val:    {len(val_df):5d} messages")
    print(f"Test:   {len(test_df):5d} messages")


In [292]:
# ================================================================
# CELL 9: Execute Preprocessing
# ================================================================

run_preprocessing()



STEP 1: LOADING CLEANED COMBINED DATASET
Loaded combined dataset from combined_cleaned_original_datasets.csv with shape (5944, 3)
Columns: ['label', 'text', 'label_num']

Label distribution in base cleaned dataset:
label
ham      5025
smish     919
Name: count, dtype: int64

‚úÖ Base cleaned dataset: (5944, 2)
üíæ Saved base cleaned dataset to: prep_out\cleaned_base.csv

--- BALANCE REPORT: ORIGINAL DATASET ---
HAM:   5025
SMISH: 919
Ratio (HAM : SMISH) = 5.47 : 1
Description: The original dataset is imbalanced, with ham messages more frequent than smishing messages.

STEP 2: ANALYZING URL DISTRIBUTION
üìä URL DISTRIBUTION IN DATASET
HAM with URL:         3 (0.1%)
HAM without URL:   5022 (99.9%)
SMISH with URL:     134 (14.6%)
SMISH without URL:  785 (85.4%)

üìé Sample HAM URLs (first 5):
  - www.fullonsms.com

üö® Sample SMISHING URLs (first 5):
  - https://ukhmrc-tax-refund.comÔøΩto
  - www.txt82228.com.
  - http://www.e-tlp.co.uk/expressoffer
  - www.phb1.com
  - www.movietriv