In [1]:
import pandas as pd
import re
import nltk
import warnings
warnings.filterwarnings('ignore')

# Download stopwords for Russian language / Скачиваем стоп-слова для русского языка
nltk.download('stopwords')
from nltk.corpus import stopwords

print("MASTER'S PROJECT: Toxic messages handling project")
print("="*70)

# =============================================================================
# 1. DATA LOADING - EXCEL FILE
# =============================================================================

print("📥 Loading data from Excel file...")
file_path = "MergedDS.xlsx"

try:
    # Preferred source: the Excel workbook.
    df = pd.read_excel(file_path)
    print("✅ Successfully loaded from Excel")

except Exception as e:
    print(f"❌ Excel loading failed: {e}")
    print("🔄 Trying CSV with different separators...")

    # Fallback: a CSV with the same base name, tried with several separators.
    csv_path = file_path.replace('.xlsx', '.csv')
    separators = ['\t', ',', ';', '|']

    for sep in separators:
        try:
            candidate = pd.read_csv(csv_path, sep=sep, encoding='utf-8')
        except (OSError, ValueError):
            # OSError: file missing/unreadable. ValueError: covers decoding
            # errors and pandas parser errors. Try the next separator.
            continue

        # A wrong separator does not make read_csv fail; it yields a single
        # wide column. Require at least two columns before accepting a parse.
        if candidate.shape[1] < 2:
            continue

        df = candidate
        print(f"✅ Loaded with separator: '{sep}'")
        break
    else:
        print("❌ All loading methods failed")
        # Raise instead of exit(): exit() would shut down the notebook kernel,
        # while an exception stops the run and keeps the session inspectable.
        raise RuntimeError(
            f"Could not load the dataset from '{file_path}' or '{csv_path}'"
        ) from e

print(f"Original dataset size: {df.shape}")

# Basic information: column names and dtypes of the loaded frame.
print("\n📊 Basic dataset information:")
print(f"Columns: {df.columns.tolist()}")
print(f"Data types:\n{df.dtypes}")

# Display the first few rows as a quick sanity check of the load.
print("\n👀 First 5 rows of data:")
print(df.head())

# Check which of the columns used by the later stages are present.
expected_columns = ['text_raw', 'is_toxic', 'toxicity_type', 'source_platform']
available_columns = [col for col in expected_columns if col in df.columns]
print(f"\n✅ Available expected columns: {available_columns}")

# Later stages are guarded by `if <column> in df.columns` and silently do
# nothing when a column is absent, so report missing ones explicitly here.
missing_expected = [col for col in expected_columns if col not in df.columns]
if missing_expected:
    print(f"Missing expected columns: {missing_expected}")

# Show sample values from the key columns.
if 'text_raw' in df.columns:
    print("\n📝 Sample text_raw values:")
    for position, value in enumerate(df['text_raw'].head(3), start=1):
        # Truncate to 100 characters to keep the preview readable.
        print(f"  {position}: {str(value)[:100]}...")

if 'is_toxic' in df.columns:
    print(f"\n🔢 is_toxic unique values: {df['is_toxic'].unique()}")

if 'toxicity_type' in df.columns:
    print(f"🏷️ toxicity_type unique values: {df['toxicity_type'].unique()}")

if 'source_platform' in df.columns:
    # Only the first five distinct platforms are shown.
    print(f"🌐 source_platform unique values: {df['source_platform'].unique()[:5]}")

# Missing values analysis: list only the columns that have gaps.
print("\n🔍 Missing values:")
missing_data = df.isnull().sum()
total_rows = len(df)
for col, count in missing_data[missing_data > 0].items():
    print(f"  {col}: {count} missing ({count/total_rows*100:.1f}%)")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anton\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


MASTER'S PROJECT: Toxic messages handling project
üì• Loading data from Excel file...
‚úÖ Successfully loaded from Excel
Original dataset size: (469967, 9)

üìä Basic dataset information:
Columns: ['Column1', 'raw_text_id', 'dataset_id', 'source_platform', 'nickname', 'is_verified', 'text_raw', 'is_toxic', 'toxicity_type']
Data types:
Column1              int64
raw_text_id          int64
dataset_id           int64
source_platform     object
nickname            object
is_verified          int64
text_raw            object
is_toxic           float64
toxicity_type       object
dtype: object

üëÄ First 5 rows of data:
   Column1  raw_text_id  dataset_id source_platform nickname  is_verified  \
0        0            0           0     2ch, pikabu      NaN            1   
1        1            1           0     2ch, pikabu      NaN            1   
2        2            2           0     2ch, pikabu      NaN            1   
3        3            3           0     2ch, pikabu      NaN        

In [3]:

# =============================================================================
# 2. DATA CLEANING AND TYPE CONVERSION
# =============================================================================

print("\n🔧 Data cleaning and type conversion...")

# Clean column names: strip surrounding whitespace.
df.columns = df.columns.str.strip()
print(f"Cleaned column names: {df.columns.tolist()}")

# Convert is_toxic to numeric; unparseable values become NaN.
if 'is_toxic' in df.columns:
    print(f"is_toxic before conversion: {df['is_toxic'].unique()}")
    df['is_toxic'] = pd.to_numeric(df['is_toxic'], errors='coerce')
    print(f"is_toxic after conversion: {df['is_toxic'].unique()}")

# Clean text_raw: strip surrounding whitespace, then surrounding double quotes.
if 'text_raw' in df.columns:
    raw_text = df['text_raw']
    stripped = raw_text.astype(str).str.strip().str.strip('"')
    # astype(str) turns a missing value into the literal string 'nan', which
    # would then pass for real text. Put NaN back wherever the original cell
    # was missing so that dropna()/isna() keep working on this column.
    df['text_raw'] = stripped.where(raw_text.notna())
    print("Cleaned text_raw column")

# Convert the remaining numeric columns; unparseable values become NaN.
numeric_columns = ['raw_text_id', 'dataset_id', 'is_verified']
for col in numeric_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')



üîß Data cleaning and type conversion...
Cleaned column names: ['Column1', 'raw_text_id', 'dataset_id', 'source_platform', 'nickname', 'is_verified', 'text_raw', 'is_toxic', 'toxicity_type']
is_toxic before conversion: [ 1.  0. nan]
is_toxic after conversion: [ 1.  0. nan]
Cleaned text_raw column


In [5]:

# =============================================================================
# 3. MISSING LABELS HANDLING
# =============================================================================

print("\n🏷️ Processing missing labels...")

if 'is_toxic' in df.columns and 'toxicity_type' in df.columns:
    # Snapshot of both label columns before any imputation.
    print(f"is_toxic unique values: {df['is_toxic'].unique()}")
    print(f"toxicity_type unique values: {df['toxicity_type'].unique()}")

    # A toxicity_type is "empty" when it is NaN or an empty string. Neither
    # rule below writes an empty value, so the mask is computed once.
    tox_type_empty = df['toxicity_type'].isna() | (df['toxicity_type'] == '')

    # Rule 1: is_toxic is missing but toxicity_type is filled in and is not
    # 'NORMAL' -> is_toxic = 1.
    infer_toxic = df['is_toxic'].isna() & ~tox_type_empty & (df['toxicity_type'] != 'NORMAL')
    fill_toxic_count = infer_toxic.sum()
    df.loc[infer_toxic, 'is_toxic'] = 1
    print(f"Filled is_toxic=1 for {fill_toxic_count} rows (toxicity_type indicates toxicity)")

    # Rule 2: is_toxic = 0 and toxicity_type is empty -> toxicity_type = 'NORMAL'.
    # Evaluated after rule 1, on the updated is_toxic column.
    infer_normal = (df['is_toxic'] == 0) & tox_type_empty
    fill_normal_count = infer_normal.sum()
    df.loc[infer_normal, 'toxicity_type'] = 'NORMAL'
    print(f"Filled toxicity_type='NORMAL' for {fill_normal_count} rows")

    # Convert is_toxic to the nullable integer dtype, which can hold the
    # labels as integers while still representing missing values.
    if df['is_toxic'].dtype == 'float64':
        df['is_toxic'] = df['is_toxic'].astype('Int64')
        print("Converted is_toxic to integer type")

    # Report what is still missing after both rules were applied.
    total_rows = len(df)
    toxic_missing = df['is_toxic'].isna().sum()
    type_missing = df['toxicity_type'].isna().sum()
    print("\n📊 After filling missing labels:")
    print(f"is_toxic missing: {toxic_missing} ({toxic_missing/total_rows*100:.1f}%)")
    print(f"toxicity_type missing: {type_missing} ({type_missing/total_rows*100:.1f}%)")



üè∑Ô∏è Processing missing labels...
is_toxic unique values: [ 1.  0. nan]
toxicity_type unique values: [nan 'INSULT' 'NORMAL' 'OBSCENITY' 'THREAT' 'INAPPROPRIATE' 'SENSITIVE']
Filled is_toxic=1 for 124586 rows (toxicity_type indicates toxicity)
Filled toxicity_type='NORMAL' for 65333 rows
Converted is_toxic to integer type

üìä After filling missing labels:
is_toxic missing: 0 (0.0%)
toxicity_type missing: 31765 (6.8%)


In [7]:
# =============================================================================
# 4. TEXT PREPROCESSING
# =============================================================================

print("\n🧹 Starting text preprocessing...")

if 'text_raw' in df.columns:
    # Row count before filtering, used to report how many rows were dropped.
    initial_rows = len(df)

    # 1. Drop rows without usable text: missing values, empty strings, and the
    #    literal 'nan' (what a missing value becomes once cast with astype(str)).
    has_text = df['text_raw'].notna() & ~df['text_raw'].isin(['', 'nan'])
    # .copy() detaches the result from the original frame, so the column
    # assignments that follow are not chained assignments on a slice.
    df = df.loc[has_text].copy()
    print(f"Removed empty texts: {initial_rows - len(df)}")

    # Text cleaning function / Функция очистки текста
    def clean_text_advanced(text):
        """Normalise one raw message into lowercase text with placeholder tokens.

        URLs, e-mail addresses, digit runs, emoticons, emoji and repeated
        punctuation are replaced by the upper-case placeholders URL, EMAIL,
        NUMBER, EMOTICON, EMOJI and REPEAT_PUNCT. HTML entities are decoded,
        HTML tags are dropped, every remaining non-word character becomes a
        space, and whitespace is collapsed.

        Args:
            text: The raw message. Missing values, '' and the literal 'nan'
                are treated as empty.

        Returns:
            The cleaned string, or "" for empty input.
        """
        import html

        if pd.isna(text) or text == '' or text == 'nan':
            return ""

        # Decode HTML entities before lowercasing, so that characters produced
        # by decoding (e.g. '&#65;' -> 'A') are lowercased like everything else.
        text = html.unescape(str(text))
        text = text.lower()

        # Remove HTML tags.
        text = re.sub(r'<[^>]+>', '', text)

        # Replace URLs, e-mail addresses and digit runs with placeholders.
        text = re.sub(r'http\S+', ' URL ', text)
        text = re.sub(r'\S+@\S+', ' EMAIL ', text)
        text = re.sub(r'\d+', ' NUMBER ', text)

        # Emoticons. The text is already lowercase here, so the letter mouths
        # must be matched as 'd'/'p' (':D', ':P'). The trailing \b keeps
        # ordinary text such as 'option:default' from counting as an emoticon.
        text = re.sub(r'[:;=]-?(?:[\)\(]|[dp]\b)', ' EMOTICON ', text)

        # Emoji.
        # NOTE(review): the last range spans U+24C2..U+1F251, which also covers
        # many non-emoji characters (e.g. CJK). Left as in the original; Cyrillic
        # and Latin are outside it. Confirm that this breadth is intended.
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"  # other symbols
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE)
        text = emoji_pattern.sub(r' EMOJI ', text)

        # Repeated punctuation (2+ characters).
        text = re.sub(r'[!?]{2,}', ' REPEAT_PUNCT ', text)  # !!  ???  !?  ?!
        text = re.sub(r'\.{2,}', ' REPEAT_PUNCT ', text)    # ...  ....
        text = re.sub(r'[!?]\.+', ' REPEAT_PUNCT ', text)   # !.  ?.

        # Replace everything that is neither a word character nor whitespace.
        # For str patterns \w is Unicode-aware, so Cyrillic letters are kept
        # without listing an explicit alphabet range.
        text = re.sub(r'[^\w\s]', ' ', text)

        # Collapse whitespace runs and trim the ends.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    # Run the cleaner over every raw message.
    print("Cleaning texts...")
    df['clean_text'] = df['text_raw'].map(clean_text_advanced)

    # Keep only texts whose cleaned form has at least 3 characters.
    initial_count = df.shape[0]
    long_enough = df['clean_text'].str.len().ge(3)
    df = df.loc[long_enough]
    removed_short = initial_count - df.shape[0]
    print(f"After removing short texts: {df.shape[0]} rows (removed {removed_short})")

    # Keep the first occurrence of each cleaned text and drop later repeats.
    initial_count = df.shape[0]
    df = df.drop_duplicates(subset='clean_text', keep='first')
    removed_duplicates = initial_count - df.shape[0]
    print(f"Removed duplicates: {removed_duplicates}")

    # Remove stopwords / Удаление стоп-слов
    # Build the stopword set once, outside the function: it is the same for
    # every row, so rebuilding it on each call only repeats identical work.
    russian_stopwords = set(stopwords.words('russian'))
    custom_stopwords = {
        'это', 'вот', 'как', 'так', 'и', 'в', 'над', 'к', 'до', 'не', 'на', 'но', 'за',
        'то', 'с', 'ли', 'а', 'во', 'от', 'со', 'для', 'о', 'же', 'ну', 'вы', 'бы', 'что',
        'кто', 'он', 'она', 'они', 'оно', 'мы', 'вас', 'вам', 'тебе', 'ты', 'мне',
        'меня', 'ему', 'ей', 'им', 'ними',
        # Placeholder tokens that the cleaning step inserts.
        'NUMBER', 'URL', 'EMAIL', 'EMOTICON', 'EMOJI', 'REPEAT_PUNCT'
    }
    all_stopwords = russian_stopwords.union(custom_stopwords)

    def remove_stopwords(text):
        """Drop stopwords and very short tokens from a whitespace-separated text.

        Args:
            text: The cleaned text. Non-string or empty input yields "".

        Returns:
            The remaining tokens joined by single spaces. A token is kept only
            if it is longer than 2 characters and is not in all_stopwords.
        """
        if not isinstance(text, str) or text == '':
            return ""

        kept_words = [
            word for word in text.split()
            if len(word) > 2 and word not in all_stopwords
        ]
        return ' '.join(kept_words)

    # Derive the stopword-free variant of every cleaned text.
    print("Removing stopwords...")
    clean_series = df['clean_text']
    df['clean_text_no_stopwords'] = clean_series.map(remove_stopwords)

    # Report the row count once preprocessing is complete.
    final_rows = df.shape[0]
    print(f"Final dataset size after preprocessing: {final_rows} rows")


üßπ Starting text preprocessing...
Removed empty texts: 0
Cleaning texts...
After removing short texts: 469901 rows (removed 66)
Removed duplicates: 19691
Removing stopwords...
Final dataset size after preprocessing: 450210 rows


In [9]:
# =============================================================================
# 5. BASIC DATASET STATISTICS
# =============================================================================

print("\n" + "="*60)
print("BASIC DATASET STATISTICS")
print("="*60)

total_records = len(df)
print(f"Total records: {total_records:,}")

# Class balance, if the label column is present.
if 'is_toxic' in df.columns:
    toxic_count = df['is_toxic'].sum()
    # Count non-toxic rows directly rather than as `total - toxic`, so rows
    # with a missing label are not reported as non-toxic.
    normal_count = (df['is_toxic'] == 0).sum()
    # mean() skips missing labels, so both shares are relative to labelled rows.
    toxic_share = df['is_toxic'].mean()
    print(f"Toxic records: {toxic_count:,} ({toxic_share:.1%})")
    print(f"Non-toxic records: {normal_count:,} ({(1 - toxic_share):.1%})")
    unlabeled_count = df['is_toxic'].isna().sum()
    if unlabeled_count:
        print(f"Unlabeled records: {unlabeled_count:,}")

# Text length in whitespace-separated words, before and after stopword removal.
if 'clean_text' in df.columns:
    df['text_length'] = df['clean_text'].str.split().str.len()
    df['text_length_no_stopwords'] = df['clean_text_no_stopwords'].str.split().str.len()

    print("\nText length (in words):")
    print(f"Average: {df['text_length'].mean():.1f}")
    print(f"After stopwords removal: {df['text_length_no_stopwords'].mean():.1f}")
    print(f"Maximum: {df['text_length'].max()}")
    print(f"Minimum: {df['text_length'].min()}")

# Ten most frequent source platforms with their share of all records.
if 'source_platform' in df.columns:
    print("\n🌐 Source platform analysis:")
    platform_stats = df['source_platform'].value_counts()
    for platform, count in platform_stats.head(10).items():
        percentage = count / total_records * 100
        print(f"  {platform}: {count:,} records ({percentage:.1f}%)")

# Distribution of the is_verified flag.
if 'is_verified' in df.columns:
    print("\n✅ Verification analysis:")
    verified_stats = df['is_verified'].value_counts()
    for verified, count in verified_stats.items():
        percentage = count / total_records * 100
        print(f"  {verified}: {count:,} records ({percentage:.1f}%)")



BASIC DATASET STATISTICS
Total records: 450,210
Toxic records: 195,151 (43.3%)
Non-toxic records: 255,059 (56.7%)

Text length (in words):
Average: 19.1
After stopwords removal: 11.0
Maximum: 3213
Minimum: 1

üåê Source platform analysis:
  ok.ru: 244,771 records (54.4%)
  2ch.hk, Pikabu.ru, otveti.mail.ru: 124,435 records (27.6%)
  2ch, vk: 60,673 records (13.5%)
  2ch, pikabu: 14,152 records (3.1%)
  YouTube: 1,937 records (0.4%)
  Social Media, TV-Scripts (South Park): 1,450 records (0.3%)
  лентач: 981 records (0.2%)
  медуза: 932 records (0.2%)
  дождь: 879 records (0.2%)

‚úÖ Verification analysis:
  1: 383,358 records (85.2%)
  0: 66,852 records (14.8%)


In [13]:
# =============================================================================
# 6. SAVE PROCESSED DATA
# =============================================================================

print("\n💾 Saving processed data...")

# Write the processed frame to an Excel workbook without the index column.
output_path = "processed_toxic_comments_ds.xlsx"
df.to_excel(output_path, index=False)
print(f"✅ Data saved to: {output_path}")
print(f"📊 Records saved: {len(df):,}")


üíæ Saving processed data...
‚úÖ Data saved to: processed_toxic_comments_ds.xlsx
üìä Records saved: 450,210
