In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import compute_class_weight

In [19]:
data_path = r"../messages.csv"
data = pd.read_csv(data_path, sep=';')
data.head(10)

Unnamed: 0,id,chat_id,user_id,message_id,message,message_info,timestamp,spam
0,28450,-1001370017010,884756749,177797,Мне референтка сказала что как принесёшь серт ...,"{""message_id"": 177797, ""date"": 1736516316, ""ch...",2025-01-10 13:38:36.787444,f
1,28418,-1001358408127,6615370312,30362,"До нашої команди. Гарантуємо розвиток,підтримк...","{""message_id"": 30362, ""date"": 1736514294, ""cha...",2025-01-10 13:04:55.388717,t
2,28452,-1001684546093,842671585,189034,И с кофе для меня,"{""message_id"": 189034, ""date"": 1736516974, ""ch...",2025-01-10 13:49:35.320844,f
3,28457,-1001684546093,232278264,189037,"Должно быть несколько, попробую посмотреть онлайн","{""message_id"": 189037, ""date"": 1736518231, ""ch...",2025-01-10 14:10:32.135793,f
4,28460,-1001292637469,268388996,97377,/spam@konnekt_moder_bot,"{""message_id"": 97377, ""date"": 1736518665, ""cha...",2025-01-10 14:17:45.726193,f
5,28462,-1001370017010,1260268668,177802,у меня не смогут кредиты забрать,"{""message_id"": 177802, ""date"": 1736519005, ""ch...",2025-01-10 14:23:25.587145,f
6,28451,-1001358408127,175682846,30363,/spam@konnekt_moder_bot,"{""message_id"": 30363, ""date"": 1736516319, ""cha...",2025-01-10 13:38:39.93748,f
7,28453,-1001684546093,2023802489,189035,Хорошо,"{""message_id"": 189035, ""date"": 1736517131, ""ch...",2025-01-10 13:52:11.340339,f
8,28463,-1001370017010,1260268668,177803,я думаю дадут 4 за сертификат просто,"{""message_id"": 177803, ""date"": 1736519052, ""ch...",2025-01-10 14:24:13.211165,f
9,28467,-1001219742780,5742150043,22718,"Всем привет, кто-то знает какие открытые вопро...","{""message_id"": 22718, ""date"": 1736519724, ""cha...",2025-01-10 14:35:25.54905,f


pre-processing and data cleaning

In [20]:
# keep only message and spam columns
cleaned_dataset = data[['message', 'spam']]

# drop rows with missing values
cleaned_dataset = cleaned_dataset.dropna()

# transform spam column to binary {0, 1}
cleaned_dataset['spam'] = cleaned_dataset['spam'].map({'t': 1, 'f': 0})

# Normalization

In [21]:
import re
import ftfy
import unicodedata
import textacy.preprocessing as tp

In [None]:
# NORMALIZE
# IMPORTANT: This function should be applied to the text before any other processing or inference
def normalize_text(text, lowercase=True) -> str:
    mapper = {
        "emojies": "<e>",
        "mails": "<m>",
        "logins": "<l>",
        "hashtags": "<h>",
        "commands": "<c>",
        "urls": "<u>",
        "phone_numbers": "<p>"
    }
    # Fix unicode Issues (e.g., "CafÃ©" → "Café")
    text = ftfy.fix_text(text)

    # Normalize diacritics and symbols (e.g., "cafè" → "cafe")
    text = unicodedata.normalize('NFD', text)

    # Replace special characters (emojies, logins, hashtags, etc.)
    text = tp.replace.emojis(text, mapper["emojies"])
    text = tp.replace.emails(text, mapper["mails"])
    text = tp.replace.urls(text, mapper["urls"])
    text = tp.replace.hashtags(text, mapper["hashtags"])
    text = tp.replace.user_handles(text, mapper["logins"])
    text = tp.replace.phone_numbers(text, mapper["logins"])

    # Remove punctuation and extra whitespace
    text = tp.remove.punctuation(text) 
    text = tp.normalize.whitespace(text)

    # Keep only Cyrillic and Latin characters
    text = re.sub(r'[^a-zA-Zа-яА-ЯёЁ0-9\s.,!?<>]', '', text)

    # Convert to lowercase if needed
    return text.lower() if lowercase else text

samples = [
    "Тестовое сообщение",
    "CafÃ©",
    "Café and   cafè are the same",
    "ＡＢＣ",
    "I'm a café owner @cafe",
    "Café Déjà Vu – 価格: ¥1000",
    "Ｆｕｌｌｗｉｄｔｈ Ｔｅｘｔ ＠Ｅｍａｉｌ！",
    "🔥🔥 SPAM!!! 🚀💰",
    "Price: $1,234.50 and email: test@example.com",
]
for sample in samples:
    print(f"Original: {sample}")
    print(f"Normalized: {normalize_text(sample, lowercase=True)}")
    print()

Original: Тестовое сообщение
Normalized: тестовое сообщение

Original: CafÃ©
Normalized: cafe

Original: Café and   cafè are the same
Normalized: cafe and cafe are the same

Original: ＡＢＣ
Normalized: abc

Original: I'm a café owner @cafe
Normalized: i m a cafe owner <l>

Original: Café Déjà Vu – 価格: ¥1000
Normalized: cafe deja vu  1000

Original: Ｆｕｌｌｗｉｄｔｈ Ｔｅｘｔ ＠Ｅｍａｉｌ！
Normalized: fullwidth text <l>

Original: 🔥🔥 SPAM!!! 🚀💰
Normalized: <e><e> spam <e><e>

Original: Price: $1,234.50 and email: test@example.com
Normalized: price 1 234 50 and email <m>



In [23]:
def print_stats(data):
    print(f"Total: {len(data)}")
    print(f"Spam: {len(data[data['spam'] == 1])}")
    print(f"Ham: {len(data[data['spam'] == 0])}")

In [24]:
# apply normalization to the whole dataset
cleaned_dataset['message'] = cleaned_dataset['message'].apply(normalize_text)
print_stats(cleaned_dataset)

Total: 52324
Spam: 1255
Ham: 51069


In [25]:
# remove empty rows
cleaned_dataset = cleaned_dataset.dropna()

# remove rows with empty message
cleaned_dataset = cleaned_dataset[cleaned_dataset['message'].str.len() > 0]

# If at least one message of duplicates plenty is spam, then all of them are spam
cleaned_dataset = cleaned_dataset.groupby('message').agg({'spam': 'max'}).reset_index()

# remove duplicates, keep the first message
cleaned_dataset = cleaned_dataset.drop_duplicates(subset='message', keep='first')

print_stats(cleaned_dataset)

Total: 44110
Spam: 830
Ham: 43280


In [26]:
# rename message to text and spam to label
cleaned_dataset = cleaned_dataset.rename(columns={'message': 'text', 'spam': 'label'})
cleaned_dataset.head(10)

Unnamed: 0,text,label
0,\n\n\nисточник траффика <e>\n<e> наша база 400...,1
1,\n\n<e> <e><e><e><e><e><e>\n<e><e><e><e><e><e>...,1
2,\n\n<e><e><e><e><e><e><e> <e><e><e><e><e>\n<e>...,1
3,\n \n \n 300 <e><e><e>\n <e>,1
4,\n<e><e><e><e><e> <e><e>\n<e><e><e><e><e><e>\n...,1
5,\n<e><e><e><e><e><e>\n <e><e> <e><e> <e><e><e>...,1
6,\n<e><e><e><e><e><e><e><e><e><e>\n<e><e><e><e>...,1
7,\nдаже такои дурачок в физике как я написал те...,0
8,\nесли не это не последнии термин зкоушки на о...,0
9,\nпросто возле fakulta strojni висит почему то,0


In [None]:
# Save as a clean dataset
cleaned_dataset.to_csv('../cleaned_dataset.csv', index=False)