Очистка от стоп слов, смайликов, токенизация, Леммитизация, Обработка сущностей

In [44]:
import re
import pandas as pd
from bs4 import BeautifulSoup

# === EMOJI UNICODE PATTERN ===
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"
    "\U0001F780-\U0001F7FF"
    "\U0001F800-\U0001F8FF"
    "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FAFF"
    "\u2600-\u26FF"          # misc symbols
    "\u2700-\u27BF"
    "]+",
    flags=re.UNICODE
)

# === TEXT EMOJIS MAPPING ===
TEXT_EMOJI_MAP = {
    r":\)": "<EMOJI_POS>",
    r"=\)": "<EMOJI_POS>",
    r":d": "<EMOJI_POS>",
    r";\)": "<EMOJI_NEUTRAL>",
    r":\(": "<EMOJI_NEG>",
    r":-\(": "<EMOJI_NEG>",
    r":/": "<EMOJI_NEG>",
}

# === OPTIONAL: simple slang normalization ===
SLANG_MAP = {
    r"\bpls\b": "please",
    r"\bplz\b": "please",
    r"\basap\b": "as soon as possible",
    r"\burg\b": "urgent",
    r"\bu\b": "you",
    r"\br\b": "are",
    r"\btho\b": "though",
}

def preprocess_text(text: str) -> str:
    if not isinstance(text, str):
        return ""

    # --- Remove HTML ---
    text = BeautifulSoup(text, "html.parser").get_text(" ")

    # --- Remove emojis ---
    text = EMOJI_PATTERN.sub(" ", text)


    # --- Replace emails ---
    text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', ' <EMAIL> ', text)

    # --- Replace URLs ---
    text = re.sub(r'https?://\S+|www\.\S+', ' <URL> ', text)

    # --- Replace IPs ---
    text = re.sub(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', ' <IP> ', text)

    # --- Replace Ticket / IDs ---
    text = re.sub(r'\b[A-Z]{2,}-?\d{3,}\b', ' <ID> ', text)

    # --- Replace nicknames @username ---
    text = re.sub(r'@\w+', ' <USER> ', text)

    # --- Replace dates ---
    text = re.sub(r'\b\d{4}[-/]\d{2}[-/]\d{2}\b', ' <DATE> ', text)
    text = re.sub(r'\b\d{2}[-/]\d{2}[-/]\d{4}\b', ' <DATE> ', text)

    # --- Replace times ---
    text = re.sub(r'\b\d{1,2}:\d{2}\b', ' <TIME> ', text)

    # --- Replace text-based emojis ---
    for pattern, replacement in TEXT_EMOJI_MAP.items():
        text = re.sub(pattern, replacement, text)

    # --- Replace numbers (after removing $) ---
    text = text.replace("$", "")
    text = re.sub(r'\b\d+(\.\d+)?\b', ' <NUM> ', text)

    # --- Remove non-latin characters (optional) ---
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # letter + number OR number + letter
    text = re.sub(r'(?<=\D)(?=\d)|(?<=\d)(?=\D)', ' ', text)

    # --- Lowercase ---
    text = text.lower()

    # --- Slang normalization ---
    for pattern, replacement in SLANG_MAP.items():
        text = re.sub(pattern, replacement, text)

    # --- Remove repeated punctuation ---
    text = re.sub(r'([!?.,]){2,}', r'\1', text)

    # --- Remove extra whitespace ---
    text = re.sub(r'\s+', ' ', text).strip()

    # --- Handle hashtags: remove # but keep word ---
    text = re.sub(r"#(\w+)", r"\1", text)

    return text

In [45]:
df = pd.read_excel("/content/ready_sample.xlsx")
# 'text_column' — имя столбца с исходными текстами

df["text"] = df["text"].apply(preprocess_text)

# --- Дополнительные числовые признаки ---
#df["text_length"] = df["clean_text"].str.len()
#df["word_count"] = df["clean_text"].str.split().str.len()
#df["num_exclamations"] = df["text"].str.count("!")
#df["has_urgent"] = df["clean_text"].str.contains(r'\burgent|as soon as possible|critical\b').astype(int)



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  text = BeautifulSoup(text, "html.parser").get_text(" ")


In [46]:
df.to_excel('Clean_text.xlsx', index=False)