In [25]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import os
import contractions

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tobia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tobia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tobia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tobia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
# SETTINGS - NUR HIER ETWAS ÄNDERN

# Input file definieren - HIER EINFACH ÄNDERN
df = pd.read_csv('data/tesla_top5_daily.csv')
# Output file definieren - HIER EINFACH ÄNDERN
output_file_path = 'data/tesla_preprocessed_forEmbedding.csv'

usage = 'embedding' # embedding oder sentiment

if usage == 'sentiment':
    lowercase = False
    fix_contractions = True
    normalize_quotes = False
    remove_ellipsis = False
    remove_urls = True
    convert_reddit_syntax = True
    convert_emojis = True
    fix_possessive_s = True
    remove_stopwords = False
    remove_punctuation = False
    remove_special_chars = False
    apply_lemmatization = False
elif usage == 'embedding':
    lowercase = True
    fix_contractions = True
    normalize_quotes = True
    remove_ellipsis = True
    remove_urls = True
    convert_reddit_syntax = True
    convert_emojis = True
    fix_possessive_s = True
    remove_stopwords = True
    remove_punctuation = True
    remove_special_chars = False
    apply_lemmatization = True
else:
    print("Keine gültige usage eingestellt! 'sentiment' oder 'embedding'")


In [27]:
# Text Preprocessing Funktion 
def preprocess_text(text):
    if pd.isna(text):
        return ""

    text = str(text)

    if lowercase:
        text = text.lower()

    if fix_contractions:
        text = contractions.fix(text)

    if normalize_quotes:
        text = re.sub(r'[´`''"""„"’‛‟]', "'", text)  # Alle Varianten zu ' normalisieren
        text = re.sub(r"'{2,}", "'", text)  # Doppelte Anführungszeichen entfernen

    if remove_ellipsis:
        text = re.sub(r'\.{2,}', '', text)  # "..." oder ".." entfernen
        text = re.sub(r'…+', '', text)  # Unicode Ellipsis entfernen
        text = re.sub(r'-{2,}', ' ', text)  # Mehrfache Bindestriche zu Leerzeichen
        text = re.sub(r'\s+', ' ', text)  # Mehrfache Leerzeichen normalisieren
        text = text.strip()


    if remove_urls:
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    if convert_reddit_syntax:
        text = re.sub(r'u/(\w+)', r'user_\1', text)  # u/username -> user_username
        text = re.sub(r'r/(\w+)', r'subreddit_\1', text)  # r/subreddit -> subreddit_name
        text = re.sub(r'/s', ' sarcasm', text)  # /s -> sarcasm (wichtige Info!)

    if convert_emojis:
        emoji_dict = {
            '🚀': ' rocket ', 
            '📈': ' chart_up ', 
            '📉': ' chart_down ',
            '💎': ' diamond_hands ',
            '🌙': ' moon ',
            '💀': ' dead ',
            '🙌': ' hands ',
            '🔥': ' fire ',
            '💰': ' money ',
            '📊': ' chart ',
            '🐻': ' bear ',
            '🐂': ' bull ',
            '💸': ' money_lost ',
            '🤑': ' money_face ',
            '😭': ' crying ',
            '😂': ' laughing ',
            '🎯': ' target ',
            '⚡': ' lightning ',
            '🌟': ' star ',
            '❤️': ' love ',
        }
        for emoji, word in emoji_dict.items():
            text = text.replace(emoji, word)

    tokens = word_tokenize(text)

    if fix_possessive_s:
        for i in range(1, len(tokens)):
            if tokens[i] in {"'s", "'S"}:
                tokens[i-1] = tokens[i-1] + "'s"
                tokens[i] = ""
        tokens = [token for token in tokens if token]

    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        important_words = {
            # Trading/Investment Begriffe
            'buy', 'sell', 'hold', 'up', 'down', 'bull', 'bear', 
            'call', 'put', 'long', 'short', 'moon', 'rocket',
            # Zeitbezug und Modalverben (wichtig für Prognosen)
            'will', 'would', 'should', 'could', 'might', 'may',
            'now', 'then', 'before', 'after', 'when', 'while', 'until', 'since',
            # Negation (ESSENTIELL für Sentiment)
            'not', 'no', 'nor', 'never',
            # Quantifizierer und Verstärkung
            'more', 'most', 'much', 'many', 'few', 'little', 'less', 'least',
            'very', 'too', 'so', 'quite', 'really', 'just', 'only', 'even', 'still',
            'all', 'any', 'some', 'each', 'every', 'both', 'either', 'neither',
            # Richtung und Position (wichtig für Preisbewegung)
            'above', 'below', 'over', 'under', 'back', 'out', 'off', 'on',
            'here', 'there', 'where',
            # Vergleiche und Relationen
            'than', 'against', 'between', 'through', 'during',
            # Bedingungen und Kausalität
            'if', 'unless', 'because', 'why', 'how',
            # Weitere wichtige Wörter
            'again', 'once', 'own', 'same', 'other', 'another',
        }
        stop_words = stop_words - important_words 
        tokens = [token for token in tokens if token not in stop_words]

    if remove_punctuation:
        if remove_special_chars:
            tokens = [token for token in tokens if token not in string.punctuation]
        else:
            keep_symbols = {'$', '%', '+', '#', '@', '-'}
            harmful = set(string.punctuation) - keep_symbols
            tokens = [token for token in tokens if token not in harmful]

    if apply_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)


# Funktion für ein paar Zahlen/Stats
def get_preprocessing_stats(original_text, processed_text):
    """Statistiken über das Preprocessing"""
    orig_words = len(str(original_text).split())
    proc_words = len(str(processed_text).split())
    reduction = ((orig_words - proc_words) / orig_words * 100) if orig_words > 0 else 0
    return orig_words, proc_words, reduction

In [28]:
# Preprocessing durchführen

print("Starting text preprocessing...")
print(f"Original dataset: {len(df)} rows")

# Originale Spalten für Statistiken speichern
original_title = df['title'].copy()
original_text = df['text'].copy()

# Title und Text preprocessing - ÜBERSCHREIBT Spalten
df['title'] = df['title'].apply(preprocess_text)
df['text'] = df['text'].apply(preprocess_text)

# Statistiken mit originalen Daten
total_orig_words = sum(len(str(text).split()) for text in original_title) + sum(len(str(text).split()) for text in original_text)
total_proc_words = sum(len(str(text).split()) for text in df['title']) + sum(len(str(text).split()) for text in df['text'])
reduction = ((total_orig_words - total_proc_words) / total_orig_words * 100) if total_orig_words > 0 else 0

print(f"Gesamte Wörter original: {total_orig_words:,}")
print(f"Gesamte Wörter nach preprocessing: {total_proc_words:,}")
print(f"Reduktion: {reduction:.1f}%")
print("Preprocessing completed!")

Starting text preprocessing...
Original dataset: 13268 rows
Gesamte Wörter original: 2,240,757
Gesamte Wörter nach preprocessing: 1,574,889
Reduktion: 29.7%
Preprocessing completed!


In [29]:
# Speichern
if os.path.exists(output_file_path):
    print(f'File {output_file_path} already exists!')
else:
    df.to_csv(output_file_path, index=False)
    print(f'Daten gespeichert als: {output_file_path}')

Daten gespeichert als: data/tesla_preprocessed_forEmbedding.csv
