In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import os

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/christian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/christian/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/christian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/christian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# SETTINGS - NUR HIER IN DIESER ZELLE ETWAS ÄNDERN

# Input file definieren - HIER EINFACH ÄNDERN
df = pd.read_csv('data/tesla_top5_daily.csv')

# Preprocessing Optionen - HIER EINFACH AN/AUS SCHALTEN
remove_urls = True
remove_stopwords = True
remove_special_chars = False  # Finanz-Symbole und Emojis behalten
apply_lemmatization = True  

# Output file definieren - HIER EINFACH ÄNDERN
output_file_path = 'data/tesla_preprocessed.csv'

In [5]:
# Text Preprocessing Funktionen
def preprocess_text(text):
    
    if pd.isna(text):
        return ""
    
    text = str(text).lower()
    
    # URLs entfernen
    if remove_urls:
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Spezielle Reddit-Syntax bereinigen (aber Info behalten)
    text = re.sub(r'u/(\w+)', r'user_\1', text)  # u/username -> user_username
    text = re.sub(r'r/(\w+)', r'subreddit_\1', text)  # r/subreddit -> subreddit_name
    text = re.sub(r'/s', ' sarcasm', text)  # /s -> sarcasm (wichtige Info!)
    
    # Emoji-to-Text Konvertierung (wichtig für Sentiment!)
    emoji_dict = {
        '🚀': ' rocket ',
        '📈': ' chart_up ',
        '📉': ' chart_down ',
        '💎': ' diamond_hands ',
        '🌙': ' moon ',
        '💀': ' dead ',
        '🙌': ' hands ',
        '🔥': ' fire ',
        '💰': ' money ',
        '📊': ' chart ',
        '🐻': ' bear ',
        '🐂': ' bull ',
        '💸': ' money_lost ',
        '🤑': ' money_face ',
        '😭': ' crying ',
        '😂': ' laughing ',
        '🎯': ' target ',
        '⚡': ' lightning ',
        '🌟': ' star ',
        '❤️': ' love ',
    }
    for emoji, word in emoji_dict.items():
        text = text.replace(emoji, word)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Stopwords entfernen
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        # Wichtige Finanz-Begriffe NICHT als Stopwords behandeln (z.B. up/down sind stopwords)
        financial_words = {'buy', 'sell', 'hold', 'up', 'down', 'bull', 'bear', 
                          'call', 'put', 'long', 'short', 'moon', 'rocket'}
        stop_words = stop_words - financial_words 
        tokens = [token for token in tokens if token not in stop_words]
    
    # Interpunktion-Filterung
    if remove_special_chars:
        # Wenn special_chars entfernt werden sollen, normale Punctuation-Filterung
        tokens = [token for token in tokens if token not in string.punctuation]
    else:
        # Wenn special_chars behalten werden, nur störende Punctuation entfernen
        keep_symbols = {'$', '%', '+', '#', '@', '-'}
        harmful_punctuation = set(string.punctuation) - keep_symbols
        tokens = [token for token in tokens if token not in harmful_punctuation]
    
    # Lemmatization
    if apply_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

# Funktion für ein paar Zahlen/Stats
def get_preprocessing_stats(original_text, processed_text):
    """Statistiken über das Preprocessing"""
    orig_words = len(str(original_text).split())
    proc_words = len(str(processed_text).split())
    reduction = ((orig_words - proc_words) / orig_words * 100) if orig_words > 0 else 0
    return orig_words, proc_words, reduction

In [6]:
# Preprocessing durchführen

print("Starting text preprocessing...")
print(f"Original dataset: {len(df)} rows")

# Originale Spalten für Statistiken speichern
original_title = df['title'].copy()
original_text = df['text'].copy()

# Title und Text preprocessing - ÜBERSCHREIBT Spalten
df['title'] = df['title'].apply(preprocess_text)
df['text'] = df['text'].apply(preprocess_text)

# Statistiken mit originalen Daten
total_orig_words = sum(len(str(text).split()) for text in original_title) + sum(len(str(text).split()) for text in original_text)
total_proc_words = sum(len(str(text).split()) for text in df['title']) + sum(len(str(text).split()) for text in df['text'])
reduction = ((total_orig_words - total_proc_words) / total_orig_words * 100) if total_orig_words > 0 else 0

print(f"Gesamte Wörter original: {total_orig_words:,}")
print(f"Gesamte Wörter nach preprocessing: {total_proc_words:,}")
print(f"Reduktion: {reduction:.1f}%")
print("Preprocessing completed!")

Starting text preprocessing...
Original dataset: 13268 rows
Gesamte Wörter original: 2,240,757
Gesamte Wörter nach preprocessing: 1,451,895
Reduktion: 35.2%
Preprocessing completed!


In [7]:
# Speichern
if os.path.exists(output_file_path):
    print(f'File {output_file_path} already exists!')
else:
    df.to_csv(output_file_path, index=False)
    print(f'Daten gespeichert als: {output_file_path}')

File data/tesla_preprocessed.csv already exists!
