In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# 下载必要的NLTK数据（第一次运行需要）
nltk.download('stopwords')
nltk.download('punkt')

def clean_tweet_text(text):
    if pd.isna(text):
        return ""
    
    # 转换为小写
    text = text.lower()
    
    # 移除URL
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # 移除用户提及 @username
    text = re.sub(r'@\w+', '', text)
    
    # 移除话题标签 #hashtag (但保留文本部分)
    text = re.sub(r'#(\w+)', r'\1', text)
    
    # 移除HTML标签
    text = re.sub(r'<.*?>', '', text)
    
    # 移除标点符号
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # 移除数字
    text = re.sub(r'\d+', '', text)
    
    # 移除多余空白字符
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
def advanced_tweet_cleaning(text):
    if pd.isna(text):
        return ""
    
    # 基础清洗
    text = clean_tweet_text(text)
    
    # 分词：优先使用 NLTK 的 word_tokenize，缺少资源时回退到正则分词
    try:
        tokens = word_tokenize(text)
    except LookupError:
        tokens = re.findall(r"\b\w+\b", text)

    # 移除停用词：优先使用 NLTK 停用词，缺少资源时使用内置小型停用词集合
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        stop_words = {
            'the','a','an','and','or','but','if','in','on','for','with',
            'is','it','this','that','to','of','at','from','by','as','are',
            'was','were','be','been','has','have','had','not','no','so',
            'too','very'
        }

    # 保留否定词（重要：先保留否定词，再去除其他停用词）
    negation_words = ['not', 'no', 'never', 'nothing', 'nowhere', 'neither', 'nor', 
                      'very', 'so', 'too', 'extremely', 'absolutely']
    stop_words = set(w for w in stop_words if w not in negation_words)

    tokens = [word for word in tokens if word not in stop_words]
    
    # 移除短词（长度小于2的单词）
    important_single_letters = {'i', 'a', 'u'}  # I, a, you的缩写
    tokens = [word for word in tokens if len(word) > 1 or word in important_single_letters]
    
    # 重新组合为文本
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

In [3]:
def handle_emojis_and_special_chars(text):
    """
    处理表情符号：可以选择移除或替换为文字描述
    """
    # 方法1: 移除所有表情符号
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # 表情符号
                           u"\U0001F300-\U0001F5FF"  # 符号和象形文字
                           u"\U0001F680-\U0001F6FF"  # 交通和地图符号
                           u"\U0001F1E0-\U0001F1FF"  # 国旗
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    text = emoji_pattern.sub(r'', text)
    
    # 方法2: 替换常见表情符号为文字描述（可选）
    emoji_to_text = {
        ':)': 'smile',
        ':(': 'sad',
        ':D': 'laugh',
        ';)': 'wink'
        # 可以添加更多映射
    }
    
    for emoji, desc in emoji_to_text.items():
        text = text.replace(emoji, desc)
    
    return text

In [4]:
def complete_text_cleaning(df, text_column='text'):
    """
    完整的文本清洗流程
    """
    print("开始文本清洗...")
    
    # 复制数据以避免修改原数据
    df_cleaned = df.copy()
    
    # 1. 处理缺失值
    df_cleaned[text_column] = df_cleaned[text_column].fillna('')
    
    # 2. 基础文本清洗
    df_cleaned['cleaned_text'] = df_cleaned[text_column].apply(clean_tweet_text)
    
    # 3. 处理表情符号
    df_cleaned['cleaned_text'] = df_cleaned['cleaned_text'].apply(handle_emojis_and_special_chars)
    
    # 4. 高级清洗（分词、去停用词等）
    df_cleaned['processed_text'] = df_cleaned['cleaned_text'].apply(advanced_tweet_cleaning)
    
    # 5. 移除空文本
    original_count = len(df_cleaned)
    df_cleaned = df_cleaned[df_cleaned['processed_text'].str.len() > 0]
    cleaned_count = len(df_cleaned)
    
    print(f"清洗完成！原始数据: {original_count} 条，清洗后: {cleaned_count} 条")
    print(f"移除了 {original_count - cleaned_count} 条空文本")
    
    return df_cleaned

In [5]:
import os
train_path = os.path.join('dataset', 'twitter_training.csv')
val_path = os.path.join('dataset', 'twitter_validation.csv')
train_cleaned_path = os.path.join('dataset', 'twitter_training_cleaned.csv')
val_cleaned_path = os.path.join('dataset', 'twitter_validation_cleaned.csv')

df = pd.read_csv(train_path, names=['ID','area','attitude','text'], header=None)

# 打印列名，帮助定位文本列
print('数据列:', df.columns.tolist())
print('原始数据示例:')
print(df.head())

# 自动检测合适的文本列
possible_text_cols = ['text','tweet','content','message','comment','body','sentiment_text']
text_cols = [c for c in df.columns if c.lower() in possible_text_cols]
if len(text_cols) == 0:
    obj_cols = df.select_dtypes(include=['object']).columns.tolist()
    if len(obj_cols) == 0:
        raise ValueError('未在数据中找到文本列（object dtype）。请检查CSV列名。')
    chosen = obj_cols[0]
    print(f"没有找到常见的文本列，使用第一列: {chosen}")
else:
    chosen = text_cols[0]
    print(f"使用检测到的文本列: {chosen}")

# 应用清洗流程（使用检测到的列名）
df_cleaned = complete_text_cleaning(df, text_column=chosen)

# 查看清洗前后的对比（谨慎处理索引越界）
print('\n清洗前后对比:')
sample_idx = 0
if sample_idx >= len(df):
    sample_idx = df.index[0]
orig_text = df.iloc[sample_idx][chosen] if chosen in df.columns else ''
print('原始文本:', orig_text)
print('清洗后文本:', df_cleaned.iloc[sample_idx]['processed_text'] if sample_idx in df_cleaned.index else df_cleaned['processed_text'].iloc[0])

# 检查清洗效果
print('\n清洗后的文本统计:')
print(f"平均长度: {df_cleaned['processed_text'].str.len().mean():.2f} 字符")
print(f"最短文本: {df_cleaned['processed_text'].str.len().min()} 字符")
print(f"最长文本: {df_cleaned['processed_text'].str.len().max()} 字符")

数据列: ['ID', 'area', 'attitude', 'text']
原始数据示例:
     ID         area  attitude  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
使用检测到的文本列: text
开始文本清洗...
清洗完成！原始数据: 74682 条，清洗后: 72142 条
移除了 2540 条空文本

清洗前后对比:
原始文本: im getting on borderlands and i will murder you all ,
清洗后文本: im getting borderlands murder

清洗后的文本统计:
平均长度: 72.15 字符
最短文本: 1 字符
最长文本: 839 字符


In [6]:
df_cleaned.to_csv(train_cleaned_path, index=False)

In [7]:
# 读取并清洗验证集
df_val = pd.read_csv(val_path, names=['ID','area','attitude','text'], header=None)

# 打印列名与示例以便确认
print('验证数据列:', df_val.columns.tolist())
print(df_val.head())

# 自动检测文本列（与训练集相同的策略）
possible_text_cols = ['text','tweet','content','message','comment','body','sentiment_text']
text_cols = [c for c in df_val.columns if c.lower() in possible_text_cols]
if len(text_cols) == 0:
    obj_cols = df_val.select_dtypes(include=['object']).columns.tolist()
    if len(obj_cols) == 0:
        raise ValueError('未在验证数据中找到文本列（object dtype）。请检查CSV列名。')
    chosen_val = obj_cols[0]
    print(f"没有找到常见的文本列，使用第一列: {chosen_val}")
else:
    chosen_val = text_cols[0]
    print(f"使用检测到的文本列: {chosen_val}")

# 对验证集应用相同的清洗流程
df_val_cleaned = complete_text_cleaning(df_val, text_column=chosen_val)

# 保存清洗后的验证集
df_val_cleaned.to_csv(val_cleaned_path, index=False)
print('已保存: twitter_validation_cleaned.csv')


验证数据列: ['ID', 'area', 'attitude', 'text']
     ID       area    attitude  \
0  3364   Facebook  Irrelevant   
1   352     Amazon     Neutral   
2  8312  Microsoft    Negative   
3  4371      CS-GO    Negative   
4  4433     Google     Neutral   

                                                text  
0  I mentioned on Facebook that I was struggling ...  
1  BBC News - Amazon boss Jeff Bezos rejects clai...  
2  @Microsoft Why do I pay for WORD when it funct...  
3  CSGO matchmaking is so full of closet hacking,...  
4  Now the President is slapping Americans in the...  
使用检测到的文本列: text
开始文本清洗...
清洗完成！原始数据: 1000 条，清洗后: 1000 条
移除了 0 条空文本
已保存: twitter_validation_cleaned.csv
