filter主要作用
step1：清洗
1.表情符
2.链接
3.主题评论后的股票代码tag（此处我有一个惊天大发现，所有的tag前都有两个空格）
4.格式问题

step2：筛选
1.字数太少的无意义评论
2.广告评论
3.无关评论（只是带了tag）

考虑到运行速度

In [60]:
import pandas as pd
import re

In [72]:
df = pd.read_csv('tweets_nasdaq100_1.csv', nrows=500)

In [87]:

# 定义一个函数来清洗文本
def clean_text(text):
    text = text.lower()  # 转换为小写
    text = re.sub(r'\s{2,}.*', '', text) # 删除非主体部分的股票代码（两个空格后的内容）
    text = re.sub(r'[^\x00-\x7F]+', '', text)# 去除表情符号
    text = re.sub(r'http\S+|www.\S+', '', text)  # 移除 URL
    text = re.sub(r'@\w+', '', text)  # 移除 @用户名
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # 移除 #话题标签
    text = re.sub(r'RT\s+', '', text)  # 移除 RT 转发标记
    text = re.sub(r'[^\w\s$]', '', text)# 去除特殊字符和标点符号（保留$符号，因为它用于表示股票代码）
    text = ' '.join(text.split()) # 去除多余的空格
    return text


# 应用清洗函数到Text列
df['Cleaned_Text'] = df['Text'].apply(clean_text)


# 保存清洗后的数据到新的Excel文件
df.to_csv('cleaned_data.csv', index=False)

# 打印清洗后的数据
print(df[['Cleaned_Text']])

                                          Cleaned_Text
0         14 some of my favorite optionsfriendly names
1                                                 last
2    dont lose your money before analysis and live ...
3    a strong dec 34 led to my pf closing the year ...
4    $vgt vanguard information tech etf up by 145 s...
..                                                 ...
495  despite very strong sales eps fell $adbe in th...
496  $adbe declared bottomline grew on the slower p...
497                                   bot $adbe at 491
498  covered $si $splk $ddog $low $adbe $gm $tsla b...
499            cheapest megacap stocks for jan 28 2022

[500 rows x 1 columns]


In [88]:
def filter_irrelevant_comments(text, target_stock):
    """
    排除出不包含目标股票(如$ABDE) 且包含其他股票($XXXX) 的文本
    :return: 符合条件的评论列表
    """
    # 正则匹配所有形式为 $xxxx 的股票代码
    stock_symbols = re.findall(r'\$\w+', text)
    
    # 如果没有股票代码，直接返回 FALSE
    if not stock_symbols:
        return False
    
    # 过滤掉 $abde
    filtered_symbols = [symbol for symbol in stock_symbols if symbol.lower() != f'${target_stock.lower()}']
    
    # 条件：文本不能包含 "abde"，但包含其他股票代码
    return target_stock.lower() not in text.lower() and len(filtered_symbols) > 0
    
    
# 过滤 DataFrame
df_related = df[df.apply(lambda row: filter_irrelevant_comments(row['Cleaned_Text'], row['Company'])==False, axis=1)].copy()
print(df_related)


    Company  Tweet_Count                                               Text  \
0      ADBE            1  14. Some of my favorite options-friendly names...   
1      ADBE            2  ⌛ Last  day went pretty well   ✅Big or small ,...   
2      ADBE            3  Don't lose your money before analysis and live...   
3      ADBE            4  A strong Dec (+3.4%) led to my p/f closing the...   
5      ADBE            6  🥳WEEKEND VIDEO 1/2: https://t.co/7io5RfdrA1  $...   
..      ...          ...                                                ...   
495    ADBE          496  Despite very Strong Sales EPS Fell $ADBE #Adob...   
496    ADBE          497  #Adobe $ADBE declared Bottom-line grew on the ...   
497    ADBE          498                                   Bot $Adbe at 491   
498    ADBE          499  Covered $SI $SPLK $ddog $LOW $ADBE $GM $TSLA b...   
499    ADBE          500  Cheapest Mega-Cap Stocks for Jan 28, 2022:  $B...   

                         Created_At  Retweets  Like

In [92]:
def remove_stock_symbols_flexible(text):
    """
    删除所有形如 $XXX...（3-5个字母，不区分大小写）的股票代码
    """
    pattern = re.compile(r'\$\w+', re.IGNORECASE)
    cleaned_text = pattern.sub('', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text
    
df_related['Cleaned_Text'] = df_related['Cleaned_Text'].apply(remove_stock_symbols_flexible)
# 输出结果
print(df_related.head(15))


   Company  Tweet_Count                                               Text  \
0     ADBE            1  14. Some of my favorite options-friendly names...   
1     ADBE            2  ⌛ Last  day went pretty well   ✅Big or small ,...   
2     ADBE            3  Don't lose your money before analysis and live...   
3     ADBE            4  A strong Dec (+3.4%) led to my p/f closing the...   
5     ADBE            6  🥳WEEKEND VIDEO 1/2: https://t.co/7io5RfdrA1  $...   
6     ADBE            7  1/2 I love SAAS businesses. I have a portfolio...   
7     ADBE            8   safeTweet : 98686837-adbe-47c0-bdd5-4f799d20f4e7   
8     ADBE            9  Stocks that I think will return &gt;25-30% fro...   
9     ADBE           10  At the times of inflation,all you need is to b...   
10    ADBE           11  Stock tip of the hour: think about holding ADB...   
11    ADBE           12  Year 13 on my investing journey. Let's see wha...   
12    ADBE           13  $ADBE #Adobe's chief product officer pr

In [112]:
def filter_text(text):
    """
    筛除广告文本、过短文本和无意义文本
    返回True表示保留文本，False表示过滤掉
    """
    cleaned = text
    # 1. 过滤空文本或过短文本（少于5个有意义单词）
    words = [w for w in cleaned.split() if len(w) > 1]  # 忽略单字母"单词"
    if len(words) < 5:
        return False
    
    # 2. 过滤无意义文本（检查是否包含足够实词）
    stop_words = {'a', 'an', 'the', 'and', 'or', 'but', 'is', 'are', 'of', 'to', 'in', 'it', 'this', 'that', 'i'}
    content_words = [w for w in words if w not in stop_words]
    if len(content_words) < 3:  # 至少3个有意义的词
        return False
    
    # 3. 过滤广告文本（使用关键词和模式匹配）
    ad_patterns = [
        r'\b(?:live\s+support|trade\s+ideas|scanner|analysis|market)\b',
        r'\b(?:join\s+now|subscribe|limited\s+time|offer|discount)\b',
        r'\b(?:don\'?t\s+(?:miss|lose)|money\s+back)\b',
        r'\b(?:day\s+trading|stock\s+market|profit|earn\s+money)\b',
        r'\b(?:click|link|website|visit|check\s+out)\b',
        r'\b(?:free\s+trial|bonus|promo|giveaway)\b',
        r'\!{2,}|\?{2,}',  # 多个感叹号或问号
        r'\b(?:guarantee|results|performance|success)\b'
    ]
    
    for pattern in ad_patterns:
        if re.search(pattern, cleaned):
            return False
    
    # 4. 过滤无实质内容文本（检查是否只是重复字符或单词）
    if len(set(words)) < 2:  # 所有单词都相同
        return False
    
    # 6. 过滤纯符号或数文本
    if re.fullmatch(r'[\d\W_]+', cleaned.replace(' ', '')):
        return False
    
    return True






In [113]:

# 过滤 DataFrame
df_filtered = df_related[df_related['Cleaned_Text'].apply(filter_text)].copy()
print(df_filtered)



    Company  Tweet_Count                                               Text  \
0      ADBE            1  14. Some of my favorite options-friendly names...   
3      ADBE            4  A strong Dec (+3.4%) led to my p/f closing the...   
6      ADBE            7  1/2 I love SAAS businesses. I have a portfolio...   
8      ADBE            9  Stocks that I think will return &gt;25-30% fro...   
9      ADBE           10  At the times of inflation,all you need is to b...   
..      ...          ...                                                ...   
493    ADBE          494  $ADBE popped early filled our 28Feb $485p orde...   
494    ADBE          495  The 100 largest US companies in a handy Finviz...   
495    ADBE          496  Despite very Strong Sales EPS Fell $ADBE #Adob...   
496    ADBE          497  #Adobe $ADBE declared Bottom-line grew on the ...   
499    ADBE          500  Cheapest Mega-Cap Stocks for Jan 28, 2022:  $B...   

                         Created_At  Retweets  Like

In [110]:
# 保存清洗后的数据到新的Excel文件
df_filtered.to_csv('filtered_data.csv', index=True)

