In [3]:
import pandas as pd
from transformers import pipeline
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from collections import Counter

# 英文停用词列表
EN_STOP_WORDS = set([
    'the', 'a', 'an', 'in', 'to', 'for', 'and', 'is', 'are', 'was', 'were',
    'of', 'with', 'on', 'at', 'by', 'from', 'this', 'that', 'it', 'its',
    'he', 'she', 'they', 'their', 'has', 'have', 'had', 'not', 'but', 'or',
    'be', 'as', 'you', 'your', 'we', 'our', 'if', 'will', 'would', 'there',
    'here', 'all', 'any', 'one', 'more', 'so', 'than', 'then', 'these', 'those',
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
    't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o',
    're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn',
    'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
    'weren', 'won', 'wouldn'
])

# 文本预处理函数
def preprocess_text(text):
    # 1. 移除非英文字符和数字（保留英文、数字和空格）
    text = re.sub(r'[^a-zA-Z0-9\s]', '', str(text))

    # 2. 转换为小写
    text = text.lower()

    # 3. 分词
    words = text.split()

    # 4. 移除停用词和短词
    words = [word for word in words if word not in EN_STOP_WORDS and len(word) > 1]

    # 5. 移除纯数字词
    words = [word for word in words if not word.isdigit()]

    # 6. 拼接回文本
    return " ".join(words)

# 初始化情感分析模型
print("Loading sentiment analysis model...")
classifier = pipeline("sentiment-analysis")

# 单个CSV文件路径
csv_file = '/content/dataset_tiktok-hashtag-scraper_2025-06-17_07-27-31-310.csv'

print(f"\n\n=== Processing file: {csv_file} ===")

# 读取CSV文件
print("\nReading CSV file...")
try:
    df = pd.read_csv(csv_file)
    print(f"Successfully read {len(df)} records")
except FileNotFoundError:
    print(f"Error: File {csv_file} not found")
    print("Please ensure the CSV file is in the current directory")

# 检查数据概况
print(f"\nDataset contains {len(df.columns)} columns")
print(f"Does 'text' column exist: {'text' in df.columns}")

# 提取文本列并过滤空值
texts = df['text'].dropna().tolist()
print(f"\nFound {len(texts)} valid texts")

# 应用文本预处理
print("\nPreprocessing texts...")
preprocessed_texts = [preprocess_text(text) for text in tqdm(texts, desc="Preprocessing")]
# 过滤空文本
valid_preprocessed_texts = [text for text in preprocessed_texts if text.strip()]
print(f"After preprocessing, found {len(valid_preprocessed_texts)} valid texts")

# 显示前3条预处理后的文本作为示例
print("\nPreprocessed example texts:")
for i, text in enumerate(valid_preprocessed_texts[:3]):
    print(f"{i+1}. {text[:100]}...")

# 分析情感
print("\nStarting sentiment analysis (this may take a few minutes)...")
results = []

# 使用进度条
for text in tqdm(valid_preprocessed_texts, desc="Analysis Progress"):
    try:
        # 截断过长文本（模型有长度限制）
        truncated_text = text[:512] if len(text) > 512 else text

        # 分析情感
        result = classifier(truncated_text)[0]
        results.append({
            'original_text': text,  # 保留原始预处理文本
            'text': text,  # 为保持兼容性，沿用原字段名
            'sentiment': result['label'],
            'confidence': result['score']
        })

    except Exception as e:
        # 处理可能的错误
        results.append({
            'original_text': text,
            'text': text,
            'sentiment': 'ERROR',
            'confidence': 0.0
        })

# 创建结果DataFrame
results_df = pd.DataFrame(results)

# 统计分析
print("\n=== Sentiment Analysis Statistics ===")
sentiment_counts = results_df['sentiment'].value_counts()
print(sentiment_counts)
print(f"\nPositive Ratio: {(sentiment_counts.get('POSITIVE', 0) / len(results_df) * 100):.1f}%")
print(f"Negative Ratio: {(sentiment_counts.get('NEGATIVE', 0) / len(results_df) * 100):.1f}%")

# 词频统计
all_words = []
for text in valid_preprocessed_texts:
    all_words.extend(text.split())
word_freq = Counter(all_words)
print("\n=== Top 20 Most Frequent Words ===")
for word, count in word_freq.most_common(20):
    print(f"{word}: {count} times")

# 可视化情感分类结果
plt.figure(figsize=(14, 10))

# 1. 情感分布饼图
plt.subplot(2, 2, 1)
colors = ['#4CAF50', '#F44336', '#FFC107']  # 绿色(积极), 红色(消极), 黄色(错误)
plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=colors)
plt.title('Sentiment Distribution')

# 2. 情感分布柱状图
plt.subplot(2, 2, 2)
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)
plt.title('Sentiment Count')
plt.ylabel('Number of Posts')

# 3. 高频词柱状图
plt.subplot(2, 1, 2)  # 调整为跨列布局，避免重叠
top_words, top_counts = zip(*word_freq.most_common(15))
sns.barplot(x=top_words, y=top_counts, palette='viridis')
plt.title('Top 15 Frequent Words After Preprocessing')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.savefig('tiktok_sentiment_visualization.png', dpi=300)
print(f"\nVisualization saved to: tiktok_sentiment_visualization.png")
plt.close()

# 查看高置信度的积极和消极样本
print("\n=== High-confidence Positive Samples (Top 5) ===")
positive_samples = results_df[results_df['sentiment'] == 'POSITIVE'].nlargest(5, 'confidence')
for idx, row in positive_samples.iterrows():
    print(f"Confidence: {row['confidence']:.2%}")
    print(f"Text: {row['text'][:150]}...")
    print("-" * 50)

print("\n=== High-confidence Negative Samples (Top 5) ===")
negative_samples = results_df[results_df['sentiment'] == 'NEGATIVE'].nlargest(5, 'confidence')
for idx, row in negative_samples.iterrows():
    print(f"Confidence: {row['confidence']:.2%}")
    print(f"Text: {row['text'][:150]}...")
    print("-" * 50)

# 保存结果
output_file = 'tiktok_sentiment_results.csv'
results_df.to_csv(output_file, index=False, encoding='utf-8')
print(f"\nResults saved to: {output_file}")

# 可选：将情感结果合并回原始数据
df_with_sentiment = df.copy()
df_with_sentiment['sentiment'] = None
df_with_sentiment['sentiment_confidence'] = None

# 匹配并更新情感结果（基于原始文本）
for idx, row in df_with_sentiment.iterrows():
    if pd.notna(row['text']):
        # 预处理原始文本用于匹配
        processed_text = preprocess_text(row['text'])
        matching_result = results_df[results_df['original_text'] == processed_text]
        if not matching_result.empty:
            df_with_sentiment.at[idx, 'sentiment'] = matching_result.iloc[0]['sentiment']
            df_with_sentiment.at[idx, 'sentiment_confidence'] = matching_result.iloc[0]['confidence']

# 保存完整数据
complete_output_file = 'tiktok_complete_with_sentiment.csv'
df_with_sentiment.to_csv(complete_output_file, index=False, encoding='utf-8')
print(f"Complete data saved to: {complete_output_file}")

print("\nAnalysis completed!")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Loading sentiment analysis model...


Device set to use cpu




=== Processing file: /content/dataset_tiktok-hashtag-scraper_2025-06-17_07-27-31-310.csv ===

Reading CSV file...
Successfully read 178 records

Dataset contains 581 columns
Does 'text' column exist: True

Found 173 valid texts

Preprocessing texts...


Preprocessing: 100%|██████████| 173/173 [00:00<00:00, 42470.86it/s]


After preprocessing, found 165 valid texts

Preprocessed example texts:
1. happy new year guys loveanddeepspace zayneloveanddeepspace zayne lishenloveanddeepspace lishen otome...
2. im healer zayne zaynedit loveanddeepspace loveanddeepspaceedit dawnbreaker dawnbreakerzayne...
3. ooc...

Starting sentiment analysis (this may take a few minutes)...


Analysis Progress: 100%|██████████| 165/165 [00:14<00:00, 11.77it/s]

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)
  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_words, y=top_counts, palette='viridis')



=== Sentiment Analysis Statistics ===
sentiment
POSITIVE    87
NEGATIVE    78
Name: count, dtype: int64

Positive Ratio: 52.7%
Negative Ratio: 47.3%

=== Top 20 Most Frequent Words ===
zayne: 191 times
loveanddeepspace: 137 times
fyp: 89 times
zayneloveanddeepspace: 51 times
lishen: 39 times
loveanddeepspaceedit: 39 times
cosplay: 34 times
zaynedit: 30 times
sylus: 30 times
cos: 29 times
rafayel: 22 times
loveanddeepspacezayne: 18 times
fypage: 17 times
xavier: 14 times
otomegame: 13 times
cosplayer: 13 times
caleb: 13 times
lishenloveanddeepspace: 12 times
loveanddeepspacecharacters: 12 times
capcut: 12 times





Visualization saved to: tiktok_sentiment_visualization.png

=== High-confidence Positive Samples (Top 5) ===
Confidence: 99.96%
Text: happy chinese new year loveanddeepspace zayne caleb fyp handsome china...
--------------------------------------------------
Confidence: 99.92%
Text: cute zayne loveanddeepspace touch zayne zaynedit fyp...
--------------------------------------------------
Confidence: 99.92%
Text: correct way love mc sylus zayne loveanddeepspace funny cosplay...
--------------------------------------------------
Confidence: 99.89%
Text: handsome boys touch loveanddeepspace sylus zayne rafayel fyp...
--------------------------------------------------
Confidence: 99.66%
Text: loveanddeepspacezayne loveanddeepspaceedit loveanddeepspace zayne zayneedit handsome china...
--------------------------------------------------

=== High-confidence Negative Samples (Top 5) ===
Confidence: 99.64%
Text: bbut comparing looks personality girl shut looks nothing like grapist zayne lishe