In [1]:
import pandas as pd
from transformers import pipeline
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from collections import Counter
import os
from datetime import datetime

# 英文停用词列表
EN_STOP_WORDS = set([
    'the', 'a', 'an', 'in', 'to', 'for', 'and', 'is', 'are', 'was', 'were',
    'of', 'with', 'on', 'at', 'by', 'from', 'this', 'that', 'it', 'its',
    'he', 'she', 'they', 'their', 'has', 'have', 'had', 'not', 'but', 'or',
    'be', 'as', 'you', 'your', 'we', 'our', 'if', 'will', 'would', 'there',
    'here', 'all', 'any', 'one', 'more', 'so', 'than', 'then', 'these', 'those',
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
    't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o',
    're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn',
    'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
    'weren', 'won', 'wouldn'
])

# 文本预处理函数
def preprocess_text(text):
    # 1. 移除非英文字符和数字（保留英文、数字和空格）
    text = re.sub(r'[^a-zA-Z0-9\s]', '', str(text))

    # 2. 转换为小写
    text = text.lower()

    # 3. 分词
    words = text.split()

    # 4. 移除停用词和短词
    words = [word for word in words if word not in EN_STOP_WORDS and len(word) > 1]

    # 5. 移除纯数字词
    words = [word for word in words if not word.isdigit()]

    # 6. 拼接回文本
    return " ".join(words)

# 初始化情感分析模型（仅加载一次）
print("Loading sentiment analysis model...")
classifier = pipeline("sentiment-analysis")

# 定义五个角色
characters = ["Zayne", "Rafayel", "Sylus", "Xavier", "Caleb"]

# 为每个角色处理CSV文件并进行情感分析
for character in characters:
    print(f"\n\n=== Starting analysis for character: {character} ===")

    # 创建角色结果保存目录
    result_dir = f"{character}_sentiment_results"
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    # 获取该角色的所有CSV文件（假设文件名包含角色名）
    csv_files = [f for f in os.listdir() if f.endswith('.csv') and character.lower() in f.lower()]

    if not csv_files:
        print(f"Warning: No CSV files found for character {character}")
        continue

    print(f"Found {len(csv_files)} CSV files for {character}")

    # 合并多个CSV文件中的数据
    all_data = []
    for csv_file in csv_files:
        print(f"\nProcessing file: {csv_file}")
        try:
            df = pd.read_csv(csv_file)
            all_data.append(df)
            print(f"Successfully read {len(df)} records")
        except Exception as e:
            print(f"Error reading file {csv_file}: {e}")

    if not all_data:
        print(f"Error: No valid data found for character {character}")
        continue

    # 合并数据
    df = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal records for {character}: {len(df)}")

    # 检查数据概况
    print(f"Dataset contains {len(df.columns)} columns")
    print(f"Does 'text' column exist: {'text' in df.columns}")

    # 提取文本列并过滤空值
    texts = df['text'].dropna().tolist()
    print(f"\nFound {len(texts)} valid texts")

    # 应用文本预处理
    print("\nPreprocessing texts...")
    preprocessed_texts = [preprocess_text(text) for text in tqdm(texts, desc="Preprocessing")]
    # 过滤空文本
    valid_preprocessed_texts = [text for text in preprocessed_texts if text.strip()]
    print(f"After preprocessing, found {len(valid_preprocessed_texts)} valid texts")

    # 显示前3条预处理后的文本作为示例
    if valid_preprocessed_texts:
        print("\nPreprocessed example texts:")
        for i, text in enumerate(valid_preprocessed_texts[:3]):
            print(f"{i+1}. {text[:100]}...")
    else:
        print("\nNo valid preprocessed texts found after filtering")
        continue

    # 分析情感
    print("\nStarting sentiment analysis (this may take a few minutes)...")
    results = []

    # 使用进度条
    for text in tqdm(valid_preprocessed_texts, desc="Analysis Progress"):
        try:
            # 截断过长文本（模型有长度限制）
            truncated_text = text[:512] if len(text) > 512 else text

            # 分析情感
            result = classifier(truncated_text)[0]
            results.append({
                'original_text': text,  # 保留原始预处理文本
                'text': text,  # 为保持兼容性，沿用原字段名
                'character': character,  # 添加角色标识
                'sentiment': result['label'],
                'confidence': result['score']
            })

        except Exception as e:
            # 处理可能的错误
            results.append({
                'original_text': text,
                'text': text,
                'character': character,
                'sentiment': 'ERROR',
                'confidence': 0.0
            })

    # 创建结果DataFrame
    if not results:
        print(f"Error: No sentiment analysis results for character {character}")
        continue

    results_df = pd.DataFrame(results)

    # 统计分析
    print("\n=== Sentiment Analysis Statistics ===")
    sentiment_counts = results_df['sentiment'].value_counts()
    print(sentiment_counts)
    print(f"\nPositive Ratio: {(sentiment_counts.get('POSITIVE', 0) / len(results_df) * 100):.1f}%")
    print(f"Negative Ratio: {(sentiment_counts.get('NEGATIVE', 0) / len(results_df) * 100):.1f}%")

    # 词频统计
    all_words = []
    for text in valid_preprocessed_texts:
        all_words.extend(text.split())
    word_freq = Counter(all_words)
    print("\n=== Top 20 Most Frequent Words ===")
    for word, count in word_freq.most_common(20):
        print(f"{word}: {count} times")

    # 可视化情感分类结果
    plt.figure(figsize=(14, 10))

    # 1. 情感分布饼图
    plt.subplot(2, 2, 1)
    colors = ['#4CAF50', '#F44336', '#FFC107']  # 绿色(积极), 红色(消极), 黄色(错误)
    plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=colors)
    plt.title(f'{character} - Sentiment Distribution')

    # 2. 情感分布柱状图
    plt.subplot(2, 2, 2)
    sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)
    plt.title(f'{character} - Sentiment Count')
    plt.ylabel('Number of Posts')

    # 3. 高频词柱状图
    plt.subplot(2, 1, 2)  # 调整为跨列布局，避免重叠
    if word_freq:
        top_words, top_counts = zip(*word_freq.most_common(15))
        sns.barplot(x=top_words, y=top_counts, palette='viridis')
        plt.title(f'{character} - Top 15 Frequent Words')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45, ha='right')

    plt.tight_layout()
    vis_filename = f"{result_dir}/{character}_sentiment_visualization.png"
    plt.savefig(vis_filename, dpi=300)
    print(f"\nVisualization saved to: {vis_filename}")
    plt.close()

    # 查看高置信度的积极和消极样本
    print("\n=== High-confidence Positive Samples (Top 5) ===")
    positive_samples = results_df[results_df['sentiment'] == 'POSITIVE'].nlargest(5, 'confidence')
    for idx, row in positive_samples.iterrows():
        print(f"Confidence: {row['confidence']:.2%}")
        print(f"Text: {row['text'][:150]}...")
        print("-" * 50)

    print("\n=== High-confidence Negative Samples (Top 5) ===")
    negative_samples = results_df[results_df['sentiment'] == 'NEGATIVE'].nlargest(5, 'confidence')
    for idx, row in negative_samples.iterrows():
        print(f"Confidence: {row['confidence']:.2%}")
        print(f"Text: {row['text'][:150]}...")
        print("-" * 50)

    # 保存结果
    output_file = f"{result_dir}/{character}_sentiment_results.csv"
    results_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"\nResults saved to: {output_file}")

    # 可选：将情感结果合并回原始数据
    df_with_sentiment = df.copy()
    df_with_sentiment['sentiment'] = None
    df_with_sentiment['sentiment_confidence'] = None
    df_with_sentiment['character'] = character  # 添加角色列

    # 匹配并更新情感结果（基于原始文本）
    print("\nMerging sentiment results back to original data...")
    for idx, row in tqdm(df_with_sentiment.iterrows(), total=len(df_with_sentiment), desc="Merging"):
        if pd.notna(row['text']):
            # 预处理原始文本用于匹配
            processed_text = preprocess_text(row['text'])
            matching_result = results_df[results_df['original_text'] == processed_text]
            if not matching_result.empty:
                df_with_sentiment.at[idx, 'sentiment'] = matching_result.iloc[0]['sentiment']
                df_with_sentiment.at[idx, 'sentiment_confidence'] = matching_result.iloc[0]['confidence']

    # 保存完整数据
    complete_output_file = f"{result_dir}/{character}_complete_with_sentiment.csv"
    df_with_sentiment.to_csv(complete_output_file, index=False, encoding='utf-8')
    print(f"Complete data saved to: {complete_output_file}")

    print(f"\nAnalysis for {character} completed!\n")

# 生成所有角色的综合分析
print("=== Generating comprehensive analysis for all characters ===")
all_char_results = []

# 收集所有角色的结果
for character in characters:
    result_dir = f"{character}_sentiment_results"
    results_file = f"{result_dir}/{character}_sentiment_results.csv"

    if os.path.exists(results_file):
        try:
            char_results = pd.read_csv(results_file)
            all_char_results.append(char_results)
            print(f"Loaded {len(char_results)} results for {character}")
        except Exception as e:
            print(f"Error loading results for {character}: {e}")
    else:
        print(f"Results file not found for {character}")

if all_char_results:
    # 合并所有角色的结果
    combined_results = pd.concat(all_char_results, ignore_index=True)

    # 保存合并结果
    combined_output = "all_characters_sentiment_combined.csv"
    combined_results.to_csv(combined_output, index=False, encoding='utf-8')
    print(f"\nCombined results saved to: {combined_output}")

    # 生成综合可视化
    plt.figure(figsize=(16, 12))

    # 1. 各角色情感分布对比
    plt.subplot(2, 2, 1)
    for character in characters:
        char_data = combined_results[combined_results['character'] == character]
        sentiment_counts = char_data['sentiment'].value_counts(normalize=True)
        if 'POSITIVE' in sentiment_counts:
            plt.plot(character, sentiment_counts['POSITIVE'], 'o-', label=f'{character} Positive')
        if 'NEGATIVE' in sentiment_counts:
            plt.plot(character, sentiment_counts['NEGATIVE'], 's-', label=f'{character} Negative')

    plt.title('Sentiment Distribution by Character')
    plt.ylabel('Proportion')
    plt.legend()
    plt.xticks(rotation=45, ha='right')

    # 2. 各角色情感数量对比
    plt.subplot(2, 2, 2)
    char_counts = combined_results['character'].value_counts()
    sns.barplot(x=char_counts.index, y=char_counts.values, palette='viridis')
    plt.title('Number of Analyzed Posts by Character')
    plt.ylabel('Number of Posts')
    plt.xticks(rotation=45, ha='right')

    # 3. 所有角色高频词云（需要WordCloud库，若没有请先安装: pip install wordcloud）
    try:
        from wordcloud import WordCloud
        all_words = " ".join(combined_results[combined_results['sentiment'] == 'POSITIVE']['text'].tolist())
        plt.subplot(2, 2, 3)
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_words)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Positive Sentiment Across All Characters')
    except ImportError:
        print("Warning: WordCloud library not installed, skipping word cloud visualization")

    plt.tight_layout()
    comprehensive_vis = "all_characters_sentiment_visualization.png"
    plt.savefig(comprehensive_vis, dpi=300)
    print(f"Comprehensive visualization saved to: {comprehensive_vis}")
    plt.close()

    # 综合统计
    print("\n=== Comprehensive Sentiment Statistics Across All Characters ===")
    overall_sentiment = combined_results['sentiment'].value_counts()
    print(overall_sentiment)
    print(f"\nOverall Positive Ratio: {(overall_sentiment.get('POSITIVE', 0) / len(combined_results) * 100):.1f}%")
    print(f"Overall Negative Ratio: {(overall_sentiment.get('NEGATIVE', 0) / len(combined_results) * 100):.1f}%")

    print("\nAnalysis completed for all characters!")
else:
    print("No results available to generate comprehensive analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Loading sentiment analysis model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu




=== Starting analysis for character: Zayne ===
Found 2 CSV files for Zayne

Processing file: Zayne2.csv
Successfully read 327 records

Processing file: Zayne1.csv
Successfully read 178 records

Total records for Zayne: 505
Dataset contains 796 columns
Does 'text' column exist: True

Found 477 valid texts

Preprocessing texts...


Preprocessing: 100%|██████████| 477/477 [00:00<00:00, 49397.14it/s]


After preprocessing, found 468 valid texts

Preprocessed example texts:
1. zayne im lost words loveanddeepspace lndsea zayneloveanddeepspace...
2. getting married monday hope see ac a1exmp3 lexx aphelia soff zy zayne zayneedit loveanddeepspacezayn...
3. zayne zayneloveanddeepspace zaynedit loveanddeepspace loveanddeepspaceedit...

Starting sentiment analysis (this may take a few minutes)...


Analysis Progress: 100%|██████████| 468/468 [00:44<00:00, 10.55it/s]

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)
  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_words, y=top_counts, palette='viridis')



=== Sentiment Analysis Statistics ===
sentiment
NEGATIVE    255
POSITIVE    213
Name: count, dtype: int64

Positive Ratio: 45.5%
Negative Ratio: 54.5%

=== Top 20 Most Frequent Words ===
zayne: 431 times
loveanddeepspace: 312 times
fyp: 247 times
zayneloveanddeepspace: 157 times
zayn: 102 times
loveanddeepspaceedit: 97 times
zaynmalik: 75 times
sylus: 66 times
loveanddeepspacezayne: 60 times
zaynedit: 57 times
zayneedit: 54 times
edit: 52 times
rafayel: 52 times
lads: 49 times
foryoupage: 45 times
otomegame: 44 times
lishen: 44 times
loveanddeepspacecharacters: 43 times
xavier: 43 times
cosplay: 40 times

Visualization saved to: Zayne_sentiment_results/Zayne_sentiment_visualization.png

=== High-confidence Positive Samples (Top 5) ===
Confidence: 99.98%
Text: warm hug heart zayn onedirection fypp...
--------------------------------------------------
Confidence: 99.96%
Text: happy chinese new year loveanddeepspace zayne caleb fyp handsome china...
--------------------------------------

Merging: 100%|██████████| 505/505 [00:00<00:00, 1112.57it/s]


Complete data saved to: Zayne_sentiment_results/Zayne_complete_with_sentiment.csv

Analysis for Zayne completed!



=== Starting analysis for character: Rafayel ===
Found 1 CSV files for Rafayel

Processing file: Rafayel.csv
Successfully read 208 records

Total records for Rafayel: 208
Dataset contains 749 columns
Does 'text' column exist: True

Found 208 valid texts

Preprocessing texts...


Preprocessing: 100%|██████████| 208/208 [00:00<00:00, 23274.96it/s]


After preprocessing, found 208 valid texts

Preprocessed example texts:
1. cant post main account loveanddeepspacerafayel loveanddeepspace rafayelloveanddeepspace loveanddeeps...
2. rafayel finally marrying beloved bride loveanddeepspace rafayel loveanddeepspacerafayel fyp lads...
3. hes cute cant rafayel rafayelloveanddeepspace otomegame loveanddeepspace fyp...

Starting sentiment analysis (this may take a few minutes)...


Analysis Progress: 100%|██████████| 208/208 [00:22<00:00,  9.37it/s]

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)
  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_words, y=top_counts, palette='viridis')



=== Sentiment Analysis Statistics ===
sentiment
NEGATIVE    113
POSITIVE     95
Name: count, dtype: int64

Positive Ratio: 45.7%
Negative Ratio: 54.3%

=== Top 20 Most Frequent Words ===
rafayel: 256 times
loveanddeepspace: 191 times
fyp: 146 times
rafayelloveanddeepspace: 132 times
loveanddeepspaceedit: 69 times
rafayeledit: 61 times
loveanddeepspacerafayel: 54 times
otomegame: 52 times
lads: 47 times
otome: 47 times
edit: 37 times
love: 26 times
loveanddeepspacecharacters: 25 times
foryou: 22 times
foryoupage: 19 times
viral: 17 times
sylus: 17 times
hes: 16 times
im: 16 times
rafayelloveanddeepspaceedit: 14 times

Visualization saved to: Rafayel_sentiment_results/Rafayel_sentiment_visualization.png

=== High-confidence Positive Samples (Top 5) ===
Confidence: 99.93%
Text: rafayel officially rafayels birthday month yall idea much love rafayel character honestly well written sad time hes meets eye thats sure happy birthda...
--------------------------------------------------
Confiden

Merging: 100%|██████████| 208/208 [00:00<00:00, 1582.39it/s]


Complete data saved to: Rafayel_sentiment_results/Rafayel_complete_with_sentiment.csv

Analysis for Rafayel completed!



=== Starting analysis for character: Sylus ===
Found 2 CSV files for Sylus

Processing file: Sylus2.csv
Successfully read 218 records

Processing file: Sylus1.csv
Successfully read 138 records

Total records for Sylus: 356
Dataset contains 950 columns
Does 'text' column exist: True

Found 356 valid texts

Preprocessing texts...


Preprocessing: 100%|██████████| 356/356 [00:00<00:00, 27104.23it/s]


After preprocessing, found 337 valid texts

Preprocessed example texts:
1. sylus please lord sylus loveanddeepspace loveanddeepspaceedit sylusloveanddeepspaceedit love deep sp...
2. need biker sylus loveanddeepspace sylus sylusloveanddeepspace loveanddeepspaceedit sylusedit fyp...
3. im alive sylus loveanddeepspace loveanddeepspacegame fyp fypviral foryou...

Starting sentiment analysis (this may take a few minutes)...


Analysis Progress: 100%|██████████| 337/337 [00:34<00:00,  9.73it/s]

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)
  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_words, y=top_counts, palette='viridis')



=== Sentiment Analysis Statistics ===
sentiment
NEGATIVE    261
POSITIVE     76
Name: count, dtype: int64

Positive Ratio: 22.6%
Negative Ratio: 77.4%

=== Top 20 Most Frequent Words ===
sylus: 386 times
loveanddeepspace: 246 times
fyp: 197 times
sylusloveanddeepspace: 192 times
sylusedit: 120 times
loveanddeepspaceedit: 97 times
lads: 75 times
otomegame: 58 times
loveanddeepspacesylus: 50 times
loveanddeepspacecharacters: 38 times
viral: 35 times
im: 31 times
foryoupage: 31 times
otome: 26 times
foryou: 24 times
sylusloveanddeepspaceedit: 22 times
love: 22 times
edit: 22 times
qinche: 20 times
xinche: 20 times

Visualization saved to: Sylus_sentiment_results/Sylus_sentiment_visualization.png

=== High-confidence Positive Samples (Top 5) ===
Confidence: 99.95%
Text: perfect night starlightboys xinche fyp...
--------------------------------------------------
Confidence: 99.89%
Text: handsome boys touch loveanddeepspace sylus zayne rafayel fyp...
----------------------------------------

Merging: 100%|██████████| 356/356 [00:00<00:00, 1453.57it/s]


Complete data saved to: Sylus_sentiment_results/Sylus_complete_with_sentiment.csv

Analysis for Sylus completed!



=== Starting analysis for character: Xavier ===
Found 2 CSV files for Xavier

Processing file: Xavier1.csv
Successfully read 154 records

Processing file: Xavier2.csv
Successfully read 279 records

Total records for Xavier: 433
Dataset contains 833 columns
Does 'text' column exist: True

Found 424 valid texts

Preprocessing texts...


Preprocessing: 100%|██████████| 424/424 [00:00<00:00, 48044.98it/s]


After preprocessing, found 417 valid texts

Preprocessed example texts:
1. xavier xavierloveanddeepspace xavieredit otomegame loveanddeepspacegame...
2. 3dlove deep space rafayel zayne aster sylus...
3. let call daddyloveanddeepspace sylus loveanddeepspaceedit sylusedit fyp xavier...

Starting sentiment analysis (this may take a few minutes)...


Analysis Progress: 100%|██████████| 417/417 [00:34<00:00, 12.12it/s]

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)
  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_words, y=top_counts, palette='viridis')



=== Sentiment Analysis Statistics ===
sentiment
NEGATIVE    288
POSITIVE    129
Name: count, dtype: int64

Positive Ratio: 30.9%
Negative Ratio: 69.1%

=== Top 20 Most Frequent Words ===
xavier: 338 times
fyp: 228 times
loveanddeepspace: 167 times
xavierloveanddeepspace: 75 times
xaviersobased: 48 times
loveanddeepspaceedit: 45 times
viral: 38 times
loveanddeepspacexavier: 36 times
foryoupage: 36 times
sylus: 32 times
love: 32 times
cos: 32 times
edit: 31 times
rafayel: 30 times
cosplay: 29 times
xavieredit: 28 times
shenxinghui: 25 times
foryou: 25 times
fypviral: 25 times
xavierlegette: 25 times

Visualization saved to: Xavier_sentiment_results/Xavier_sentiment_visualization.png

=== High-confidence Positive Samples (Top 5) ===
Confidence: 99.98%
Text: beauty power lumiere xavier lumiere...
--------------------------------------------------
Confidence: 99.97%
Text: xavier worthy xavier worthy...
--------------------------------------------------
Confidence: 99.95%
Text: funny fyp xu

Merging: 100%|██████████| 433/433 [00:00<00:00, 1356.77it/s]


Complete data saved to: Xavier_sentiment_results/Xavier_complete_with_sentiment.csv

Analysis for Xavier completed!



=== Starting analysis for character: Caleb ===
Found 2 CSV files for Caleb

Processing file: Caleb2.csv
Successfully read 322 records

Processing file: Caleb1.csv
Successfully read 142 records

Total records for Caleb: 464
Dataset contains 882 columns
Does 'text' column exist: True

Found 447 valid texts

Preprocessing texts...


Preprocessing: 100%|██████████| 447/447 [00:00<00:00, 63703.37it/s]


After preprocessing, found 442 valid texts

Preprocessed example texts:
1. im girl caleb loveanddeepspace lads xiayizhou fyp...
2. def care bout yo bodiesfyp...
3. loveanddeepspace lads adsedit loveanddeepspaceedit caleb calebedit edit edits funny caleb caleblovea...

Starting sentiment analysis (this may take a few minutes)...


Analysis Progress: 100%|██████████| 442/442 [00:39<00:00, 11.29it/s]

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)
  sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette=colors)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_words, y=top_counts, palette='viridis')



=== Sentiment Analysis Statistics ===
sentiment
NEGATIVE    280
POSITIVE    162
Name: count, dtype: int64

Positive Ratio: 36.7%
Negative Ratio: 63.3%

=== Top 20 Most Frequent Words ===
caleb: 425 times
fyp: 227 times
loveanddeepspace: 215 times
calebloveanddeepspace: 102 times
lads: 91 times
loveanddeepspaceedit: 89 times
loveanddeepspacecaleb: 63 times
calebedit: 52 times
sylus: 51 times
zayne: 43 times
edit: 41 times
foryoupage: 41 times
xavier: 40 times
rafayel: 39 times
viral: 37 times
foryou: 35 times
loveanddeepspacecharacters: 35 times
xiayizhou: 34 times
otome: 32 times
fypage: 30 times

Visualization saved to: Caleb_sentiment_results/Caleb_sentiment_visualization.png

=== High-confidence Positive Samples (Top 5) ===
Confidence: 99.99%
Text: thank much support newmusic...
--------------------------------------------------
Confidence: 99.98%
Text: reply manhah thanks support guys appreciate...
--------------------------------------------------
Confidence: 99.97%
Text: goodmor

Merging: 100%|██████████| 464/464 [00:00<00:00, 1542.50it/s]


Complete data saved to: Caleb_sentiment_results/Caleb_complete_with_sentiment.csv

Analysis for Caleb completed!

=== Generating comprehensive analysis for all characters ===
Loaded 468 results for Zayne
Loaded 208 results for Rafayel
Loaded 337 results for Sylus
Loaded 417 results for Xavier
Loaded 442 results for Caleb

Combined results saved to: all_characters_sentiment_combined.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=char_counts.index, y=char_counts.values, palette='viridis')


Comprehensive visualization saved to: all_characters_sentiment_visualization.png

=== Comprehensive Sentiment Statistics Across All Characters ===
sentiment
NEGATIVE    1197
POSITIVE     675
Name: count, dtype: int64

Overall Positive Ratio: 36.1%
Overall Negative Ratio: 63.9%

Analysis completed for all characters!
