In [2]:
import pandas as pd
import re
import nltk
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer


# ----------- Read the CSV file, basic setup -----------
# Path of the final processed file. The sentiment cell below also writes its
# results back to this same path.
final_file_path = '/Users/yelv/Desktop/5507小组作业/trans_cleaned(en_sentiment).csv'  # final processed file

# Try UTF-8 first. If the file is not valid UTF-8, report the error and fall
# back to latin-1 so that `df` is always defined for the following cells
# (previously a decode error left `df` unbound and the next cell failed with
# a NameError). latin-1 maps every byte to a character, so the retry cannot
# raise UnicodeDecodeError.
try:
    df = pd.read_csv(final_file_path, encoding='utf-8')
except UnicodeDecodeError as e:
    print(f"读取文件时出错: {e}")
    df = pd.read_csv(final_file_path, encoding='latin-1')

In [None]:
# Make sure the VADER lexicon is available, then build the analyzer.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Add the sentiment scores.
# Rows whose cleaned text is missing are loaded by read_csv as float NaN, and
# VADER cannot process non-string input. Score those rows as empty text
# (which yields all-zero scores) without modifying 'Basic_Cleaned' itself.
text_for_scoring = df['Basic_Cleaned'].fillna('').astype(str)
df['Sentiment'] = text_for_scoring.apply(sia.polarity_scores)

# Extract the compound score from each score dict.
df['Compound'] = df['Sentiment'].apply(lambda scores: scores['compound'])

# Derive the label from the compound score:
# > 0.05 -> Positive, < -0.05 -> Negative, anything in between -> Neutral.
df['Sentiment_Label'] = df['Compound'].apply(lambda x: 'Positive' if x > 0.05 else ('Negative' if x < -0.05 else 'Neutral'))

# Preview the sentiment analysis results.
print(df[['Basic_Cleaned', 'Sentiment', 'Compound', 'Sentiment_Label']].head(10))

# Save the results. NOTE: this overwrites the file the data was loaded from.
df.to_csv(final_file_path, index=False, encoding='utf-8')
print(f"情感分析结果已保存到 {final_file_path}。")

# Count the comments in each sentiment category.
sentiment_counts = df['Sentiment_Label'].value_counts()

# Print the count for each category.
print("各情感类别数量:")
for label, count in sentiment_counts.items():
    print(f"{label}: {count}")

# Total number of comments.
total_comments = len(df)

# Share of each category, in percent.
sentiment_ratios = sentiment_counts / total_comments * 100

# Print the percentages.
print("情感分析结果比例:")
print(sentiment_ratios)

# Save the percentages to a CSV with columns 'Sentiment_Label' and 'Percentage'
# (read back by the pie-chart cell).
sentiment_ratios_df = sentiment_ratios.reset_index()
sentiment_ratios_df.columns = ['Sentiment_Label', 'Percentage']
sentiment_ratios_output_file = 'sentiment_ratios.csv'
sentiment_ratios_df.to_csv(sentiment_ratios_output_file, index=False, encoding='utf-8')
print(f"情感比例结果已保存到 {sentiment_ratios_output_file}。")

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

# Read the percentage table written by the sentiment cell. It is bound to its
# own name rather than `df`, so the comment-level frame stays available and the
# sentiment cell can be re-run after this one.
sentiment_ratios_df = pd.read_csv('sentiment_ratios.csv')

# Sentiment names and their percentages.
labels = sentiment_ratios_df['Sentiment_Label']
sizes = sentiment_ratios_df['Percentage']

# Colour each slice by its label instead of by row position, so a given
# sentiment keeps the same colour no matter how the counts are ranked.
# Labels not listed here fall back to 'gold'.
sentiment_colors = {
    'Positive': 'lightgreen',
    'Negative': 'lightcoral',
    'Neutral': 'lightskyblue',
}
colors = [sentiment_colors.get(label, 'gold') for label in labels]

# Draw the pie chart.
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Sentiment Distribution')
plt.show()

In [None]:
import pandas as pd

# Read the merged comment table (it keeps the per-comment language column).
file_path = '/Users/yelv/Desktop/翻译后清洗分词/trans_merged_output(保留lang).csv'
data = pd.read_csv(file_path)

# Extract the 'Language' column.
languages = data['Language']

# Number of comments per language.
language_counts = languages.value_counts()

# Print the result.
print(language_counts)

# Write the counts with explicit headers for BOTH columns. Without
# index_label the header of the language column depends on the pandas version
# (blank before pandas 2.0), and the next cell, which reads data['Language'],
# would fail with a KeyError.
language_counts.to_csv('/Users/yelv/Desktop/语言评论数量.csv', header=['Counts'], index_label='Language')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Lookup table from language code to its formal English name.
# Built from an ordered list of (code, name) pairs; insertion order is preserved.
language_mapping = dict([
    ('en', 'English'), ('es', 'Spanish'), ('it', 'Italian'),
    ('de', 'German'), ('fr', 'French'), ('pt', 'Portuguese'),
    ('id', 'Indonesian'), ('sw', 'Swahili'), ('nl', 'Dutch'),
    ('sl', 'Slovene'), ('et', 'Estonian'), ('pl', 'Polish'),
    ('mt', 'Maltese'), ('ko', 'Korean'), ('tl', 'Tagalog'),
    ('ms', 'Malay'), ('da', 'Danish'), ('sv', 'Swedish'),
    ('fi', 'Finnish'), ('hi', 'Hindi'), ('ro', 'Romanian'),
    ('lt', 'Lithuanian'), ('eu', 'Basque'), ('zu', 'Zulu'),
    ('xh', 'Xhosa'), ('ar', 'Arabic'), ('no', 'Norwegian'),
    ('cs', 'Czech'), ('vi', 'Vietnamese'), ('jv', 'Javanese'),
    ('ru', 'Russian'), ('cy', 'Welsh'), ('hr', 'Croatian'),
    ('af', 'Afrikaans'), ('mg', 'Malagasy'), ('ca', 'Catalan'),
    ('th', 'Thai'), ('tr', 'Turkish'), ('oc', 'Occitan'),
    ('br', 'Breton'), ('rw', 'Kinyarwanda'), ('eo', 'Esperanto'),
    ('lv', 'Latvian'), ('ga', 'Irish'), ('la', 'Latin'),
    ('hu', 'Hungarian'), ('ne', 'Nepali'), ('mr', 'Marathi'),
    ('ht', 'Haitian Creole'), ('ja', 'Japanese'), ('bs', 'Bosnian'),
    ('sk', 'Slovak'), ('gl', 'Galician'), ('vo', 'Volapük'),
    ('nn', 'Norwegian Nynorsk'), ('an', 'Aragonese'), ('bn', 'Bengali'),
    ('kn', 'Kannada'), ('lb', 'Luxembourgish'), ('qu', 'Quechua'),
    ('gu', 'Gujarati'), ('lo', 'Lao'), ('el', 'Greek'),
    ('is', 'Icelandic'), ('sq', 'Albanian'), ('az', 'Azerbaijani'),
    ('bg', 'Bulgarian'), ('fa', 'Persian'), ('nb', 'Norwegian Bokmål'),
    ('sr', 'Serbian'), ('ku', 'Kurdish'),
])

# Read the per-language comment counts (columns 'Language' and 'Counts').
data = pd.read_csv('/Users/yelv/Desktop/语言评论数量.csv')

# Translate language codes into formal names with the mapping dictionary.
# A code that is missing from the dictionary maps to NaN; previously such rows
# were silently removed by dropna() below, which could drop a language with
# many comments from the chart. Keep those rows under their raw code and
# report them so the dictionary can be extended.
language_codes = data['Language']
language_names = language_codes.map(language_mapping)
unmapped_codes = language_codes[language_names.isna() & language_codes.notna()]
if not unmapped_codes.empty:
    print(f"Language codes without a formal name (kept as-is): {sorted(unmapped_codes.astype(str).unique())}")
data['Language'] = language_names.fillna(language_codes)

# Drop rows that still contain missing values (e.g. no language code at all).
data = data.dropna()

# Save to a new CSV file.
data.to_csv('语言评论数量_正式名称.csv', index=False)

# Sort by comment count and keep the ten largest.
top_10_data = data.sort_values(by='Counts', ascending=False).head(10)
# Create the bar chart.
plt.figure(figsize=(12, 8))  # figure size
plt.bar(top_10_data['Language'], top_10_data['Counts'], color='skyblue')

# Title and axis labels.
plt.title('Top 10 Language Comment Counts')
plt.xlabel('Language')
plt.ylabel('Counts')

# Show the figure.
plt.xticks(rotation=45)  # rotate the x tick labels so they stay readable
plt.tight_layout()  # adjust the layout so everything fits inside the figure
plt.show()

In [None]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

# Build the VADER analyzer used to score each vocabulary entry.
sia = SentimentIntensityAnalyzer()

# Load the word-frequency table; the loop reads its 'Word' and 'Frequency' columns.
file_path = '/Users/yelv/Desktop/5507小组作业/trans_word_frequency_analysis.csv'
data = pd.read_csv(file_path)

# Running totals of occurrences of positive and negative words.
positive_word_count = 0
negative_word_count = 0

# Score every word on its own and add its frequency to the matching total.
for index, row in data.iterrows():
    word = str(row['Word']).lower()  # force to a lowercase string
    if not word:
        # Skip empty strings.
        continue
    compound = sia.polarity_scores(word)['compound']
    if compound > 0.05:  # positive threshold
        positive_word_count += row['Frequency']
    elif compound < -0.05:  # negative threshold
        negative_word_count += row['Frequency']

# Report both totals.
print(f"Positive Word Frequency: {positive_word_count}")
print(f"Negative Word Frequency: {negative_word_count}")

In [None]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

from pathlib import Path

# Initialise the VADER analyzer.
sia = SentimentIntensityAnalyzer()

# Read the word-frequency table ('Word' and 'Frequency' columns are used).
file_path = '/Users/yelv/Desktop/5507小组作业/trans_word_frequency_analysis.csv'
data = pd.read_csv(file_path)

# Per-word frequency totals, keyed by the lowercased word.
positive_word_count = {}
negative_word_count = {}

# Score each word on its own and accumulate its frequency under the matching
# polarity. NOTE(review): the boundaries here are inclusive (>= 0.05 / <= -0.05)
# while the totals cell uses strict comparisons; confirm which is intended.
for index, row in data.iterrows():
    word = str(row['Word']).lower()
    score = sia.polarity_scores(word)
    frequency = int(row['Frequency'])
    if score['compound'] >= 0.05:  # positive threshold
        positive_word_count[word] = positive_word_count.get(word, 0) + frequency
    elif score['compound'] <= -0.05:  # negative threshold
        negative_word_count[word] = negative_word_count.get(word, 0) + frequency

# Convert the results to DataFrames.
positive_df = pd.DataFrame(list(positive_word_count.items()), columns=['Word', 'Positive_Count'])
negative_df = pd.DataFrame(list(negative_word_count.items()), columns=['Word', 'Negative_Count'])

# Save the tables next to the input file. The plotting cells read
# Positive_Word_Frequency.csv / Negative_Word_Frequency.csv from that
# directory; writing relative to the current working directory could leave
# them reading a stale or missing file.
output_dir = Path(file_path).parent
positive_df.to_csv(output_dir / 'Positive_Word_Frequency.csv', index=False)
negative_df.to_csv(output_dir / 'Negative_Word_Frequency.csv', index=False)

# Print the results.
print(positive_df)
print(negative_df)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the per-word positive counts (columns 'Word' and 'Positive_Count'),
# ordered from most to least frequent.
file_path = '/Users/yelv/Desktop/5507小组作业/Positive_Word_Frequency.csv'
df = pd.read_csv(file_path).sort_values(by='Positive_Count', ascending=False)

# Keep the 20 most frequent words.
top_20 = df.head(20)

# Bar chart drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(top_20['Word'], top_20['Positive_Count'], color='skyblue')
ax.set_xlabel('Word')
ax.set_ylabel('Frequency')
ax.set_title('Top 20 Positive Words Frequency')
plt.xticks(rotation=45)  # tilt the word labels by 45 degrees
plt.tight_layout()  # fit labels and title inside the figure

# Render the figure.
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the per-word negative counts (columns 'Word' and 'Negative_Count'),
# ordered from most to least frequent.
file_path = '/Users/yelv/Desktop/5507小组作业/Negative_Word_Frequency.csv'
df = pd.read_csv(file_path).sort_values(by='Negative_Count', ascending=False)

# Keep the 20 most frequent words.
top_20 = df.head(20)

# Bar chart drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(top_20['Word'], top_20['Negative_Count'], color='skyblue')
ax.set_xlabel('Word')
ax.set_ylabel('Frequency')
ax.set_title('Top 20 Negative Words Frequency')
plt.xticks(rotation=45)  # tilt the word labels by 45 degrees
plt.tight_layout()  # fit labels and title inside the figure

# Render the figure.
plt.show()
