In [1]:
import os
from textblob import TextBlob
import pandas as pd
import re

# Folder that holds the cleaned monthly text files to analyse
directory_path = r'C:\Users\22749\Desktop\UoG\Fintech\Dissertation\Data\Cleaned_text'

# Accumulates one result row (dict) per analysed file
results = list()

# Calendar months in chronological order; a month's position here is its sort rank
months_order = ["January", "February", "March", "April", "May", "June",
                "July", "August", "September", "October", "November", "December"]


def sort_key(filename):
    """Return a chronological sort key for a ``<Month>_<YYYY>.txt`` file name.

    Args:
        filename: Bare file name (no directory part), e.g. ``"April_2022.txt"``.

    Returns:
        ``(year, month_index)`` where ``month_index`` is the 0-based position
        of the month in ``months_order``. A name that does not follow the
        pattern, or whose alphabetic prefix is not a month name, yields
        ``(9999, 0)`` so that it sorts after the valid files.
    """
    # fullmatch (not match) so names with trailing junk such as
    # "April_2022.txt.bak" are treated as invalid.
    match = re.fullmatch(r"([a-zA-Z]+)_(\d{4})\.txt", filename)
    if match:
        month, year = match.groups()
        try:
            # capitalize() makes the lookup case-insensitive ("april" -> "April")
            month_index = months_order.index(month.capitalize())
        except ValueError:
            # Alphabetic prefix that is not a month, e.g. "Summary_2022.txt":
            # fall back to the sentinel instead of crashing the whole sort.
            return (9999, 0)
        return (int(year), month_index)
    return (9999, 0)  # sentinel: invalid names sort last

# Walk the directory in chronological order and score each monthly text file
for filename in sorted(os.listdir(directory_path), key=sort_key):
    file_path = os.path.join(directory_path, filename)

    # Only regular .txt files are analysed; sub-directories and stray files
    # (e.g. spreadsheets, OS metadata) would otherwise fail in open()/read()
    # or pollute the results.
    if not (filename.endswith('.txt') and os.path.isfile(file_path)):
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Run TextBlob sentiment analysis on the whole file
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity

    # Record one row per file
    results.append({
        'Filename': filename,
        'Model': 'textblob',
        'Polarity': polarity,
        'Subjectivity': subjectivity
    })

# Build the DataFrame and save it as an Excel workbook. The explicit column
# list keeps the header row even when no file was analysed.
df = pd.DataFrame(results, columns=['Filename', 'Model', 'Polarity', 'Subjectivity'])
output_path = 'C:\\Users\\22749\\Desktop\\UoG\\Fintech\\Dissertation\\Data\\results_textblob.xlsx'
df.to_excel(output_path, index=False)

print(f'Sentiment analysis results saved to {output_path}')


Sentiment analysis results saved to C:\Users\22749\Desktop\UoG\Fintech\Dissertation\Data\results_textblob.xlsx


In [12]:
import os
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Create the VADER sentiment analyser instance
analyzer = SentimentIntensityAnalyzer()

# Path of the single monthly news file analysed by this cell. It is kept in
# its own variable so that `directory_path` (a folder in the other cells) is
# not overwritten with a file path.
file_path = "C:\\Users\\22749\\Desktop\\UoG\\Fintech\\Dissertation\\Data\\News\\monthly\\April_2022.txt"
filename = os.path.basename(file_path)

# Collected result rows
results = []

# English stop-word list
stop_words = set(stopwords.words('english'))

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Split into paragraphs on blank lines BEFORE tokenising: re-joining the
# tokens with single spaces discards the blank lines, so splitting afterwards
# would always produce exactly one "paragraph".
paragraphs = []
for raw_paragraph in re.split(r'\n\s*\n', text):
    # Remove stop words from this paragraph
    word_tokens = word_tokenize(raw_paragraph)
    filtered_paragraph = ' '.join(
        word for word in word_tokens if word.lower() not in stop_words
    )
    if filtered_paragraph:  # ignore paragraphs that end up empty
        paragraphs.append(filtered_paragraph)

# Running totals for the file; 'Paragraphs' is the divisor for the averages
file_sentiment_scores = {
    'Positive': 0.0,
    'Neutral': 0.0,
    'Negative': 0.0,
    'Compound': 0.0,
    'Paragraphs': len(paragraphs)
}

# Score every paragraph with VADER and accumulate the totals
for paragraph in paragraphs:
    sentiment_scores = analyzer.polarity_scores(paragraph)
    file_sentiment_scores['Positive'] += sentiment_scores['pos']
    file_sentiment_scores['Neutral'] += sentiment_scores['neu']
    file_sentiment_scores['Negative'] += sentiment_scores['neg']
    file_sentiment_scores['Compound'] += sentiment_scores['compound']

# Average per paragraph; skipped for an empty file to avoid dividing by zero
if paragraphs:
    results.append({
        'Filename': filename,
        'Model': 'vader',
        'Positive': file_sentiment_scores['Positive'] / file_sentiment_scores['Paragraphs'],
        'Neutral': file_sentiment_scores['Neutral'] / file_sentiment_scores['Paragraphs'],
        'Negative': file_sentiment_scores['Negative'] / file_sentiment_scores['Paragraphs'],
        'Compound': file_sentiment_scores['Compound'] / file_sentiment_scores['Paragraphs']
    })
    print(f'Processed {filename}')

# Build the DataFrame and save it as an Excel workbook
df = pd.DataFrame(results)
output_path = 'C:\\Users\\22749\\Desktop\\UoG\\Fintech\\Dissertation\\Data\\results_vader.xlsx'
df.to_excel(output_path, index=False)

print(f'Sentiment analysis results saved to {output_path}')


KeyboardInterrupt: 

In [None]:
import os
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from concurrent.futures import ThreadPoolExecutor

# Shared VADER analyser used for every file in this cell
analyzer = SentimentIntensityAnalyzer()

# Folder that holds the monthly text files to analyse
directory_path = r'C:\Users\22749\Desktop\UoG\Fintech\Dissertation\Data\1'

# Accumulates one result row (dict) per processed file
results = list()

# Calendar months in chronological order; a month's position here is its sort rank
months_order = ["January", "February", "March", "April", "May", "June",
                "July", "August", "September", "October", "November", "December"]


def sort_key(filename):
    """Return a chronological sort key for a ``<Month>_<YYYY>.txt`` file name.

    Args:
        filename: Bare file name (no directory part), e.g. ``"April_2022.txt"``.

    Returns:
        ``(year, month_index)`` where ``month_index`` is the 0-based position
        of the month in ``months_order``. A name that does not follow the
        pattern, or whose alphabetic prefix is not a month name, yields
        ``(9999, 0)`` so that it sorts after the valid files.
    """
    # fullmatch (not match) so names with trailing junk such as
    # "April_2022.txt.bak" are treated as invalid.
    match = re.fullmatch(r"([a-zA-Z]+)_(\d{4})\.txt", filename)
    if match:
        month, year = match.groups()
        try:
            # capitalize() makes the lookup case-insensitive ("april" -> "April")
            month_index = months_order.index(month.capitalize())
        except ValueError:
            # Alphabetic prefix that is not a month, e.g. "Summary_2022.txt":
            # fall back to the sentinel instead of crashing the whole sort.
            return (9999, 0)
        return (int(year), month_index)
    return (9999, 0)  # sentinel: invalid names sort last

# English stop-word list used to filter tokens before scoring
stop_words = set(stopwords.words('english'))


def process_file(filename, paragraph_size=10):
    """Compute the average VADER scores of one text file, paragraph by paragraph.

    The file is split into sentences, each sentence is stripped of punctuation
    and stop words, and the cleaned sentences are grouped into pseudo-paragraphs
    of ``paragraph_size`` sentences. Every paragraph is scored with VADER and
    the four scores are averaged over the paragraphs.

    Args:
        filename: Name of a file inside ``directory_path``.
        paragraph_size: Number of sentences per scored paragraph (default 10).

    Returns:
        A dict with keys ``Filename``, ``Model``, ``Positive``, ``Neutral``,
        ``Negative`` and ``Compound``; or ``None`` when the file is not a
        ``.txt`` file or contains no scorable text.

    Raises:
        ValueError: If ``paragraph_size`` is smaller than 1.
    """
    if paragraph_size < 1:
        raise ValueError('paragraph_size must be at least 1')

    if not filename.endswith('.txt'):
        return None

    file_path = os.path.join(directory_path, filename)

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Sentence-split the RAW text first. Stripping punctuation beforehand
    # removes the sentence terminators, which leaves the tokenizer with a
    # single giant "sentence" and makes the paragraph grouping a no-op.
    # NOTE(review): NLTK's English stop-word list appears to include negations
    # such as "not"/"no", which VADER uses to flip polarity — confirm that
    # removing them (and the punctuation) before scoring is intended.
    sentences = []
    for raw_sentence in sent_tokenize(text):
        # Pre-processing: drop punctuation and special characters
        stripped = re.sub(r'[^\w\s]', '', raw_sentence)
        # Remove stop words
        kept_words = [word for word in word_tokenize(stripped)
                      if word.lower() not in stop_words]
        if kept_words:  # ignore sentences that end up empty
            sentences.append(' '.join(kept_words))

    # Group the cleaned sentences into fixed-size paragraphs
    paragraphs = [' '.join(sentences[i:i + paragraph_size])
                  for i in range(0, len(sentences), paragraph_size)]

    # Nothing to score (empty file): avoid dividing by zero below
    if not paragraphs:
        return None

    # Running totals for the file; 'Paragraphs' is the divisor for the averages
    file_sentiment_scores = {
        'Positive': 0.0,
        'Neutral': 0.0,
        'Negative': 0.0,
        'Compound': 0.0,
        'Paragraphs': len(paragraphs)
    }

    for paragraph in paragraphs:
        sentiment_scores = analyzer.polarity_scores(paragraph)
        file_sentiment_scores['Positive'] += sentiment_scores['pos']
        file_sentiment_scores['Neutral'] += sentiment_scores['neu']
        file_sentiment_scores['Negative'] += sentiment_scores['neg']
        file_sentiment_scores['Compound'] += sentiment_scores['compound']

    # Average score per paragraph for this file
    result = {
        'Filename': filename,
        'Model': 'vader',
        'Positive': file_sentiment_scores['Positive'] / file_sentiment_scores['Paragraphs'],
        'Neutral': file_sentiment_scores['Neutral'] / file_sentiment_scores['Paragraphs'],
        'Negative': file_sentiment_scores['Negative'] / file_sentiment_scores['Paragraphs'],
        'Compound': file_sentiment_scores['Compound'] / file_sentiment_scores['Paragraphs']
    }

    print(f'Processed {filename}')
    return result

# Process the files on a thread pool; rows are collected in submission
# (i.e. chronological) order, and falsy results (skipped files) are dropped
with ThreadPoolExecutor() as executor:
    file_list = sorted(os.listdir(directory_path), key=sort_key)
    futures = []
    for pending_name in file_list:
        futures.append(executor.submit(process_file, pending_name))
    for future in futures:
        result = future.result()
        if not result:
            continue
        results.append(result)

# Turn the collected rows into a DataFrame and write it to an Excel workbook
output_path = 'C:\\Users\\22749\\Desktop\\UoG\\Fintech\\Dissertation\\Data\\results_vader.xlsx'
df = pd.DataFrame(results)
df.to_excel(output_path, index=False)

print(f'Sentiment analysis results saved to {output_path}')


In [None]:
import os
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Shared VADER analyser used for every file in this cell
analyzer = SentimentIntensityAnalyzer()

# Folder that holds the monthly text files to analyse
directory_path = r'C:\Users\22749\Desktop\UoG\Fintech\Dissertation\Data\1'

# Accumulates one result row (dict) per processed file
results = list()

# Calendar months in chronological order; a month's position here is its sort rank
months_order = ["January", "February", "March", "April", "May", "June",
                "July", "August", "September", "October", "November", "December"]


def sort_key(filename):
    """Return a chronological sort key for a ``<Month>_<YYYY>.txt`` file name.

    Args:
        filename: Bare file name (no directory part), e.g. ``"April_2022.txt"``.

    Returns:
        ``(year, month_index)`` where ``month_index`` is the 0-based position
        of the month in ``months_order``. A name that does not follow the
        pattern, or whose alphabetic prefix is not a month name, yields
        ``(9999, 0)`` so that it sorts after the valid files.
    """
    # fullmatch (not match) so names with trailing junk such as
    # "April_2022.txt.bak" are treated as invalid.
    match = re.fullmatch(r"([a-zA-Z]+)_(\d{4})\.txt", filename)
    if match:
        month, year = match.groups()
        try:
            # capitalize() makes the lookup case-insensitive ("april" -> "April")
            month_index = months_order.index(month.capitalize())
        except ValueError:
            # Alphabetic prefix that is not a month, e.g. "Summary_2022.txt":
            # fall back to the sentinel instead of crashing the whole sort.
            return (9999, 0)
        return (int(year), month_index)
    return (9999, 0)  # sentinel: invalid names sort last

# English stop-word list used to filter tokens before scoring
stop_words = set(stopwords.words('english'))


def process_file(filename, part_size_bytes=500 * 1024):
    """Compute the average per-sentence VADER scores of one text file.

    The file is split into sentences, each sentence is stripped of punctuation
    and stop words, and every cleaned sentence is scored with VADER. Files
    larger than ``part_size_bytes`` are worked through in several parts so
    that progress can be reported; the parts are made of whole sentences and
    do not influence the scores.

    Args:
        filename: Name of a file inside ``directory_path``.
        part_size_bytes: File size (bytes) above which the work is reported in
            several parts (default 500 KB).

    Returns:
        A dict with keys ``Filename``, ``Model``, ``Positive``, ``Neutral``,
        ``Negative`` and ``Compound`` holding the mean score over all scored
        sentences; or ``None`` when the file is not a ``.txt`` file or
        contains no scorable text.

    Raises:
        ValueError: If ``part_size_bytes`` is smaller than 1.
    """
    if part_size_bytes < 1:
        raise ValueError('part_size_bytes must be at least 1')

    if not filename.endswith('.txt'):
        return None

    file_path = os.path.join(directory_path, filename)

    # File size on disk, in bytes
    file_size = os.path.getsize(file_path)

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Sentence-split the RAW text first. Stripping punctuation beforehand
    # removes the sentence terminators, so the tokenizer would return each
    # chunk as one giant "sentence".
    # NOTE(review): NLTK's English stop-word list appears to include negations
    # such as "not"/"no", which VADER uses to flip polarity — confirm that
    # removing them (and the punctuation) before scoring is intended.
    sentences = []
    for raw_sentence in sent_tokenize(text):
        # Pre-processing: drop punctuation and special characters
        stripped = re.sub(r'[^\w\s]', '', raw_sentence)
        # Remove stop words
        kept_words = [word for word in word_tokenize(stripped)
                      if word.lower() not in stop_words]
        if kept_words:  # ignore sentences that end up empty
            sentences.append(' '.join(kept_words))

    # Nothing to score (empty file): avoid dividing by zero below
    if not sentences:
        return None

    # Split the sentence list (never the raw characters, which would cut
    # words in half) into roughly equal parts for large files.
    if file_size > part_size_bytes:
        num_parts = file_size // part_size_bytes + 1
    else:
        num_parts = 1
    # Ceiling division; always >= 1 because `sentences` is non-empty, so the
    # range() step below can never be zero.
    part_length = -(-len(sentences) // num_parts)
    parts = [sentences[i:i + part_length]
             for i in range(0, len(sentences), part_length)]

    # Running totals over all sentences of the file
    file_sentiment_scores = {
        'Positive': 0.0,
        'Neutral': 0.0,
        'Negative': 0.0,
        'Compound': 0.0
    }

    for idx, part in enumerate(parts):
        for sentence in part:
            sentiment_scores = analyzer.polarity_scores(sentence)
            file_sentiment_scores['Positive'] += sentiment_scores['pos']
            file_sentiment_scores['Neutral'] += sentiment_scores['neu']
            file_sentiment_scores['Negative'] += sentiment_scores['neg']
            file_sentiment_scores['Compound'] += sentiment_scores['compound']

        print(f'Processed part {idx + 1} of {len(parts)} of file {filename}')

    # The totals were accumulated per sentence, so the mean must be taken
    # over the number of sentences (not the number of parts).
    sentence_count = len(sentences)
    result = {
        'Filename': filename,
        'Model': 'vader',
        'Positive': file_sentiment_scores['Positive'] / sentence_count,
        'Neutral': file_sentiment_scores['Neutral'] / sentence_count,
        'Negative': file_sentiment_scores['Negative'] / sentence_count,
        'Compound': file_sentiment_scores['Compound'] / sentence_count
    }

    return result

# Run every file through process_file in chronological order, keeping only
# the truthy results (skipped files yield nothing)
file_list = sorted(os.listdir(directory_path), key=sort_key)
results.extend(row for row in map(process_file, file_list) if row)

# Turn the collected rows into a DataFrame and write it to an Excel workbook
output_path = 'C:\\Users\\22749\\Desktop\\UoG\\Fintech\\Dissertation\\Data\\results_vader.xlsx'
df = pd.DataFrame(results)
df.to_excel(output_path, index=False)

print(f'Sentiment analysis results saved to {output_path}')
