In [1]:
import numpy as np
import pandas as pd

In [None]:


# Load the Steam review dataset
csv_path = 'archive/steam_game_reviews.csv'
df = pd.read_csv(csv_path)

# Preview the first 10 rows
print(df.head(n=10))


In [6]:
# Work with the first 50,000 rows only
df_5w = df.head(50000)

# Split on whether 'recommendation' is exactly 'Recommended';
# every other value (including missing ones) goes to the "not recommended" side
rec_mask_5w = df_5w['recommendation'] == 'Recommended'
df_recommended = df_5w[rec_mask_5w]
df_not_recommended = df_5w[~rec_mask_5w]

print(f"推荐(Recommended)数量: {len(df_recommended)}")
print(f"不推荐(Not Recommended)数量: {len(df_not_recommended)}")


推荐(Recommended)数量: 33355
不推荐(Not Recommended)数量: 16645


In [10]:
import os

# Make sure both output folders exist
pos_dir, neg_dir = "pos", "neg"
for folder in (pos_dir, neg_dir):
    os.makedirs(folder, exist_ok=True)

# Cap each class at its first 12,500 reviews
# (the variable names say "5k", but the cap applied here is 12,500)
df_recommended_5k = df_recommended.head(12500)
df_not_recommended_5k = df_not_recommended.head(12500)
print(len(df_recommended_5k))
print(len(df_not_recommended_5k))

# One text file per review, named after the row's index label:
# recommended reviews go to pos/, the rest to neg/
for subset, folder in ((df_recommended_5k, pos_dir), (df_not_recommended_5k, neg_dir)):
    for idx, review in subset['review'].items():
        with open(os.path.join(folder, f"{idx}.txt"), "w", encoding="utf-8") as f:
            f.write(str(review))


12500
12500


In [11]:
# Create the test_pos / test_neg folders and fill them with reviews taken from
# row position 50,000 onward: up to 10,000 recommended and 10,000 not recommended
test_pos_dir = "test_pos"
test_neg_dir = "test_neg"
for folder in (test_pos_dir, test_neg_dir):
    os.makedirs(folder, exist_ok=True)

# Everything from row position 50,000 onward
# (the variable name says "20k", but the slice starts at 50,000)
df_after_20k = df.iloc[50000:]

# Split by label, then keep the first 10,000 rows of each side
test_rec_mask = df_after_20k['recommendation'] == 'Recommended'
df_test_recommended = df_after_20k[test_rec_mask].head(10000)
df_test_not_recommended = df_after_20k[~test_rec_mask].head(10000)

# One text file per review, named after the row's index label
for subset, folder in (
    (df_test_recommended, test_pos_dir),
    (df_test_not_recommended, test_neg_dir),
):
    for idx, review in subset['review'].items():
        with open(os.path.join(folder, f"{idx}.txt"), "w", encoding="utf-8") as f:
            f.write(str(review))


In [12]:
import re
import os

def clean_text(text):
    """Normalize a raw review string for downstream text processing.

    The text is lower-cased, then the following substitutions are applied in
    order, each replacing its match with a single space:

    1. HTML-style tags (``<...>``)
    2. ``http://`` / ``https://`` URLs
    3. any character that is not ``a-z``, ``0-9`` or whitespace
    4. runs of whitespace (collapsed to one space)

    Leading and trailing whitespace is stripped from the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    substitutions = (
        (r'<[^>]+>', ' '),           # HTML tags
        (r'http[s]?://\S+', ' '),    # URLs
        (r'[^a-z0-9\s]', ' '),       # anything but lowercase letters, digits, whitespace
        (r'\s+', ' '),               # collapse whitespace runs
    )
    normalized = text.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

pos_dir = "pos"
neg_dir = "neg"
test_pos_dir = "test_pos"
test_neg_dir = "test_neg"

# Clean every file in the four dataset folders in place:
# read the raw text, normalize it with clean_text, and overwrite the same file
for directory in (pos_dir, neg_dir, test_pos_dir, test_neg_dir):
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as src:
            raw = src.read()
        with open(filepath, 'w', encoding='utf-8') as dst:
            dst.write(clean_text(raw))


In [4]:
# Rename all files in pos and neg directories to pos0.txt, pos1.txt, … and neg0.txt, neg1.txt, …
import os
import uuid


def _renumber_files(directory, filenames, prefix):
    """Rename `filenames` inside `directory` to f"{prefix}{i}.txt", i = 0, 1, ...

    The rename happens in two passes through unique temporary names. A single
    pass is unsafe when the directory already contains names of the target
    form: sorted order is lexicographic, so e.g. "pos10.txt" would be renamed
    to "pos2.txt" while the real "pos2.txt" is still waiting its turn. On
    POSIX that silently overwrites the file; on Windows it raises. Staging
    every file first means a final name can never clash with a pending source.

    Args:
        directory: Folder that contains the entries.
        filenames: Entry names (relative to `directory`) in the order in which
            they should be numbered.
        prefix: Text placed before the running number in the new names.
    """
    token = uuid.uuid4().hex  # unique per call so staged names cannot clash with real files
    staged = []
    for i, filename in enumerate(filenames):
        tmp_path = os.path.join(directory, f".{token}-{i}.tmp")
        os.rename(os.path.join(directory, filename), tmp_path)
        staged.append(tmp_path)
    for i, tmp_path in enumerate(staged):
        os.rename(tmp_path, os.path.join(directory, f"{prefix}{i}.txt"))


pos_dir="pos"
neg_dir="neg"

# Process pos directory (numbering follows the sorted order of the current names)
pos_files = sorted(os.listdir(pos_dir))
_renumber_files(pos_dir, pos_files, "pos")

# Process neg directory
neg_files = sorted(os.listdir(neg_dir))
_renumber_files(neg_dir, neg_files, "neg")


In [5]:
import os

neg_dir = "neg"
neg_files = os.listdir(neg_dir)
neg_files.sort()
total = len(neg_files)

# Keep the first quarter of the sorted names and delete the remaining three quarters.
# Note: this is destructive, so running the cell again removes 3/4 of what is left.
keep_count = total // 4
surplus = neg_files[keep_count:]
for filename in surplus:
    os.remove(os.path.join(neg_dir, filename))


In [6]:
import os

test_neg_dir = "test_neg"
test_neg_files = os.listdir(test_neg_dir)
test_neg_files.sort()
total = len(test_neg_files)

# Keep the first quarter of the sorted names and delete the remaining three quarters.
# Note: this is destructive, so running the cell again removes 3/4 of what is left.
keep_count = total // 4
surplus = test_neg_files[keep_count:]
for filename in surplus:
    os.remove(os.path.join(test_neg_dir, filename))


In [None]:
import pandas as pd


# Re-load the reviews and keep only the first 100,000 rows
df = pd.read_csv('archive/steam_game_reviews.csv', low_memory=False).head(100000)

# Coerce columns that may contain non-numeric text to numbers; unparseable values become NaN
df['hours_played'] = pd.to_numeric(df['hours_played'], errors='coerce')
df['helpful']     = pd.to_numeric(df['helpful'],     errors='coerce')

# Median of each column (NaN is skipped by default).
# The variables keep their existing `mean_` prefix, but the statistic computed is the median.
mean_hours   = df['hours_played'].median()
mean_helpful = df['helpful'].median()

# Report the results, labelled with the statistic that is actually computed
print(f"Median hours_played: {mean_hours}")
print(f"Median helpful: {mean_helpful}")


Average hours_played: 73.0
Average helpful: 3.0


In [9]:
# Count reviews with more than 60 hours played and more than 15 helpful votes.
# The message states the same thresholds as the filter.
count = df[(df['hours_played'] > 60) & (df['helpful'] > 15)].shape[0]
print(f"Number of reviews with hours_played > 60 and helpful > 15: {count}")


Number of reviews with hours_played > 50 and helpful > 15: 8758


In [14]:
# Subset of reviews with hours_played > 60 and helpful > 15
filtered = df[(df['hours_played'] > 60) & (df['helpful'] > 15)]

# Number of rows whose recommendation is exactly "Recommended"
rec_count = (filtered['recommendation'] == 'Recommended').sum()

# The complement is derived from the subset's own size instead of a hard-coded
# total, so it stays correct when the data or the thresholds change
print(f"Number of recommended reviews: {rec_count}")
print(f"Number of not recommended reviews: {len(filtered) - rec_count}")


Number of recommended reviews: 5374
Number of not recommended reviews: 3384


In [16]:
import os

# to_csv does not create missing parent folders, so make sure "Data" exists first
os.makedirs("Data", exist_ok=True)
filtered.to_csv("Data/HQ_Data.csv")

In [34]:
import os
import re
def clean_text(text):
    """Normalize a raw review string for downstream text processing.

    The text is lower-cased, then the following substitutions are applied in
    order, each replacing its match with a single space:

    1. HTML-style tags (``<...>``)
    2. ``http://`` / ``https://`` URLs
    3. any character that is not ``a-z``, ``0-9`` or whitespace
    4. runs of whitespace (collapsed to one space)

    Leading and trailing whitespace is stripped from the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    substitutions = (
        (r'<[^>]+>', ' '),           # HTML tags
        (r'http[s]?://\S+', ' '),    # URLs
        (r'[^a-z0-9\s]', ' '),       # anything but lowercase letters, digits, whitespace
        (r'\s+', ' '),               # collapse whitespace runs
    )
    normalized = text.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.strip()

# Make sure both output folders exist
for folder in ("test_pos", "test_neg"):
    os.makedirs(folder, exist_ok=True)

# Partition the filtered reviews by label: "Recommended" vs. everything else
hq_rec_mask = filtered["recommendation"] == "Recommended"
pos_df = filtered[hq_rec_mask]
neg_df = filtered[~hq_rec_mask]

# Write one cleaned review per file (review text only, newline-terminated),
# named after the row's index label
for subset, folder in ((pos_df, "test_pos"), (neg_df, "test_neg")):
    for idx, review in subset['review'].items():
        file_path = os.path.join(folder, f"{idx}.txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(clean_text(str(review)) + "\n")




In [30]:

# Read up to 500,000 rows, skipping file lines 1 through 300,000
# (line 0, the header, is kept)
df_chunk = pd.read_csv(
    'archive/steam_game_reviews.csv',
    skiprows=range(1, 300001),
    nrows=500000,
)

# Keep rows with hours_played > 60 and helpful > 15; values that cannot be
# parsed as numbers become NaN and therefore fail both comparisons
hours_num = pd.to_numeric(df_chunk['hours_played'], errors='coerce')
helpful_num = pd.to_numeric(df_chunk['helpful'], errors='coerce')
df_filtered_chunk = df_chunk[(hours_num > 60) & (helpful_num > 15)]

# Count positive ("Recommended") and negative (anything else) reviews
chunk_rec_mask = df_filtered_chunk['recommendation'] == 'Recommended'
pos_count = int(chunk_rec_mask.sum())
neg_count = int((~chunk_rec_mask).sum())

print(f"Number of positive reviews: {pos_count}")
print(f"Number of negative reviews: {neg_count}")


Number of positive reviews: 22562
Number of negative reviews: 6602


In [31]:
import os

# to_csv does not create missing parent folders, so make sure "Data" exists first
os.makedirs("Data", exist_ok=True)
# NOTE(review): the output name has no ".csv" extension, unlike "Data/HQ_Data.csv";
# left unchanged in case other code reads this exact path — confirm before renaming
df_filtered_chunk.to_csv("Data/Training_Data")

In [38]:
import uuid

pos_dir="pos"
neg_dir="neg"

# Snapshot the current names; numbering follows their sorted (lexicographic) order
pos_files = sorted(os.listdir(pos_dir))
neg_files = sorted(os.listdir(neg_dir))

# Renumber each folder to <prefix>0.txt, <prefix>1.txt, ... in two passes.
# Renaming straight to the final name is unsafe when the folder already holds
# names of that form: e.g. "pos10.txt" -> "pos2.txt" would overwrite (POSIX) or
# fail on (Windows) the real "pos2.txt" that has not been processed yet.
for directory, prefix, filenames in ((pos_dir, "pos", pos_files), (neg_dir, "neg", neg_files)):
    token = uuid.uuid4().hex  # unique per folder so staged names cannot clash with real files
    staged = []
    # Pass 1: move every file out of the way under a temporary name
    for i, filename in enumerate(filenames):
        tmp_path = os.path.join(directory, f".{token}-{i}.tmp")
        os.rename(os.path.join(directory, filename), tmp_path)
        staged.append(tmp_path)
    # Pass 2: assign the final, consecutively numbered names
    for i, tmp_path in enumerate(staged):
        os.rename(tmp_path, os.path.join(directory, f"{prefix}{i}.txt"))