Dataset processing for component 1

In [None]:
import pandas as pd
import re
from pathlib import Path

def clean_text(text):
    """Normalise a raw review value into a single-line string.

    Values that ``pd.isna`` reports as missing become an empty string.
    Anything else is converted with ``str``, every run of whitespace is
    collapsed into one space, and leading/trailing whitespace is removed.
    """
    if pd.isna(text):
        return ""
    single_spaced = re.sub(r'\s+', ' ', str(text))
    return single_spaced.strip()

def merge_review_data():
    """Build the merged and sentiment-ready review datasets from the raw CSVs.

    Reads three files from the current working directory:

    * ``game_review.csv`` - individual reviews.
    * ``game_review_800.csv`` - per-game review statistics; its ``app_id``
      values define the set of target games.
    * ``games.csv`` - source of the ``app_id`` to ``title`` lookup.

    Only reviews of target games are kept. Four CSV files are written
    (``utf-8-sig`` encoded, without the index):

    * ``merged_game_reviews.csv`` - statistics, title and all cleaned reviews
      of a game joined with ``' ||| '``.
    * ``sentiment_analysis_combined.csv`` - per-game subset of the above with
      the joined reviews in a ``review_text`` column.
    * ``sentiment_analysis_detailed.csv`` - one row per review, enriched with
      the title and the per-game statistics.
    * ``split_reviews_for_analysis.csv`` - one row per review with a
      ``sentiment_label`` that is 1 when ``recommendation`` equals
      ``'Recommended'`` and 0 otherwise.

    Progress and summary statistics are printed along the way.

    Returns:
        tuple: ``(merged_df, sentiment_df, sentiment_detailed_df)``.

    Raises:
        KeyError: if an input file lacks a column selected here (for example
            ``app_id``, ``review_text``, ``total_reviews`` or ``username``).
    """
    review_detail_df = pd.read_csv('game_review.csv')
    print(f"game_review.csv loaded successfully, total reviews: {len(review_detail_df)}")

    review_stats_df = pd.read_csv('game_review_800.csv')
    print(f"game_review_800.csv loaded successfully, total games: {len(review_stats_df)}")
    matched_games_df = pd.read_csv('games.csv')
    print(f"games.csv loaded successfully, total games: {len(matched_games_df)}")

    print("\nFiltering reviews based on game_review_800.csv...")

    target_app_ids = set(review_stats_df['app_id'].tolist())

    review_detail_df_filtered = review_detail_df[review_detail_df['app_id'].isin(target_app_ids)].copy()
    print(f"game amount after filtering: {len(target_app_ids)}")
    print(f"game review amount after filtering: {len(review_detail_df_filtered)} ")

    review_detail_df_filtered['review_text'] = review_detail_df_filtered['review_text'].apply(clean_text)

    # One row per game: all of its cleaned reviews joined into a single string.
    grouped_reviews = review_detail_df_filtered.groupby('app_id')['review_text'].apply(
        lambda x: ' ||| '.join(x.tolist())
    ).reset_index()
    grouped_reviews.columns = ['app_id', 'combined_reviews']

    # Lookup tables need exactly one row per app_id. A duplicated app_id in
    # games.csv would otherwise multiply rows in the left merge below, and a
    # duplicated app_id in the statistics file would make Series.map fail on
    # a non-unique index. keep='last' matches what a dict built from the
    # title column keeps for repeated keys.
    title_lookup_df = matched_games_df[['app_id', 'title']].drop_duplicates(
        subset='app_id', keep='last'
    )
    stats_lookup = review_stats_df.drop_duplicates(
        subset='app_id', keep='last'
    ).set_index('app_id')

    merged_df = pd.merge(
        review_stats_df,
        grouped_reviews,
        on='app_id',
        how='left'
    )

    merged_df = pd.merge(
        merged_df,
        title_lookup_df,
        on='app_id',
        how='left'
    )

    # Put the identifying columns first, keep the rest in their existing order.
    columns = ['app_id', 'title'] + [col for col in merged_df.columns if col not in ['app_id', 'title']]
    merged_df = merged_df[columns]

    sentiment_df = merged_df[['app_id', 'title', 'combined_reviews', 'recommendation_rate', 'total_reviews']].copy()
    sentiment_df.rename(columns={'combined_reviews': 'review_text'}, inplace=True)

    sentiment_detailed_df = review_detail_df_filtered.copy()

    title_mapping = title_lookup_df.set_index('app_id')['title'].to_dict()
    sentiment_detailed_df['title'] = sentiment_detailed_df['app_id'].map(title_mapping)

    # Both statistics columns are guaranteed to exist here: the column
    # selection for sentiment_df above already required them.
    for col in ['total_reviews', 'recommendation_rate']:
        sentiment_detailed_df[col] = sentiment_detailed_df['app_id'].map(stats_lookup[col])

    sentiment_detailed_cols = ['app_id', 'title', 'review_text', 'recommendation',
                               'hours', 'date_posted', 'username', 'total_reviews',
                               'recommendation_rate']
    sentiment_detailed_df = sentiment_detailed_df[sentiment_detailed_cols]

    merged_df.to_csv('merged_game_reviews.csv', index=False, encoding='utf-8-sig')
    print(f"merge data has been saved to merged_game_reviews.csv, total line: {len(merged_df)} ")

    sentiment_df.to_csv('sentiment_analysis_combined.csv', index=False, encoding='utf-8-sig')
    print(f"overall sentiment analysis data has been saved to sentiment_analysis_combined.csv, total line {len(sentiment_df)} ")

    sentiment_detailed_df.to_csv('sentiment_analysis_detailed.csv', index=False, encoding='utf-8-sig')
    print(f"individual sentiment analysis data has been saved to sentiment_analysis_detailed.csv, total line {len(sentiment_detailed_df)} rows")

    split_reviews_df = sentiment_detailed_df[['app_id', 'title', 'review_text', 'recommendation']].copy()

    # Binary label derived from the recommendation text.
    split_reviews_df['sentiment_label'] = split_reviews_df['recommendation'].apply(
        lambda x: 1 if x == 'Recommended' else 0
    )

    split_reviews_df.to_csv('split_reviews_for_analysis.csv', index=False, encoding='utf-8-sig')
    print(f"The split comment version has been saved to split_reviews_for_analysis.csv, total {len(split_reviews_df)} lines.")

    print("\n=== Data Statistics ===")
    print(f"Number of target games (from game_review_800.csv): {len(review_stats_df)}")
    print(f"Total number of reviews after filtering: {len(review_detail_df_filtered)}")
    if len(review_stats_df) > 0:
        print(f"Average number of reviews per target game: {len(review_detail_df_filtered) / len(review_stats_df):.1f}")
    else:
        print("Average number of reviews per target game: 0")

    print("\n=== Sample Data (first 3 rows) ===")
    print(merged_df[['app_id', 'title', 'total_reviews', 'recommendation_rate']].head(3))

    print("\n=== Review Sample ===")
    if len(sentiment_detailed_df) > 0:
        sample_review = sentiment_detailed_df.iloc[0]['review_text']
        print(f"First review (first 200 characters): {sample_review[:200]}...")
    else:
        print("No review data")

    return merged_df, sentiment_df, sentiment_detailed_df

def analyze_review_lengths(df):
    """Print word-count statistics for the ``review_text`` column of ``df``.

    The length of a review is the number of whitespace-separated tokens of
    ``str(review_text)``. The function prints the mean, minimum, maximum and
    median length, the distribution over fixed length buckets, and a
    recommendation on whether to analyse the split or the combined reviews
    (split is recommended when the mean length exceeds 100 words).

    Args:
        df: DataFrame with a ``review_text`` column. It is not modified; all
            derived values are kept in local Series.

    Returns:
        None. All results are printed. An empty ``df`` only prints
        ``"No data to analyze"``.
    """
    if len(df) == 0:
        print("No data to analyze")
        return

    print("\n=== Review Length Analysis ===")

    # Kept local instead of being written back as new columns, so the
    # caller's DataFrame keeps its original shape.
    review_lengths = df['review_text'].apply(lambda x: len(str(x).split()))

    print(f"Average review length (in words): {review_lengths.mean():.1f}")
    print(f"Shortest review length: {review_lengths.min()}")
    print(f"Longest review length: {review_lengths.max()}")
    print(f"Median review length: {review_lengths.median()}")

    length_bins = [0, 10, 50, 100, 200, 500, float('inf')]
    length_labels = ['0-10', '11-50', '51-100', '101-200', '201-500', '500+']

    # include_lowest=True puts zero-word reviews into the '0-10' bucket.
    # Without it the first interval excludes 0, those reviews get no
    # category and the printed percentages no longer add up to 100.
    length_categories = pd.cut(
        review_lengths,
        bins=length_bins,
        labels=length_labels,
        include_lowest=True,
    )
    length_dist = length_categories.value_counts().sort_index()

    print("\nReview length distribution:")
    for category, count in length_dist.items():
        percentage = (count / len(df)) * 100
        print(f" {category} words: {count} reviews ({percentage:.1f}%)")

    print("\n=== Recommendation ===")
    avg_length = review_lengths.mean()
    if avg_length > 100:
        print("Reviews are long. It is recommended to use the split version (split_reviews_for_analysis.csv) for sentiment analysis.")
        print("Analyzing each review individually can yield more accurate sentiment tendencies.")
    else:
        print("Review length is moderate. You can use either the combined or split version for analysis.")

if __name__ == "__main__":
    # Script entry point for the data-preparation cell: check that the three
    # input CSV files exist, build the datasets, then report what was written.
    try:
        required_files = ['game_review.csv', 'game_review_800.csv', 'games.csv']
        missing_files = [name for name in required_files if not Path(name).exists()]

        if not missing_files:
            merged_df, sentiment_df, detailed_df = merge_review_data()
            analyze_review_lengths(detailed_df)

            print("\n=== File Descriptions ===")
            print("1. merged_game_reviews.csv - Complete merged data (statistics and reviews for target games only)")
            print("2. sentiment_analysis_combined.csv - Overall sentiment analysis data (reviews combined per game)")
            print("3. sentiment_analysis_detailed.csv - Detailed sentiment analysis data (each review separate)")
            print("4. split_reviews_for_analysis.csv - Split review version (target game reviews only, best for sentiment analysis)")

            print("\nProcessing complete!")
        else:
            # Nothing is processed when an input is absent; only report it.
            print(f"Error: Missing files: {', '.join(missing_files)}")
            print("Please ensure all files are in the current directory.")

    except Exception as e:
        # Top-level boundary: report the failure together with its traceback.
        print(f"An error occurred during processing: {str(e)}")
        import traceback
        traceback.print_exc()

Code for module 1

In [None]:
import pandas as pd
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import defaultdict
import numpy as np
from tqdm import tqdm

# Fetch the VADER lexicon up front; SentimentIntensityAnalyzer is imported
# from nltk.sentiment.vader above and is instantiated later in this cell.
try:
    nltk.download('vader_lexicon', quiet=True)
except Exception:
    # Narrowed from a bare ``except`` so KeyboardInterrupt and SystemExit
    # propagate. An ordinary failure of the quiet attempt falls back to a
    # second, verbose download.
    print("Downloading VADER sentiment lexicon...")
    nltk.download('vader_lexicon')

def load_and_prepare_data():
    """Load ``split_reviews_for_analysis.csv`` and keep the usable reviews.

    The file is read from the current working directory. Review text is
    converted to ``str``, runs of whitespace are collapsed to one space and
    the ends are stripped; only reviews longer than 10 characters are kept.

    Returns:
        The filtered DataFrame, or ``None`` (after printing an error) when
        the file does not exist or one of the columns ``app_id``, ``title``,
        ``review_text``, ``recommendation``, ``sentiment_label`` is missing.
    """
    print("Loading data...")

    try:
        reviews = pd.read_csv('split_reviews_for_analysis.csv')
    except FileNotFoundError:
        print("Error: split_reviews_for_analysis.csv file not found")
        print("Please run the previous code to generate this file")
        return None
    print(f"Successfully loaded {len(reviews)} reviews")

    expected_cols = ('app_id', 'title', 'review_text', 'recommendation', 'sentiment_label')
    missing_cols = [name for name in expected_cols if name not in reviews.columns]
    if missing_cols:
        print(f"Error: Missing required columns: {missing_cols}")
        return None

    print("Cleaning review text...")

    def squash_whitespace(raw):
        # Collapse every whitespace run to one space and trim the ends.
        return re.sub(r'\s+', ' ', raw).strip()

    reviews['review_text'] = reviews['review_text'].astype(str).apply(squash_whitespace)

    # Strictly more than 10 characters: a 10-character review is dropped.
    long_enough = reviews['review_text'].str.len() > 10
    reviews = reviews[long_enough]
    print(f"{len(reviews)} valid reviews retained after cleaning")

    return reviews

def analyze_sentiment_with_vader(text, analyzer):
    """Score one text with ``analyzer`` and classify the result.

    Args:
        text: The text passed to ``analyzer.polarity_scores``.
        analyzer: Object whose ``polarity_scores(text)`` returns a mapping
            with the keys ``compound``, ``pos``, ``neg`` and ``neu``.

    Returns:
        dict with the raw ``compound`` score, ``normalized`` computed as
        ``(compound + 1) / 2``, a ``category`` (``'positive'`` for
        compound >= 0.05, ``'negative'`` for compound <= -0.05, otherwise
        ``'neutral'``) and the ``positive`` / ``negative`` / ``neutral``
        component scores.
    """
    scores = analyzer.polarity_scores(text)
    compound = scores['compound']

    # Start from neutral and move away only when a threshold is reached.
    label = 'neutral'
    if compound >= 0.05:
        label = 'positive'
    elif compound <= -0.05:
        label = 'negative'

    return {
        'compound': compound,
        'normalized': (compound + 1) / 2,
        'category': label,
        'positive': scores['pos'],
        'negative': scores['neg'],
        'neutral': scores['neu'],
    }

def apply_recommendation_priority(row, sentiment_result):
    """Let an explicit recommendation override a non-positive text sentiment.

    When ``row['sentiment_label']`` equals 1 and the text was classified as
    ``'negative'`` or ``'neutral'``, the category becomes ``'positive'`` and
    the score is raised to at least 0.7. In every other case the score and
    category are passed through unchanged.

    Args:
        row: Mapping-like review record providing ``sentiment_label``.
        sentiment_result: Mapping providing ``normalized`` and ``category``.

    Returns:
        dict with ``original_score``, ``adjusted_score``,
        ``original_category``, ``adjusted_category`` and ``was_adjusted``.
    """
    recommended = row['sentiment_label'] == 1
    override = recommended and sentiment_result['category'] in ['negative', 'neutral']

    base_score = sentiment_result['normalized']
    base_category = sentiment_result['category']

    outcome = {
        'original_score': base_score,
        'adjusted_score': base_score,
        'original_category': base_category,
        'adjusted_category': base_category,
        'was_adjusted': False,
    }
    if override:
        # The recommendation outranks the text-based classification.
        outcome['adjusted_score'] = max(base_score, 0.7)
        outcome['adjusted_category'] = 'positive'
        outcome['was_adjusted'] = True
    return outcome

def calculate_game_scores(df):
    """Score every review with VADER and aggregate the scores per game.

    Each row of ``df`` is scored with ``analyze_sentiment_with_vader`` and
    then passed through ``apply_recommendation_priority``. Per game
    (``app_id``) the normalized scores are averaged, multiplied by 10 and
    rounded to one decimal.

    Args:
        df: DataFrame with the columns ``app_id``, ``title``,
            ``review_text``, ``recommendation`` and ``sentiment_label``.

    Returns:
        tuple: ``(detailed_results_df, game_scores_df)``.
        ``detailed_results_df`` has one row per review, with the review text
        truncated to 200 characters plus ``'...'``. ``game_scores_df`` has
        one row per game, sorted by ``avg_adjusted_score`` descending. Both
        frames always carry their full column set, so an empty ``df`` yields
        two empty frames instead of a ``KeyError`` during sorting.
    """
    print("Performing sentiment analysis...")

    analyzer = SentimentIntensityAnalyzer()

    # Explicit column lists keep the schema stable even with no rows.
    result_columns = [
        'app_id', 'title', 'review_text', 'recommendation', 'sentiment_label',
        'vader_compound', 'original_score', 'adjusted_score',
        'original_category', 'adjusted_category', 'was_adjusted',
    ]
    summary_columns = [
        'app_id', 'title', 'review_count', 'avg_original_score',
        'avg_adjusted_score', 'positive_count', 'negative_count',
        'neutral_count', 'adjustment_count', 'adjustment_rate',
    ]

    results = []
    # Only what the aggregation needs is accumulated per game; the review
    # texts themselves are not retained.
    game_stats = defaultdict(lambda: {
        'title': '',
        'sentiment_scores': [],
        'adjusted_scores': [],
        'adjusted_categories': [],
        'adjustment_count': 0
    })

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Sentiment Analysis Progress"):
        app_id = row['app_id']
        text = row['review_text']
        stats = game_stats[app_id]

        if stats['title'] == '':
            stats['title'] = row['title']

        sentiment_result = analyze_sentiment_with_vader(text, analyzer)
        priority_adjustment = apply_recommendation_priority(row, sentiment_result)

        stats['sentiment_scores'].append(sentiment_result['normalized'])
        stats['adjusted_scores'].append(priority_adjustment['adjusted_score'])
        stats['adjusted_categories'].append(priority_adjustment['adjusted_category'])

        if priority_adjustment['was_adjusted']:
            stats['adjustment_count'] += 1

        results.append({
            'app_id': app_id,
            'title': row['title'],
            'review_text': text[:200] + '...' if len(text) > 200 else text,
            'recommendation': row['recommendation'],
            'sentiment_label': row['sentiment_label'],
            'vader_compound': sentiment_result['compound'],
            'original_score': sentiment_result['normalized'],
            'adjusted_score': priority_adjustment['adjusted_score'],
            'original_category': sentiment_result['category'],
            'adjusted_category': priority_adjustment['adjusted_category'],
            'was_adjusted': priority_adjustment['was_adjusted']
        })

    print(f"Completed sentiment analysis for {len(results)} reviews")

    print("Calculating game scores...")
    game_scores = []

    for app_id, stats in tqdm(game_stats.items(), desc="Game Score Calculation Progress"):
        # An entry only exists once a review was appended for that game, so
        # review_count is always at least 1 here.
        review_count = len(stats['adjusted_scores'])

        avg_adjusted_score = np.mean(stats['adjusted_scores'])
        score_10_point = round(avg_adjusted_score * 10, 1)

        category_counts = pd.Series(stats['adjusted_categories']).value_counts()

        game_scores.append({
            'app_id': app_id,
            'title': stats['title'],
            'review_count': review_count,
            'avg_original_score': round(np.mean(stats['sentiment_scores']) * 10, 1),
            'avg_adjusted_score': score_10_point,
            'positive_count': category_counts.get('positive', 0),
            'negative_count': category_counts.get('negative', 0),
            'neutral_count': category_counts.get('neutral', 0),
            'adjustment_count': stats['adjustment_count'],
            'adjustment_rate': round(stats['adjustment_count'] / review_count * 100, 1)
        })

    game_scores_df = pd.DataFrame(game_scores, columns=summary_columns).sort_values(
        'avg_adjusted_score', ascending=False
    )

    return pd.DataFrame(results, columns=result_columns), game_scores_df

def generate_final_output(game_scores_df):
    """Write the per-game 10-point scores to ``game_sentiment_scores.csv``.

    Takes the ``app_id`` and ``avg_adjusted_score`` columns of
    ``game_scores_df``, exposes the score as ``sentiment_score_10point`` and
    saves the result (``utf-8-sig``, no index) in the current directory.

    Returns:
        The two-column DataFrame that was written.
    """
    print("Generating final output...")

    score_columns = game_scores_df[['app_id', 'avg_adjusted_score']]
    final_scores = score_columns.rename(
        columns={'avg_adjusted_score': 'sentiment_score_10point'}
    )

    final_scores.to_csv('game_sentiment_scores.csv', index=False, encoding='utf-8-sig')
    print(f"Final scores saved to game_sentiment_scores.csv, total {len(final_scores)} games")

    return final_scores

def main():
    """Run the sentiment pipeline and print a summary of the results.

    Loads the split reviews, scores them, writes
    ``game_sentiment_scores.csv`` (via ``generate_final_output``),
    ``detailed_sentiment_analysis.csv`` and
    ``game_sentiment_analysis_summary.csv``, then prints overall statistics
    and the five best and five worst games by adjusted score.

    Returns early, without writing any file, when the input could not be
    loaded or when no review is left after cleaning.
    """
    print("=== Game Review Sentiment Analysis System ===")
    print("Purpose: Generate a 10-point sentiment score for each game using unsupervised sentiment analysis with a recommendation-priority rule.")
    print("-" * 50)

    df = load_and_prepare_data()
    if df is None:
        return
    if df.empty:
        # With no reviews there is nothing to score, and the adjustment rate
        # below would divide by a zero review count.
        print("No reviews left after cleaning; nothing to analyze.")
        return

    detailed_results, game_scores = calculate_game_scores(df)
    final_scores = generate_final_output(game_scores)

    detailed_results.to_csv('detailed_sentiment_analysis.csv', index=False, encoding='utf-8-sig')
    game_scores.to_csv('game_sentiment_analysis_summary.csv', index=False, encoding='utf-8-sig')

    print("\n=== Analysis Complete ===")
    print(f"Number of games analyzed: {len(game_scores)}")
    print(f"Number of reviews analyzed: {len(detailed_results)}")

    avg_score = game_scores['avg_adjusted_score'].mean()
    print(f"Average game sentiment score: {avg_score:.1f}/10")

    total_adjustments = detailed_results['was_adjusted'].sum()
    adjustment_rate = total_adjustments / len(detailed_results) * 100
    print(f"Recommendation-priority rule adjustments: {total_adjustments} ({adjustment_rate:.1f}% of reviews)")

    # game_scores is used in the order calculate_game_scores returned it, so
    # the first five rows are printed as the top games and the last five as
    # the bottom games.
    print("\n=== Top 5 Games by Sentiment Score ===")
    top_5 = game_scores.head(5)
    for _, row in top_5.iterrows():
        print(f"{row['title']} (ID: {row['app_id']}): {row['avg_adjusted_score']}/10")

    print("\n=== Bottom 5 Games by Sentiment Score ===")
    bottom_5 = game_scores.tail(5)
    for _, row in bottom_5.iterrows():
        print(f"{row['title']} (ID: {row['app_id']}): {row['avg_adjusted_score']}/10")

    print("\n=== Generated Files ===")
    print("1. game_sentiment_scores.csv - Final game sentiment scores (App ID + 10-point score)")
    print("2. detailed_sentiment_analysis.csv - Detailed sentiment analysis for each review")
    print("3. game_sentiment_analysis_summary.csv - Detailed sentiment statistics for each game")

    print("\nTip: You can directly use game_sentiment_scores.csv for subsequent analysis")

if __name__ == "__main__":
    main()