# In [None]:
################### QUALITY SCORING #########################
import pandas as pd
import sys
sys.path.append('../src')
import  quality_scorer as qs

## Load the processed DataFrame; the path is relative to the current working directory.
df = pd.read_csv('processed_df.csv')

## Listing-feature correlation analysis.
## The helper returns two values: `analysis_df` (its columns are merged into the
## scoring frame below) and `correlation_results` (an input to the weight calculation).
analysis_df, correlation_results = qs.comprehensive_factor_correlation_analysis(df)

## Pricing-competitiveness correlation analysis.
## Same two-value return; `analysis_df_price` is later checked for a
## 'price_percentile' column.
analysis_df_price, correlation_results_price=qs.comprehensive_price_correlation_analysis(df)

## Weight scoring: derive one weight per dimension from both correlation
## analyses, then print the weights from largest to smallest with their total.
print("🚀 start weight analysis")
print("="*80)
# "\n" replaces the original "\ ", which is an invalid escape sequence.
print("\ncalculate weight...")
# `weights` is used as a mapping of dimension name -> numeric weight below.
weights = qs.calculate_dynamic_weights(correlation_results_price, correlation_results)
print("\n📊 weight result")
print("="*80)
sorted_weights = sorted(weights.items(), key=lambda x: x[1], reverse=True)

for dimension, weight in sorted_weights:
    print(f"  {dimension:<40}: {weight:>6.2f}%")

print("-"*80)
# Pad the footer label to the same 40-character width as the rows above so
# the colons line up.
print(f"  {'sum':<40}: {sum(weights.values()):>6.2f}%")

## Calculate score
## Build one frame holding the processed data plus every analysis metric.
## Work on a copy so `df` itself is left untouched.
combined_df = df.copy()

## Add price analysis metrics (only if the price analysis produced the column).
if 'price_percentile' in analysis_df_price.columns:
    combined_df['price_percentile'] = analysis_df_price['price_percentile']

## Add factor analysis metrics that combined_df does not already have.
## NOTE: assuming analysis_df is a DataFrame, pandas aligns each assigned
## column on the row index, so rows missing from analysis_df end up as NaN.
for col in analysis_df.columns:
    if col not in combined_df.columns:
        combined_df[col] = analysis_df[col]

## Calculate listing quality scores
print("Calculating listing quality scores...")
scored_df = qs.calculate_listing_quality_score(combined_df, weights)

## Display comprehensive summary
print("\n" + "="*80)
print("📈 COMPREHENSIVE LISTING QUALITY REPORT")
print("="*80)

## Overall statistics of the 'quality_score' column
quality_scores = scored_df['quality_score']
print("\n📊 OVERALL STATISTICS:")
print(f"  Total products analyzed: {len(scored_df):,}")
print(f"  Average quality score: {quality_scores.mean():.2f}")
print(f"  Median quality score: {quality_scores.median():.2f}")
print(f"  Score range: {quality_scores.min():.2f} - {quality_scores.max():.2f}")
print(f"  Standard deviation: {quality_scores.std():.2f}")

## Score distribution: count how many listings fall into each labelled band.
print("\n📈 SCORE DISTRIBUTION:")
## label -> (lowest score, highest whole-number score) of the band
score_bins = {
    'Excellent (81-100)': (81, 100),
    'Good (61-80)': (61, 80),
    'Fair (41-60)': (41, 60),
    'Poor (21-40)': (21, 40),
    'Very Poor (0-20)': (0, 20),
}

bin_scores = scored_df['quality_score']
total_scored = len(scored_df)

for label, (low, high) in score_bins.items():
    in_bin = bin_scores >= low
    if high != 100:
        # The exclusive upper edge is the next band's lower bound (high + 1).
        # Using `< high` dropped every score in [high, high + 1), e.g. 80 or
        # 80.5, from all bands. The top band stays open-ended.
        in_bin = in_bin & (bin_scores < high + 1)
    count = in_bin.sum()

    # Guard against an empty frame so the percentage is 0.0 instead of NaN.
    percentage = (count / total_scored) * 100 if total_scored else 0.0
    print(f"  {label:<20}: {count:>6} products ({percentage:>5.1f}%)")

##################### Listing Segmentation #########################
## Segment 1: low quality, high potential listings (result frame + summary).
low_quality_df, summary = qs.find_low_quality_high_potential_listings(scored_df)

## Segment 2: median quality listings with an easy fix, i.e. title problems
## (result frame + summary).
title_problem_df, summary_title_problem = qs.find_title_problem_listings(scored_df)

## Quantify opportunity size.
## Column names handed to the impact analysis, keyed by its keyword arguments.
impact_columns = {
    'quality_score_col': 'quality_score',
    'sales_rank_col': 'sales_rank',
    'sold_quantity_col': 'sold_quantity',
    'category_id_col': 'category_id',
    'price_col': 'price',
}
results = qs.analyze_quality_improvement_impact(scored_df=scored_df, **impact_columns)

## Pull the formatted improvement table out of the results and show it.
improvement_table = results['improvement_table_formatted']
print(improvement_table)

##  END OF ANALYSIS, SAVE DATA
## Write the scored frame to 'scored_df.csv' in the working directory,
## without the row index column.
scored_df.to_csv('scored_df.csv', index=False)


