# Debug Date Parsing Issues

This notebook investigates why date values are showing up as NaT in the review data.

In [1]:
# Import required libraries
import os
import sys
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv

# Make the project importable. Notebooks define no __file__, so the root is
# resolved relative to the working directory: it is the parent of the CWD.
project_root = os.path.abspath(os.pardir)
root_missing_from_path = project_root not in sys.path
if root_missing_from_path:
    sys.path.insert(0, project_root)
    print(f"Added {project_root} to Python path")

# Load environment variables
load_dotenv()

Added /Users/dipesh/Local-Projects/indigo-reviews-ai to Python path


True

In [ ]:
# Pull a handful of fresh reviews straight from the Play Store so their date
# values can be inspected before any stored data is involved.
from src.runner import ReviewAnalysisRunner

# Build the runner and initialize its modules
runner = ReviewAnalysisRunner()
runner._initialize_modules()

print("\n=== Fetching fresh reviews directly from Google Play Store ===")
try:
    app_id = os.environ.get("APP_ID", "in.goindigo.android").strip()
    max_reviews = 10  # a tiny sample is enough for debugging

    print(f"Fetching {max_reviews} reviews for app ID: {app_id}")
    fresh_reviews_df = runner.fetch_reviews(
        app_id=app_id,
        start_date="1 month ago",  # recent reviews only
        end_date="now",
        max_reviews=max_reviews,
    )

    if fresh_reviews_df is None or fresh_reviews_df.empty:
        print("No fresh reviews fetched. Using stored data for analysis.")
    else:
        print(f"Successfully fetched {len(fresh_reviews_df)} fresh reviews")
        print("\nColumns in the fresh data:")
        print(list(fresh_reviews_df.columns))

        # Columns whose name mentions "date" or "time"
        date_cols = [
            col for col in fresh_reviews_df.columns
            if any(token in col.lower() for token in ('date', 'time'))
        ]
        for col in date_cols:
            column_values = fresh_reviews_df[col]
            print(f"\n{col} column data types and values:")
            print(f"Data type: {column_values.dtype}")
            print("Sample values:")
            print(column_values.head())

        # Look at the individual Python objects stored in 'date'
        if 'date' in fresh_reviews_df.columns:
            print("\nExamining raw date objects directly:")
            for i, date_obj in enumerate(fresh_reviews_df['date'].head(5)):
                print(f"Row {i}: Type={type(date_obj)}, Value={date_obj}, Repr={date_obj!r}")

        # Dump the first row field by field (the frame is non-empty here)
        print("\nComplete sample row for examination:")
        sample_row = fresh_reviews_df.iloc[0]
        for col, val in sample_row.items():
            print(f"{col}: {val} (Type: {type(val)})")
except Exception as e:
    # Best effort: report the failure and let the notebook carry on
    print(f"Error fetching fresh reviews: {e}")
    print("Stack trace:")
    import traceback
    traceback.print_exc()
    print("\nContinuing with existing data...")

# Choose the dataset the remaining cells analyse (`raw_df`).
# Order of preference: stored CSV sample -> freshly fetched reviews -> empty frame.
csv_path = os.path.join(project_root, 'data', 'reviews.csv')
if os.path.exists(csv_path):
    print(f"\nLoading reviews from CSV as fallback: {csv_path}")
    # Load only a sample of 1000 reviews to speed up testing
    raw_df = pd.read_csv(csv_path, low_memory=False, nrows=1000)
    print(f"Successfully loaded {len(raw_df)} sample reviews from CSV file")
else:
    print(f"CSV file not found at {csv_path}")
    # `fresh_reviews_df` only exists if the fetch cell got as far as assigning it
    fetched_df = globals().get('fresh_reviews_df')
    if fetched_df is not None and not fetched_df.empty:
        raw_df = fetched_df
        print("Using fresh reviews as primary dataset.")
    else:
        # Always define raw_df so later cells skip their checks instead of
        # failing with NameError when no data source is available.
        raw_df = pd.DataFrame()
        print("WARNING: no review data available (no CSV and no fresh reviews); "
              "continuing with an empty DataFrame.")

In [None]:
# Overview of every column and its dtype in the working dataset
print("DataFrame Info:")
raw_df.info()

# Any column whose name mentions "date" or "time" is a candidate date source
date_cols = [
    col for col in raw_df.columns
    if any(token in col.lower() for token in ('date', 'time'))
]
print(f"\nPotential date columns: {date_cols}")

In [None]:
# Examine the date column
if 'date' in raw_df.columns:
    print("Sample values from 'date' column:")
    # Cap the sample size: Series.sample(10) raises ValueError when the frame
    # holds fewer than 10 rows (e.g. a small batch of freshly fetched reviews).
    print(raw_df['date'].sample(min(10, len(raw_df))).tolist())

    # Check unique values
    unique_dates = raw_df['date'].nunique()
    print(f"\nNumber of unique date values: {unique_dates}")

    # Check for nulls
    null_dates = raw_df['date'].isna().sum()
    print(f"Number of null date values: {null_dates} ({null_dates/len(raw_df)*100:.2f}%)")

    # Show a few raw values when the column is stored as strings/objects,
    # so the textual date format can be read off directly
    if pd.api.types.is_object_dtype(raw_df['date']):
        non_null_dates = raw_df['date'].dropna()
        if len(non_null_dates) > 0:
            print("\nFirst few non-null date values:")
            print(non_null_dates.head(5).tolist())

In [None]:
# Check if timestamp column exists and might be useful
if 'timestamp' in raw_df.columns:
    print("Sample values from 'timestamp' column:")
    # Cap the sample size: Series.sample(10) raises ValueError on < 10 rows
    print(raw_df['timestamp'].sample(min(10, len(raw_df))).tolist())

    # Check unique values
    unique_timestamps = raw_df['timestamp'].nunique()
    print(f"\nNumber of unique timestamp values: {unique_timestamps}")

    # Check for nulls
    null_timestamps = raw_df['timestamp'].isna().sum()
    print(f"Number of null timestamp values: {null_timestamps} ({null_timestamps/len(raw_df)*100:.2f}%)")

    # Try different timestamp formats
    print("\nTrying different timestamp interpretations:")

    # First, get non-null values
    non_null_timestamps = raw_df['timestamp'].dropna()

    if len(non_null_timestamps) > 0:
        sample_timestamp = non_null_timestamps.iloc[0]

        print(f"Sample timestamp: {sample_timestamp}")

        # pandas returns NumPy scalars here (e.g. numpy.int64), which are NOT
        # instances of the built-in int; isinstance(x, (int, float)) would
        # silently skip integer columns, so use pandas' own numeric check.
        if pd.api.types.is_number(sample_timestamp):
            # Interpret the value as seconds, then as milliseconds, since epoch
            for unit, unit_name in (('s', 'seconds'), ('ms', 'milliseconds')):
                try:
                    interpreted = pd.to_datetime(sample_timestamp, unit=unit)
                    print(f"As Unix timestamp ({unit_name}): {interpreted}")
                except Exception as e:
                    print(f"Not a Unix timestamp ({unit_name}): {e}")
        else:
            print(f"Sample timestamp is not numeric (type: {type(sample_timestamp)}); "
                  "skipping Unix epoch interpretations")

In [None]:
# Check if there's another potential date source
if 'repliedAt' in raw_df.columns:
    print("Sample values from 'repliedAt' column:")
    # Cap the sample size: Series.sample(10) raises ValueError on < 10 rows
    print(raw_df['repliedAt'].sample(min(10, len(raw_df))).tolist())

    # Check for nulls
    null_replied = raw_df['repliedAt'].isna().sum()
    print(f"Number of null repliedAt values: {null_replied} ({null_replied/len(raw_df)*100:.2f}%)")

    # If there are non-null values, show a few so their format can be inspected
    non_null_replied = raw_df['repliedAt'].dropna()
    if len(non_null_replied) > 0:
        print("\nFirst few non-null repliedAt values:")
        print(non_null_replied.head(5).tolist())

In [None]:
# Attempt to parse dates from multiple sources.
# Work on a copy so raw_df stays untouched; each attempt writes its own
# parsed_date_<n> column so the results can be compared side by side.
processed_df = raw_df.copy()

# Attempt 1: Parse the date column if it's a string
if 'date' in processed_df.columns and pd.api.types.is_object_dtype(processed_df['date']):
    try:
        print("Attempting to parse 'date' column as datetime...")
        processed_df['parsed_date_1'] = pd.to_datetime(processed_df['date'], errors='coerce')
        valid_count = processed_df['parsed_date_1'].notna().sum()
        print(f"Successfully parsed {valid_count} dates ({valid_count/len(processed_df)*100:.2f}%)")
    except Exception as e:
        print(f"Error parsing date column: {e}")

# Attempts 2 and 3: Parse the timestamp column as seconds, then milliseconds, since epoch
if 'timestamp' in processed_df.columns:
    for target_col, unit, unit_name in (
        ('parsed_date_2', 's', 'seconds'),
        ('parsed_date_3', 'ms', 'milliseconds'),
    ):
        try:
            print(f"\nAttempting to parse 'timestamp' column as {unit_name} since epoch...")
            processed_df[target_col] = pd.to_datetime(processed_df['timestamp'], unit=unit, errors='coerce')
            valid_count = processed_df[target_col].notna().sum()
            print(f"Successfully parsed {valid_count} dates ({valid_count/len(processed_df)*100:.2f}%)")
        except Exception as e:
            print(f"Error parsing timestamp column ({unit_name}): {e}")

# Show the results
date_cols = [col for col in processed_df.columns if col.startswith('parsed_date_')]
if date_cols:
    print("\nSample of parsed dates:")
    # Jupyter only auto-displays a cell's last top-level expression; a bare
    # expression nested inside `if` shows nothing, so print it explicitly.
    print(processed_df[date_cols].head(10))

In [None]:
# Probe explicit strptime formats against the string-typed 'date' column
if 'date' in processed_df.columns and pd.api.types.is_object_dtype(processed_df['date']):
    non_null_dates = processed_df['date'].dropna()

    if len(non_null_dates) > 0:
        print("Testing custom date formats on sample values...")
        # Up to five random non-null values act as a quick screen per format
        sample_dates = non_null_dates.sample(min(5, len(non_null_dates))).tolist()

        # Candidate formats, each with an example of what it matches
        formats = [
            '%Y-%m-%d',            # 2023-01-31
            '%Y/%m/%d',            # 2023/01/31
            '%d-%m-%Y',            # 31-01-2023
            '%d/%m/%Y',            # 31/01/2023
            '%m-%d-%Y',            # 01-31-2023
            '%m/%d/%Y',            # 01/31/2023
            '%Y-%m-%d %H:%M:%S',   # 2023-01-31 14:30:45
            '%d-%m-%Y %H:%M:%S',   # 31-01-2023 14:30:45
            '%m-%d-%Y %H:%M:%S',   # 01-31-2023 14:30:45
            '%b %d, %Y',           # Jan 31, 2023
            '%B %d, %Y',           # January 31, 2023
            '%d %b %Y',            # 31 Jan 2023
            '%d %B %Y',            # 31 January 2023
            '%Y%m%d'               # 20230131
        ]

        print(f"Sample dates: {sample_dates}")

        def _parses_with(value, fmt):
            """Return True when `value` is a str that strptime accepts under `fmt`."""
            if not isinstance(value, str):
                return False
            try:
                datetime.strptime(value, fmt)
            except ValueError:
                return False
            return True

        # '%' and ':' are dropped and spaces become '_' to build column names
        name_cleanup = str.maketrans({'%': None, ':': None, ' ': '_'})

        for date_format in formats:
            successful = sum(_parses_with(candidate, date_format) for candidate in sample_dates)
            if successful == 0:
                continue

            print(f"Format '{date_format}' worked for {successful}/{len(sample_dates)} samples")

            # The format matched at least one sample: apply it to the whole column
            col_name = f'parsed_date_{date_format.translate(name_cleanup)}'
            try:
                processed_df[col_name] = pd.to_datetime(processed_df['date'], format=date_format, errors='coerce')
                valid_count = processed_df[col_name].notna().sum()
                print(f"  Successfully parsed {valid_count} dates ({valid_count/len(processed_df)*100:.2f}%) using {date_format}")
            except Exception as e:
                print(f"  Error applying format {date_format}: {e}")

        print("\nChecking if any custom parsers worked well:")
        # Only columns created by this cell (date_cols holds the earlier attempts)
        custom_date_cols = [
            col for col in processed_df.columns
            if col.startswith('parsed_date_') and col not in date_cols
        ]

        for col in custom_date_cols:
            valid_count = processed_df[col].notna().sum()
            if valid_count > 0:
                print(f"{col}: {valid_count} valid dates ({valid_count/len(processed_df)*100:.2f}%)")
                print(processed_df[col].head(5))

In [None]:
# Examine some review records with version info to see if there's any correlation
if 'version' in raw_df.columns:
    print("Examining records with version info:")
    version_groups = raw_df.groupby('version').size().reset_index(name='count')
    version_groups = version_groups.sort_values('count', ascending=False)
    print(version_groups.head(10))

    # Only select columns that actually exist; indexing with a missing column
    # (e.g. no 'timestamp' in the dataset) would raise KeyError.
    wanted_cols = ['review_id', 'date', 'timestamp', 'version']
    present_cols = [col for col in wanted_cols if col in raw_df.columns]

    # Check the three most frequent versions to see if they have date info
    for version in version_groups['version'].head(3):
        if pd.notna(version):
            print(f"\nSample records for version {version}:")
            version_rows = raw_df[raw_df['version'] == version]  # filter once, reuse
            version_sample = version_rows.sample(min(3, len(version_rows)))
            print(version_sample[present_cols].to_string())

In [ ]:
# Examine the raw API data to check date structures
print("\n=== EXAMINING RAW API DATA ===")

def inspect_acquisition_module():
    """Print raw and transformed review data from the acquisition module.

    Reads the notebook-level ``runner``. When ``runner.acquisition`` is present
    it prints the app info, requests 5 raw reviews and prints their
    date-related fields, dumps the first raw review as JSON and, if the module
    has ``_transform_review``, prints the transformed first review.

    Exceptions raised while talking to the acquisition module are caught and
    printed with a traceback. Returns None.
    """
    import json
    import traceback

    acquisition = getattr(runner, 'acquisition', None)
    if acquisition is None:
        print("Acquisition module not initialized or available")
        return

    print("Accessing acquisition module directly...")

    try:
        # App info first, then a direct raw fetch
        app_info = acquisition.get_app_info()
        print(f"Retrieved app info: {app_info}")

        print("\nMaking a direct call to fetch 5 raw reviews...")
        raw_reviews = acquisition._fetch_reviews(
            app_id=os.environ.get("APP_ID", "in.goindigo.android").strip(),
            lang="en",
            country="us",
            sort="newest",
            count=5,
        )

        if not raw_reviews:
            print("No raw reviews were returned")
        else:
            print(f"Successfully fetched {len(raw_reviews)} raw reviews")

            for position, review in enumerate(raw_reviews, start=1):
                print(f"\nReview {position}:")
                print(f"  Review ID: {review.get('reviewId', 'N/A')}")
                print(f"  Author: {review.get('userName', 'N/A')}")
                print(f"  Rating: {review.get('score', 'N/A')}")

                # The 'at' field is the main item of interest
                print(f"  Raw date field: {review.get('at', 'N/A')} (Type: {type(review.get('at', None))})")
                if 'at' in review:
                    print(f"  Raw date field representation: {review['at']!r}")

                # Other fields printed for comparison
                created_version = review.get('reviewCreatedVersion', 'N/A')
                print(f"  reviewCreatedVersion: {created_version} (Type: {type(created_version)})")

                unix_time = review.get('timeMillis', 'N/A')
                print(f"  timeMillis: {unix_time} (Type: {type(unix_time)})")

                if isinstance(unix_time, (int, float)):
                    # Interpret a numeric timeMillis as seconds, then milliseconds
                    for unit, unit_name in (('s', 'seconds'), ('ms', 'milliseconds')):
                        try:
                            interpreted = pd.to_datetime(unix_time, unit=unit)
                            print(f"  As {unit_name} since epoch: {interpreted}")
                        except Exception as e:
                            print(f"  Not a valid {unit_name} timestamp: {e}")

            # Full dump of the first review; default=str stringifies values
            # that json cannot serialize natively
            print("\nComplete structure of one review (first):")
            print(json.dumps(raw_reviews[0], indent=2, default=str))

        # Run the module's own transformation on the first raw review
        print("\nExamining the acquisition module's transformation process...")
        if not hasattr(acquisition, '_transform_review'):
            print("Transformation method not found in acquisition module")
            return

        raw_review = raw_reviews[0] if raw_reviews else None
        if not raw_review:
            print("No raw review available for transformation test")
            return

        print("Transforming a raw review...")
        try:
            transformed = acquisition._transform_review(raw_review)
            print("\nTransformed review:")
            for key, value in transformed.items():
                print(f"  {key}: {value} (Type: {type(value)})")

            # Side-by-side view of the date before and after transformation
            if 'date' in transformed:
                print("\nFocus on date transformation:")
                print(f"  Original 'at': {raw_review.get('at', 'N/A')} (Type: {type(raw_review.get('at', None))})")
                print(f"  Original 'timeMillis': {raw_review.get('timeMillis', 'N/A')} (Type: {type(raw_review.get('timeMillis', None))})")
                print(f"  Transformed 'date': {transformed.get('date', 'N/A')} (Type: {type(transformed.get('date', None))})")
        except Exception as e:
            print(f"Error during transformation: {e}")
            traceback.print_exc()
    except Exception as e:
        print(f"Error inspecting acquisition module: {e}")
        traceback.print_exc()

# Run the acquisition inspection        
inspect_acquisition_module()

In [ ]:
# Check date handling in our source code
print("\n=== EXAMINING SOURCE CODE DATE HANDLING ===")
try:
    # Try to find the specific code in acquisition module that handles dates
    source_file = os.path.join(project_root, 'src', 'modules', 'acquisition', 'google_play.py')

    if os.path.exists(source_file):
        print(f"Looking for date handling in {source_file}")
        # Python source is UTF-8 by default; read it as such instead of relying
        # on the platform's locale encoding.
        with open(source_file, 'r', encoding='utf-8') as f:
            source_code = f.read()

        # Broad heuristics for lines that might touch dates. '.' does not match
        # newlines, so every match is (part of) a single source line.
        import re
        date_patterns = [
            r'.*date.*=.*',
            r'.*time.*=.*',
            r'.*at.*=.*',
            r'.*parse.*',
            r'.*datetime.*',
            r'.*pd\.to_datetime.*'
        ]

        print("Lines potentially related to date handling:")
        # A set gives O(1) de-duplication across overlapping patterns
        unique_lines = set()
        for pattern in date_patterns:
            for match in re.findall(pattern, source_code, re.IGNORECASE):
                match = match.strip()
                # Skip blank matches and comment-only lines
                if match and not match.startswith('#'):
                    unique_lines.add(match)

        # Sort lines to group related code
        found_lines = sorted(unique_lines)
        for line in found_lines:
            print(f"  {line}")
    else:
        print(f"Source file not found: {source_file}")
except Exception as e:
    print(f"Error examining source code: {e}")
    
# Suggested solutions, printed one per line
suggested_solutions = (
    "1. Check if the 'at' field from the API is properly converted to a datetime object",
    "2. Try using 'timeMillis' from the raw API response as a milliseconds timestamp",
    "3. Add explicit error handling in the date parsing code to better diagnose issues",
    "4. Examine preprocessing hooks to ensure dates aren't being dropped during cleaning",
    "5. Add a backup date parsing strategy if the primary method fails",
)
print("\n=== SUGGESTED SOLUTIONS ===")
print("Based on the investigation, here are potential solutions for the date parsing issue:")
for suggestion in suggested_solutions:
    print(suggestion)

# Prototype fix, kept as a list of lines so it prints exactly as written
prototype_fix_lines = [
    "```python",
    "# In the _transform_review method in google_play.py",
    "def _transform_review(self, review):",
    "    # ... existing code ... ",
    "    # Fix date parsing issues",
    "    at_date = None",
    "    try:",
    "        # First try to parse the 'at' field if it exists",
    "        if 'at' in review and review['at']:",
    "            at_date = pd.to_datetime(review['at'])",
    "    except Exception as e:",
    "        print(f\"Warning: Could not parse 'at' field: {e}\")",
    "    ",
    "    # If 'at' parsing failed, try timeMillis as a backup",
    "    if at_date is None and 'timeMillis' in review:",
    "        try:",
    "            # Try parsing as milliseconds since epoch",
    "            at_date = pd.to_datetime(review['timeMillis'], unit='ms')",
    "        except Exception as e:",
    "            print(f\"Warning: Could not parse 'timeMillis' field: {e}\")",
    "    ",
    "    transformed_review['date'] = at_date",
    "    # ... rest of the method ... ",
    "```",
]
print("\n=== PROTOTYPE FIX ===")
print("Here's a potential fix for the date parsing issue:")
print("\n".join(prototype_fix_lines))

In [ ]:
# Test the proposed fix directly in the notebook
print("\n=== TESTING PROPOSED FIX ===")

def _parse_at_only(review):
    """Simulate the current approach: parse only the 'at' field.

    Returns whatever ``pd.to_datetime`` yields for a truthy 'at' value, or None
    when 'at' is missing or falsy. Parsing errors propagate to the caller.
    """
    if review.get('at'):
        return pd.to_datetime(review['at'])
    return None


def _parse_with_fallback(review):
    """Proposed fix: parse 'at', falling back to 'timeMillis' (ms since epoch).

    NaT counts as a failed parse: ``pd.to_datetime`` can return NaT instead of
    raising (e.g. for NaN input), and NaT ``is not None``, so a plain None
    check would never trigger the fallback. Returns None when neither field
    yields a valid timestamp.
    """
    at_date = None
    try:
        # First try to parse the 'at' field if it exists
        if review.get('at'):
            at_date = pd.to_datetime(review['at'])
    except Exception as e:
        print(f"Warning: Could not parse 'at' field for {review['reviewId']}: {e}")

    # If 'at' parsing failed (exception, missing, or NaT), try timeMillis
    if pd.isna(at_date) and review.get('timeMillis'):
        try:
            at_date = pd.to_datetime(review['timeMillis'], unit='ms')
        except Exception as e:
            print(f"Warning: Could not parse 'timeMillis' field for {review['reviewId']}: {e}")

    return None if pd.isna(at_date) else at_date


def test_fix_with_mock_data(mock_reviews=None):
    """Compare 'at'-only date parsing with the proposed timeMillis fallback.

    Args:
        mock_reviews: optional list of review dicts, each with 'reviewId' and
            'score' and optionally 'at' / 'timeMillis'. When omitted, five
            built-in mock reviews covering valid, missing and invalid values
            are used.

    Returns:
        (orig_df, fixed_df): DataFrames built from records with keys
        'review_id', 'date', 'rating' for the original and the fixed approach.

    Prints a comparison table and success counts. A date counts as successful
    only when it is neither None nor NaT.
    """
    if mock_reviews is None:
        # Mock review data similar to what we'd get from the API
        mock_reviews = [
            {
                'reviewId': 'mock_review_1',
                'userName': 'Test User 1',
                'score': 4,
                'at': '2023-05-01T14:30:45Z',  # Standard ISO format
                'timeMillis': 1682951445000,   # Same date in milliseconds
            },
            {
                'reviewId': 'mock_review_2',
                'userName': 'Test User 2',
                'score': 3,
                'at': 'May 15, 2023',          # Different format that might fail
                'timeMillis': 1684159845000,   # May 15, 2023
            },
            {
                'reviewId': 'mock_review_3',
                'userName': 'Test User 3',
                'score': 5,
                'at': None,                    # Missing 'at' field
                'timeMillis': 1686751845000,   # June 14, 2023
            },
            {
                'reviewId': 'mock_review_4',
                'userName': 'Test User 4',
                'score': 2,
                'at': 'Invalid date string',   # Invalid format
                'timeMillis': 1689343845000,   # July 14, 2023
            },
            {
                'reviewId': 'mock_review_5',
                'userName': 'Test User 5',
                'score': 1,
                'at': '2023-08-14T10:15:30Z',  # Standard ISO format
                'timeMillis': None,            # Missing timeMillis
            }
        ]

    total = len(mock_reviews)
    print(f"Testing with {total} mock reviews")

    # Original processing approach (simulated)
    original_results = []
    for review in mock_reviews:
        try:
            date = _parse_at_only(review)
        except Exception as e:
            date = None  # Date parsing failed
            print(f"Original approach failed for {review['reviewId']}: {e}")
        original_results.append({
            'review_id': review['reviewId'],
            'date': date,
            'rating': review['score']
        })

    # New approach with the fix
    fixed_results = [
        {
            'review_id': review['reviewId'],
            'date': _parse_with_fallback(review),
            'rating': review['score']
        }
        for review in mock_reviews
    ]

    # Compare results
    print("\nResults comparison:")
    print("| Review ID    | Original Date          | Fixed Date             |")
    print("|--------------|------------------------|------------------------|")
    for orig, fixed in zip(original_results, fixed_results):
        orig_date = str(orig['date']) if orig['date'] is not None else "None"
        fixed_date = str(fixed['date']) if fixed['date'] is not None else "None"
        print(f"| {orig['review_id']:<12} | {orig_date:<22} | {fixed_date:<22} |")

    # Count successful parses; pd.notna rejects both None and NaT
    orig_success = sum(1 for r in original_results if pd.notna(r['date']))
    fixed_success = sum(1 for r in fixed_results if pd.notna(r['date']))
    orig_pct = orig_success / total * 100 if total else 0.0
    fixed_pct = fixed_success / total * 100 if total else 0.0

    print(f"\nOriginal approach: {orig_success}/{total} successful date parses ({orig_pct:.1f}%)")
    print(f"Fixed approach: {fixed_success}/{total} successful date parses ({fixed_pct:.1f}%)")

    # Create DataFrames for comparison
    orig_df = pd.DataFrame(original_results)
    fixed_df = pd.DataFrame(fixed_results)

    print("\nOriginal DataFrame:")
    print(orig_df)

    print("\nFixed DataFrame:")
    print(fixed_df)

    return orig_df, fixed_df

# Run the test with mock data
orig_df, fixed_df = test_fix_with_mock_data()

# Try with real data if it was fetched
if 'fresh_reviews_df' in locals() and fresh_reviews_df is not None and not fresh_reviews_df.empty:
    print("\n=== TESTING WITH REAL DATA ===")

    # Get raw reviews directly from acquisition module if possible
    try:
        if hasattr(runner, 'acquisition') and runner.acquisition is not None:
            # Make a direct call to the API again
            raw_reviews = runner.acquisition._fetch_reviews(
                app_id=os.environ.get("APP_ID", "in.goindigo.android").strip(),
                lang="en",
                country="us",
                sort="newest",
                count=5
            )

            if raw_reviews:
                print(f"Testing the fix with {len(raw_reviews)} real reviews")

                original_dates = []
                fixed_dates = []
                for review in raw_reviews:
                    review_id = str(review.get('reviewId', 'unknown'))

                    # Original approach: parse 'at' only
                    try:
                        date = pd.to_datetime(review['at']) if review.get('at') else None
                    except Exception:
                        date = None
                    original_dates.append({'review_id': review_id, 'date': date})

                    # Fix: start from the same 'at' result and fall back to
                    # timeMillis when it is None *or NaT*. pd.to_datetime can
                    # return NaT without raising, and NaT "is not None".
                    at_date = date
                    if pd.isna(at_date) and review.get('timeMillis'):
                        try:
                            at_date = pd.to_datetime(review['timeMillis'], unit='ms')
                        except Exception:
                            at_date = None
                    if pd.isna(at_date):
                        at_date = None
                    fixed_dates.append({'review_id': review_id, 'date': at_date})

                # Compare results
                print("\nResults with real data:")
                print("| Review ID                | Original Date          | Fixed Date             |")
                print("|--------------------------|------------------------|------------------------|")
                for orig, fixed in zip(original_dates, fixed_dates):
                    orig_date = str(orig['date']) if orig['date'] is not None else "None"
                    fixed_date = str(fixed['date']) if fixed['date'] is not None else "None"
                    print(f"| {orig['review_id']:<24} | {orig_date:<22} | {fixed_date:<22} |")

                # Count successful parses; pd.notna rejects both None and NaT
                orig_success = sum(1 for r in original_dates if pd.notna(r['date']))
                fixed_success = sum(1 for r in fixed_dates if pd.notna(r['date']))

                print(f"\nOriginal approach: {orig_success}/{len(raw_reviews)} successful date parses ({orig_success/len(raw_reviews)*100:.1f}%)")
                print(f"Fixed approach: {fixed_success}/{len(raw_reviews)} successful date parses ({fixed_success/len(raw_reviews)*100:.1f}%)")
            else:
                print("No raw reviews available for testing with real data")
        else:
            # Report the skip instead of silently doing nothing
            print("Acquisition module not initialized or available; skipping real-data test")
    except Exception as e:
        print(f"Error testing with real data: {e}")
else:
    print("\nNo fresh reviews data available for testing the fix with real data")

# Final recommendations, printed one per line
final_recommendations = (
    "1. Modify the _transform_review method in google_play.py to implement the date parsing fix",
    "2. Add explicit logging for date parsing failures to help diagnose issues",
    "3. Add a unit test specifically for date parsing to ensure it works consistently",
    "4. Consider using the timeMillis field as the primary date source since it's more reliable",
)
print("\n=== FINAL RECOMMENDATIONS ===")
print("Based on the testing, the recommended approach is:")
for recommendation in final_recommendations:
    print(recommendation)

In [None]:
# Summary of findings
print("SUMMARY OF DATE DEBUGGING:")
print("==========================")

# Null counts and dtypes of the raw date-like columns
if 'date' in raw_df.columns:
    null_dates = raw_df['date'].isna().sum()
    print(f"Date column: {null_dates}/{len(raw_df)} null values ({null_dates/len(raw_df)*100:.2f}%)")
    date_dtype_label = (
        "string/object"
        if pd.api.types.is_object_dtype(raw_df['date'])
        else raw_df['date'].dtype
    )
    print(f"  Data type: {date_dtype_label}")

if 'timestamp' in raw_df.columns:
    null_timestamps = raw_df['timestamp'].isna().sum()
    print(f"Timestamp column: {null_timestamps}/{len(raw_df)} null values ({null_timestamps/len(raw_df)*100:.2f}%)")
    print(f"  Data type: {raw_df['timestamp'].dtype}")

# Pick the parsed_date_* column with the most valid values (first one wins ties)
all_date_cols = [col for col in processed_df.columns if col.startswith('parsed_date_')]
if not all_date_cols:
    print("\nNo successful date parsing attempts")
else:
    valid_counts = {col: processed_df[col].notna().sum() for col in all_date_cols}
    top_col = max(all_date_cols, key=valid_counts.get)
    if valid_counts[top_col] > 0:
        best_col, best_valid_count = top_col, valid_counts[top_col]
    else:
        # Every attempt produced only NaT: there is no best column
        best_col, best_valid_count = None, 0

    if best_col:
        print(f"\nBest parsed date column: {best_col}")
        print(f"  Valid dates: {best_valid_count}/{len(processed_df)} ({best_valid_count/len(processed_df)*100:.2f}%)")
        print("  Date range:")
        print(f"    Min: {processed_df[best_col].min()}")
        print(f"    Max: {processed_df[best_col].max()}")

        # best_col has at least one valid value here, so the sample is non-empty
        print("\n  Sample of valid dates:")
        print(processed_df.loc[processed_df[best_col].notna(), [best_col]].head(5))

# Recommendation
print("\nRECOMMENDATION:")
if all_date_cols and best_valid_count > 0:
    print(f"Use the {best_col} column for date information. It successfully parsed {best_valid_count}/{len(processed_df)} dates.")
    print("Update the data loading code to use this parsing approach.")
else:
    for fallback_line in (
        "The date information in the dataset appears to be missing or in an unrecognized format.",
        "Options:",
        "1. Continue using synthetic dates for visualization purposes",
        "2. Investigate the data source to determine the correct date format",
        "3. Consider using other metadata (like version numbers) as a proxy for timeframes",
    ):
        print(fallback_line)