In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Configuration
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)  # Show more columns
pd.set_option('display.max_rows', 100)     # Show more rows
sns.set_style('whitegrid')                 # Pretty plots

# Print versions (for reproducibility)
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")
print("\n‚úÖ Libraries loaded successfully!")

# Define data directory
# CHANGE THIS to match your folder location!
DATA_DIR = Path('data/raw')  # If data is in 'data/raw' folder

# OR use full path:
# DATA_DIR = Path('/Users/yourname/sports-betting-analytics/data/raw')  # Mac
# DATA_DIR = Path('C:/Users/yourname/sports-betting-analytics/data/raw')  # Windows

# Verify directory exists
if not DATA_DIR.exists():
    print(f"‚ùå ERROR: Directory not found: {DATA_DIR}")
    print("Please update DATA_DIR to point to your data folder")
else:
    print(f"‚úÖ Data directory found: {DATA_DIR}")
    
    # List all CSV files
    csv_files = list(DATA_DIR.glob('*.csv'))
    print(f"\nüìÅ Found {len(csv_files)} CSV files:")
    for f in sorted(csv_files):
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f"   - {f.name:40s} ({size_mb:6.2f} MB)")


# Load Table 1: games.csv (PRIMARY DATASET)
print("Loading: games.csv")
print("="*70)

try:
    games = pd.read_csv(DATA_DIR / 'games.csv')
    
    print(f"‚úÖ Loaded successfully!")
    print(f"\nüìä Dimensions: {games.shape[0]:,} rows √ó {games.shape[1]} columns")
    print(f"\nüìÖ Date range: {games['GAME_DATE_EST'].min()} to {games['GAME_DATE_EST'].max()}")
    
    print(f"\nüìã Columns:")
    print(games.columns.tolist())
    
    print(f"\nüîç First 3 rows:")
    display(games.head(3))
    
    print(f"\nüìà Basic Statistics:")
    print(games[['PTS_home', 'PTS_away', 'FG_PCT_home', 'FG_PCT_away']].describe())
    
except FileNotFoundError:
    print("‚ùå File not found! Check filename: should be 'games.csv'")
except Exception as e:
    print(f"‚ùå Error loading file: {e}")


# Check for missing values
print("‚ö†Ô∏è  MISSING VALUES CHECK - games table")
print("="*70)

missing = games.isnull().sum()
missing_pct = (missing / len(games)) * 100

missing_df = pd.DataFrame({
    'Column': missing.index,
    'Missing_Count': missing.values,
    'Percentage': missing_pct.values
})

# Only show columns with missing values
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_df) > 0:
    print(f"\n‚ùå Found {len(missing_df)} columns with missing values:\n")
    display(missing_df)
else:
    print("\n‚úÖ No missing values in games table!")

# Total missing
total_cells = games.shape[0] * games.shape[1]
total_missing = games.isnull().sum().sum()
print(f"\nTotal missing cells: {total_missing:,} out of {total_cells:,} ({total_missing/total_cells*100:.2f}%)")


# Quality checks for games table
print("üîç GAMES TABLE - QUALITY CHECKS")
print("="*70)

# Check 1: Score ranges (should be 50-150 typically)
print("\n1Ô∏è‚É£ Score Statistics:")
print(f"   Home scores: {games['PTS_home'].min()} to {games['PTS_home'].max()} (avg: {games['PTS_home'].mean():.1f})")
print(f"   Away scores: {games['PTS_away'].min()} to {games['PTS_away'].max()} (avg: {games['PTS_away'].mean():.1f})")

# Check for impossible scores
impossible_low = games[(games['PTS_home'] < 50) | (games['PTS_away'] < 50)]
impossible_high = games[(games['PTS_home'] > 200) | (games['PTS_away'] > 200)]
print(f"\n   Games with unusually low scores (<50): {len(impossible_low)}")
if len(impossible_low) > 0:
    print("   ‚ö†Ô∏è  WARNING: These might be data errors or incomplete games")
print(f"   Games with unusually high scores (>200): {len(impossible_high)}")
if len(impossible_high) > 0:
    print("   ‚ö†Ô∏è  WARNING: Likely data entry errors")

# Check 2: Shooting percentages (should be 0.0-1.0)
print("\n2Ô∏è‚É£ Shooting Percentages (should be between 0 and 1):")
pct_cols = ['FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 
            'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away']

for col in pct_cols:
    if col in games.columns:
        invalid = games[(games[col] < 0) | (games[col] > 1.0)].dropna()
        if len(invalid) > 0:
            print(f"   ‚ö†Ô∏è  {col}: {len(invalid)} invalid values (outside 0-1 range)")
        else:
            print(f"   ‚úÖ {col}: All values valid")

# Check 3: Duplicates
print("\n3Ô∏è‚É£ Duplicate Check:")
duplicates = games.duplicated(subset=['GAME_ID']).sum()
print(f"   Duplicate GAME_IDs: {duplicates}")
if duplicates > 0:
    print("   ‚ö†Ô∏è  WARNING: Same game appears multiple times - need to remove duplicates!")
else:
    print("   ‚úÖ No duplicates - each game is unique")

# Check 4: HOME_TEAM_WINS consistency
print("\n4Ô∏è‚É£ Win/Loss Logic Check:")
games['calculated_home_win'] = (games['PTS_home'] > games['PTS_away']).astype(int)
mismatches = (games['HOME_TEAM_WINS'] != games['calculated_home_win']).sum()
print(f"   Mismatches between HOME_TEAM_WINS column and actual scores: {mismatches}")
if mismatches > 0:
    print("   ‚ö†Ô∏è  WARNING: Win column doesn't match scores - data error!")
    # Show examples
    print("\n   Example mismatches:")
    display(games[games['HOME_TEAM_WINS'] != games['calculated_home_win']][
        ['GAME_DATE_EST', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID', 'PTS_home', 'PTS_away', 'HOME_TEAM_WINS']
    ].head())
else:
    print("   ‚úÖ Win column matches actual results perfectly")

print("\n" + "="*70)

# Load Table 2: games_details.csv
print("Loading: games_details.csv (Player-level stats)")
print("="*70)

try:
    games_details = pd.read_csv(DATA_DIR / 'games_details.csv')
    
    print(f"‚úÖ Loaded successfully!")
    print(f"\nüìä Dimensions: {games_details.shape[0]:,} rows √ó {games_details.shape[1]} columns")
    
    # Unique counts
    print(f"\nüî¢ Unique values:")
    print(f"   Unique games: {games_details['GAME_ID'].nunique():,}")
    print(f"   Unique players: {games_details['PLAYER_ID'].nunique():,}")
    print(f"   Unique teams: {games_details['TEAM_ID'].nunique()}")
    
    print(f"\nüîç Sample data:")
    display(games_details.head(3))
    
    # Check columns
    print(f"\nüìã Columns ({len(games_details.columns)}):")
    print(games_details.columns.tolist())
    
except FileNotFoundError:
    print("‚ùå File not found! Check filename.")
    games_details = None
except Exception as e:
    print(f"‚ùå Error: {e}")
    games_details = None


if games_details is not None:
    print("‚ö†Ô∏è  MISSING VALUES - games_details")
    print("="*70)
    
    missing = games_details.isnull().sum()
    missing_pct = (missing / len(games_details)) * 100
    
    missing_df = pd.DataFrame({
        'Column': missing.index,
        'Missing_Count': missing.values,
        'Percentage': missing_pct.values
    })
    
    # Only show columns with >1% missing
    missing_df = missing_df[missing_df['Percentage'] > 1.0].sort_values('Missing_Count', ascending=False)
    
    if len(missing_df) > 0:
        print(f"\nColumns with >1% missing values:\n")
        display(missing_df)
        
        print("\nüí° NOTES:")
        print("   - COMMENT: Expected to be NULL (only filled when player didn't play)")
        print("   - NICKNAME: Some players don't have nicknames")
        print("   - START_POSITION: NULL for bench players (expected!)")
        print("   - MIN: NULL for DNP (Did Not Play) players")
    else:
        print("\n‚úÖ No significant missing values!")


if games_details is not None:
    print("üîç PLAYER PARTICIPATION ANALYSIS")
    print("="*70)
    
    # Convert MIN to numeric (it's stored as "25:30" string format)
    # For now, just check if it's null or "0:00"
    
    # Players who didn't play
    dnp_players = games_details[
        games_details['MIN'].isna() | 
        (games_details['MIN'] == '0:00') |
        (games_details['MIN'] == '0')
    ]
    
    print(f"\nPlayers listed but Did Not Play (DNP): {len(dnp_players):,}")
    print(f"Percentage: {len(dnp_players)/len(games_details)*100:.1f}%")
    
    # Players who played
    played = games_details[
        games_details['MIN'].notna() & 
        (games_details['MIN'] != '0:00') &
        (games_details['MIN'] != '0')
    ]
    
    print(f"\nPlayers who actually played: {len(played):,}")
    print(f"Percentage: {len(played)/len(games_details)*100:.1f}%")
    
    # Average players per game
    players_per_game = games_details.groupby('GAME_ID').size()
    print(f"\nPlayers listed per game:")
    print(f"   Average: {players_per_game.mean():.1f}")
    print(f"   Min: {players_per_game.min()}")
    print(f"   Max: {players_per_game.max()}")
    print("\nüí° Typical: 24-28 players (2 teams √ó 12-14 roster spots)")
    
    # Check for games with suspiciously few players
    few_players = players_per_game[players_per_game < 20]
    if len(few_players) > 0:
        print(f"\n‚ö†Ô∏è  {len(few_players)} games have <20 players listed (unusual!)")

# Load remaining tables from Dataset 1
print("Loading remaining tables...")
print("="*70)

# teams.csv
try:
    teams = pd.read_csv(DATA_DIR / 'teams.csv')
    print(f"‚úÖ teams.csv: {len(teams)} rows √ó {teams.shape[1]} columns")
    display(teams.head(3))
except:
    print("‚ö†Ô∏è  teams.csv not found")
    teams = None

print("\n" + "-"*70 + "\n")

# ranking.csv
try:
    ranking = pd.read_csv(DATA_DIR / 'ranking.csv')
    print(f"‚úÖ ranking.csv: {len(ranking):,} rows √ó {ranking.shape[1]} columns")
    display(ranking.head(3))
except:
    print("‚ö†Ô∏è  ranking.csv not found")
    ranking = None

print("\n" + "-"*70 + "\n")

# players.csv
try:
    players = pd.read_csv(DATA_DIR / 'players.csv')
    print(f"‚úÖ players.csv: {len(players):,} rows √ó {players.shape[1]} columns")
    display(players.head(3))
except:
    print("‚ö†Ô∏è  players.csv not found")
    players = None


# Load Seasons_Stats.csv (Advanced player metrics)
print("Loading: Seasons_Stats.csv (Dataset 2 - Player advanced stats)")
print("="*70)

try:
    seasons_stats = pd.read_csv(DATA_DIR / 'Seasons_Stats.csv')
    
    print(f"‚úÖ Loaded successfully!")
    print(f"\nüìä Dimensions: {len(seasons_stats):,} rows √ó {seasons_stats.shape[1]} columns")
    print(f"\nüìÖ Years covered: {seasons_stats['Year'].min()} to {seasons_stats['Year'].max()}")
    
    print(f"\nüìã Advanced metrics available:")
    advanced_cols = ['PER', 'TS%', 'USG%', 'OWS', 'DWS', 'WS', 'VORP', 'BPM']
    available = [col for col in advanced_cols if col in seasons_stats.columns]
    print(f"   {', '.join(available)}")
    
    print(f"\nüîç Sample data:")
    display(seasons_stats[['Year', 'Player', 'Tm', 'PTS', 'AST', 'TRB']].head(3))
    
except FileNotFoundError:
    print("‚ö†Ô∏è  Seasons_Stats.csv not found (Dataset 2)")
    seasons_stats = None
except Exception as e:
    print(f"‚ùå Error: {e}")
    seasons_stats = None

# Load betting data files
print("Loading: BETTING DATA (Dataset 3)")
print("="*70)

betting_files = [
    'nba_betting_money_line.csv',
    'nba_betting_spread.csv',
    'nba_betting_totals.csv',
    'nba_games_all.csv'
]

betting_data = {}
for filename in betting_files:
    filepath = DATA_DIR / filename
    if filepath.exists():
        try:
            df = pd.read_csv(filepath)
            betting_data[filename] = df
            print(f"‚úÖ {filename}: {len(df):,} rows √ó {df.shape[1]} columns")
            
            # Show sample
            print("   Sample:")
            display(df.head(2))
            print()
            
        except Exception as e:
            print(f"‚ùå Error loading {filename}: {e}\n")
    else:
        print(f"‚ö†Ô∏è  Not found: {filename}\n")

if len(betting_data) == 0:
    print("‚ö†Ô∏è  WARNING: No betting data found!")
    print("   Betting data is CRITICAL for this project.")
    print("   Check if filenames are different in your download.")
else:
    print(f"\n‚úÖ Loaded {len(betting_data)} betting data files!")


# Check how tables connect (foreign keys)
print("üîó TABLE RELATIONSHIPS")
print("="*70)

# 1. games ‚Üí teams (HOME_TEAM_ID, VISITOR_TEAM_ID)
if teams is not None:
    print("\n1Ô∏è‚É£ games ‚Üí teams:")
    games_home_teams = set(games['HOME_TEAM_ID'].unique())
    games_away_teams = set(games['VISITOR_TEAM_ID'].unique())
    all_game_teams = games_home_teams.union(games_away_teams)
    teams_in_master = set(teams['TEAM_ID'].unique())
    
    print(f"   Teams in games: {len(all_game_teams)}")
    print(f"   Teams in teams table: {len(teams_in_master)}")
    
    # Missing teams?
    missing_teams = all_game_teams - teams_in_master
    if missing_teams:
        print(f"   ‚ö†Ô∏è  Teams in games but not in teams table: {missing_teams}")
    else:
        print(f"   ‚úÖ All teams accounted for!")

# 2. games_details ‚Üí games (GAME_ID)
if games_details is not None:
    print("\n2Ô∏è‚É£ games_details ‚Üí games:")
    games_in_details = set(games_details['GAME_ID'].unique())
    games_in_games = set(games['GAME_ID'].unique())
    
    print(f"   Unique games in games: {len(games_in_games):,}")
    print(f"   Unique games in games_details: {len(games_in_details):,}")
    
    # Perfect match?
    games_missing_details = games_in_games - games_in_details
    details_missing_games = games_in_details - games_in_games
    
    if games_missing_details:
        print(f"   ‚ö†Ô∏è  Games without player details: {len(games_missing_details):,}")
    if details_missing_games:
        print(f"   ‚ö†Ô∏è  Player details for games not in games table: {len(details_missing_games):,}")
    if not games_missing_details and not details_missing_games:
        print(f"   ‚úÖ Perfect match - all games have player details!")
    
    # Calculate JOIN success rate
    join_success_rate = (len(games_in_details.intersection(games_in_games)) / 
                         len(games_in_games)) * 100
    print(f"\n   üìä JOIN Success Rate: {join_success_rate:.1f}%")
    if join_success_rate < 95:
        print("   ‚ö†Ô∏è  Low success rate - many games missing details!")

# 3. Check if betting data links to games
if betting_data:
    print("\n3Ô∏è‚É£ betting_data ‚Üí games:")
    for filename, df in betting_data.items():
        if 'GAME_ID' in df.columns or 'game_id' in df.columns:
            game_id_col = 'GAME_ID' if 'GAME_ID' in df.columns else 'game_id'
            betting_games = set(df[game_id_col].unique())
            match_rate = (len(betting_games.intersection(games_in_games)) / 
                         len(betting_games)) * 100
            print(f"   {filename}: {match_rate:.1f}% of betting games found in main games table")



            # Create comprehensive summary
print("üìã COMPLETE DATA INVENTORY")
print("="*70)

inventory = []

# Check each table
tables_check = {
    'games': ('games', 'Game results with scores'),
    'games_details': ('games_details', 'Player stats per game'),
    'teams': ('teams', 'Team information'),
    'ranking': ('ranking', 'Team standings'),
    'players': ('players', 'Player-season mapping'),
    'seasons_stats': ('seasons_stats', 'Player advanced stats'),
}

for var_name, (display_name, description) in tables_check.items():
    if var_name in locals() and locals()[var_name] is not None:
        df = locals()[var_name]
        inventory.append({
            'Table': display_name,
            'Description': description,
            'Rows': f"{len(df):,}",
            'Columns': df.shape[1],
            'Status': '‚úÖ'
        })
    else:
        inventory.append({
            'Table': display_name,
            'Description': description,
            'Rows': 'N/A',
            'Columns': 'N/A',
            'Status': '‚ùå'
        })

inventory_df = pd.DataFrame(inventory)
display(inventory_df)

# Betting data
print(f"\nüí∞ Betting Data Files: {len(betting_data)}")
if betting_data:
    for filename, df in betting_data.items():
        print(f"   ‚úÖ {filename}: {len(df):,} rows")
else:
    print("   ‚ùå No betting data loaded")



# Generate insights and issues
print("üéØ KEY INSIGHTS & ISSUES")
print("="*70)

issues = []
insights = []

# Insights
insights.append(f"‚úÖ {len(games):,} NBA games from {games['GAME_DATE_EST'].min()} to {games['GAME_DATE_EST'].max()}")
if games_details is not None:
    insights.append(f"‚úÖ {games_details['PLAYER_ID'].nunique():,} unique players tracked")
if teams is not None:
    insights.append(f"‚úÖ {teams['TEAM_ID'].nunique()} teams in dataset")
if betting_data:
    insights.append(f"‚úÖ Betting odds available for analysis")

# Issues
if games.isnull().sum().sum() > 0:
    issues.append(f"‚ö†Ô∏è  {games.isnull().sum().sum():,} missing values in games table")

if games_details is not None and games_details.isnull().sum().sum() > 10000:
    dnp_count = len(games_details[games_details['MIN'].isna()])
    issues.append(f"‚ö†Ô∏è  {dnp_count:,} player-game records with no minutes (DNP)")

duplicates = games.duplicated(subset=['GAME_ID']).sum()
if duplicates > 0:
    issues.append(f"‚ö†Ô∏è  {duplicates} duplicate game records")

if not betting_data:
    issues.append(f"‚ùå CRITICAL: No betting data found - needed for predictions!")

# Display
print("\nüìä INSIGHTS:")
for insight in insights:
    print(f"   {insight}")

print("\n‚ö†Ô∏è  ISSUES TO ADDRESS:")
if issues:
    for issue in issues:
        print(f"   {issue}")
else:
    print("   ‚úÖ No major issues detected!")

print("\n" + "="*70)


# Final summary
print("="*70)
print("‚úÖ PHASE 1: DATA QUALITY ASSESSMENT COMPLETE!")
print("="*70)

print("\nüìä WHAT YOU HAVE:")
print(f"   ‚Ä¢ {len(games):,} NBA games")
if games_details is not None:
    print(f"   ‚Ä¢ {len(games_details):,} player-game records")
if teams is not None:
    print(f"   ‚Ä¢ {teams['TEAM_ID'].nunique()} teams")
if seasons_stats is not None:
    print(f"   ‚Ä¢ {len(seasons_stats):,} player-season statistics")
if betting_data:
    print(f"   ‚Ä¢ {len(betting_data)} betting data files")

print("\nüéØ NEXT PHASE: Data Cleaning")
print("   We'll fix:")
print("   1. Missing values (decide: drop or fill?)")
print("   2. Duplicate records")
print("   3. Data format issues")
print("   4. Merge betting odds with games")
print("   5. Create master dataset for modeling")

print("\nüí° WHAT TO DO NOW:")
print("   1. Review the output above")
print("   2. Note which issues you found")
print("   3. Reply: 'Phase 1 complete - found [describe issues]'")
print("   4. I'll create Phase 2: Data Cleaning notebook!")

print("\n" + "="*70)
print("Great job! You've successfully assessed your NBA data! üèÄ")
print("="*70)

pandas version: 2.3.3
numpy version: 2.3.5

‚úÖ Libraries loaded successfully!
‚úÖ Data directory found: data\raw

üìÅ Found 20 CSV files:
   - boxscore.csv                             ( 77.50 MB)
   - coaches.csv                              (  0.09 MB)
   - games.csv                                (  3.92 MB)
   - games_details.csv                        ( 88.77 MB)
   - injuries_2010-2020.csv                   (  1.52 MB)
   - nba_betting_money_line.csv               (  7.50 MB)
   - nba_betting_spread.csv                   (  9.42 MB)
   - nba_betting_totals.csv                   (  9.73 MB)
   - nba_games_all.csv                        ( 17.13 MB)
   - nba_players_all.csv                      (  0.62 MB)
   - nba_players_game_stats.csv               (193.72 MB)
   - nba_teams_all.csv                        (  0.00 MB)
   - play_data.csv                            (912.40 MB)
   - player_data.csv                          (  0.33 MB)
   - player_info.csv                          ( 

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-12-22,22200477,Final,1610612740,1610612759,2022,1610612740,126.0,0.484,0.926,0.382,25.0,46.0,1610612759,117.0,0.478,0.815,0.321,23.0,44.0,1
1,2022-12-22,22200478,Final,1610612762,1610612764,2022,1610612762,120.0,0.488,0.952,0.457,16.0,40.0,1610612764,112.0,0.561,0.765,0.333,20.0,37.0,1
2,2022-12-21,22200466,Final,1610612739,1610612749,2022,1610612739,114.0,0.482,0.786,0.313,22.0,37.0,1610612749,106.0,0.47,0.682,0.433,20.0,46.0,1



üìà Basic Statistics:
           PTS_home      PTS_away   FG_PCT_home   FG_PCT_away
count  26552.000000  26552.000000  26552.000000  26552.000000
mean     103.455898    100.639876      0.460735      0.449732
std       13.283370     13.435868      0.056676      0.055551
min       36.000000     33.000000      0.250000      0.244000
25%       94.000000     91.000000      0.422000      0.412000
50%      103.000000    100.000000      0.460000      0.449000
75%      112.000000    110.000000      0.500000      0.487000
max      168.000000    168.000000      0.684000      0.687000
‚ö†Ô∏è  MISSING VALUES CHECK - games table

‚ùå Found 12 columns with missing values:



Unnamed: 0,Column,Missing_Count,Percentage
7,PTS_home,99,0.371468
8,FG_PCT_home,99,0.371468
9,FT_PCT_home,99,0.371468
10,FG3_PCT_home,99,0.371468
11,AST_home,99,0.371468
12,REB_home,99,0.371468
14,PTS_away,99,0.371468
15,FG_PCT_away,99,0.371468
16,FT_PCT_away,99,0.371468
17,FG3_PCT_away,99,0.371468



Total missing cells: 1,188 out of 559,671 (0.21%)
üîç GAMES TABLE - QUALITY CHECKS

1Ô∏è‚É£ Score Statistics:
   Home scores: 36.0 to 168.0 (avg: 103.5)
   Away scores: 33.0 to 168.0 (avg: 100.6)

   Games with unusually low scores (<50): 1
   Games with unusually high scores (>200): 0

2Ô∏è‚É£ Shooting Percentages (should be between 0 and 1):
   ‚úÖ FG_PCT_home: All values valid
   ‚úÖ FT_PCT_home: All values valid
   ‚úÖ FG3_PCT_home: All values valid
   ‚úÖ FG_PCT_away: All values valid
   ‚úÖ FT_PCT_away: All values valid
   ‚úÖ FG3_PCT_away: All values valid

3Ô∏è‚É£ Duplicate Check:
   Duplicate GAME_IDs: 29

4Ô∏è‚É£ Win/Loss Logic Check:
   Mismatches between HOME_TEAM_WINS column and actual scores: 0
   ‚úÖ Win column matches actual results perfectly

Loading: games_details.csv (Player-level stats)
‚úÖ Loaded successfully!

üìä Dimensions: 668,628 rows √ó 29 columns

üî¢ Unique values:
   Unique games: 26,523
   Unique players: 2,687
   Unique teams: 30

üîç Sample data:


Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS
0,22200477,1610612759,SAS,San Antonio,1629641,Romeo Langford,Romeo,F,,18:06,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,2.0,5.0,2.0,-2.0
1,22200477,1610612759,SAS,San Antonio,1631110,Jeremy Sochan,Jeremy,F,,31:01,7.0,14.0,0.5,2.0,4.0,0.5,7.0,10.0,0.7,6.0,3.0,9.0,6.0,1.0,0.0,2.0,1.0,23.0,-14.0
2,22200477,1610612759,SAS,San Antonio,1627751,Jakob Poeltl,Jakob,C,,21:42,6.0,9.0,0.667,0.0,0.0,0.0,1.0,1.0,1.0,1.0,3.0,4.0,1.0,1.0,0.0,2.0,4.0,13.0,-4.0



üìã Columns (29):
['GAME_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'START_POSITION', 'COMMENT', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS']
‚ö†Ô∏è  MISSING VALUES - games_details

Columns with >1% missing values:



Unnamed: 0,Column,Missing_Count,Percentage
6,NICKNAME,615591,92.067787
8,COMMENT,558939,83.594914
7,START_POSITION,412863,61.747788
28,PLUS_MINUS,133351,19.943975
10,FGM,109690,16.405236
20,DREB,109690,16.405236
27,PTS,109690,16.405236
26,PF,109690,16.405236
25,TO,109690,16.405236
24,BLK,109690,16.405236



üí° NOTES:
   - COMMENT: Expected to be NULL (only filled when player didn't play)
   - NICKNAME: Some players don't have nicknames
   - START_POSITION: NULL for bench players (expected!)
   - MIN: NULL for DNP (Did Not Play) players
üîç PLAYER PARTICIPATION ANALYSIS

Players listed but Did Not Play (DNP): 109,870
Percentage: 16.4%

Players who actually played: 558,758
Percentage: 83.6%

Players listed per game:
   Average: 25.2
   Min: 18
   Max: 45

üí° Typical: 24-28 players (2 teams √ó 12-14 roster spots)

‚ö†Ô∏è  19 games have <20 players listed (unusual!)
Loading remaining tables...
‚úÖ teams.csv: 30 rows √ó 14 columns


Unnamed: 0,LEAGUE_ID,TEAM_ID,MIN_YEAR,MAX_YEAR,ABBREVIATION,NICKNAME,YEARFOUNDED,CITY,ARENA,ARENACAPACITY,OWNER,GENERALMANAGER,HEADCOACH,DLEAGUEAFFILIATION
0,0,1610612737,1949,2019,ATL,Hawks,1949,Atlanta,State Farm Arena,18729.0,Tony Ressler,Travis Schlenk,Lloyd Pierce,Erie Bayhawks
1,0,1610612738,1946,2019,BOS,Celtics,1946,Boston,TD Garden,18624.0,Wyc Grousbeck,Danny Ainge,Brad Stevens,Maine Red Claws
2,0,1610612740,2002,2019,NOP,Pelicans,2002,New Orleans,Smoothie King Center,,Tom Benson,Trajan Langdon,Alvin Gentry,No Affiliate



----------------------------------------------------------------------

‚úÖ ranking.csv: 210,342 rows √ó 13 columns


Unnamed: 0,TEAM_ID,LEAGUE_ID,SEASON_ID,STANDINGSDATE,CONFERENCE,TEAM,G,W,L,W_PCT,HOME_RECORD,ROAD_RECORD,RETURNTOPLAY
0,1610612743,0,22022,2022-12-22,West,Denver,30,19,11,0.633,10-3,9-8,
1,1610612763,0,22022,2022-12-22,West,Memphis,30,19,11,0.633,13-2,6-9,
2,1610612740,0,22022,2022-12-22,West,New Orleans,31,19,12,0.613,13-4,6-8,



----------------------------------------------------------------------

‚úÖ players.csv: 3,922 rows √ó 8 columns


Unnamed: 0.1,Unnamed: 0,Player,height,weight,collage,born,birth_city,birth_state
0,0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,,
1,1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,Yorktown,Indiana
2,2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,,


Loading: Seasons_Stats.csv (Dataset 2 - Player advanced stats)
‚úÖ Loaded successfully!

üìä Dimensions: 24,691 rows √ó 53 columns

üìÖ Years covered: 1950.0 to 2017.0

üìã Advanced metrics available:
   PER, TS%, USG%, OWS, DWS, WS, VORP, BPM

üîç Sample data:


Unnamed: 0,Year,Player,Tm,PTS,AST,TRB
0,1950.0,Curly Armstrong,FTW,458.0,176.0,
1,1950.0,Cliff Barker,INO,279.0,109.0,
2,1950.0,Leo Barnhorst,CHS,438.0,140.0,


Loading: BETTING DATA (Dataset 3)
‚úÖ nba_betting_money_line.csv: 125,286 rows √ó 7 columns
   Sample:


Unnamed: 0,game_id,book_name,book_id,team_id,a_team_id,price1,price2
0,41100314,Pinnacle Sports,238,1610612759,1610612760,165.0,-183.0
1,41100314,5Dimes,19,1610612759,1610612760,165.0,-175.0



‚úÖ nba_betting_spread.csv: 131,690 rows √ó 9 columns
   Sample:


Unnamed: 0,game_id,book_name,book_id,team_id,a_team_id,spread1,spread2,price1,price2
0,21000358,Pinnacle Sports,238,1610612749,1610612742,7.5,-7.5,-106.0,-104.0
1,21000358,5Dimes,19,1610612749,1610612742,7.5,-7.5,-110.0,-110.0



‚úÖ nba_betting_totals.csv: 131,386 rows √ó 9 columns
   Sample:


Unnamed: 0,game_id,book_name,book_id,team_id,a_team_id,total1,total2,price1,price2
0,21100131,Pinnacle Sports,238,1610612740,1610612743,192.0,192.0,-105.0,-105.0
1,21100131,5Dimes,19,1610612740,1610612743,192.0,192.0,-105.0,-105.0



‚úÖ nba_games_all.csv: 125,624 rows √ó 32 columns
   Sample:


Unnamed: 0,game_id,game_date,matchup,team_id,is_home,wl,w,l,w_pct,min,fgm,fga,fg_pct,fg3m,fg3a,fg3_pct,ftm,fta,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,pts,a_team_id,season_year,season_type,season
0,20800741,2009-02-06,SAC vs. UTA,1610612762,f,W,29.0,22.0,0.569,240,39.0,74.0,0.527,4.0,13.0,0.308,29.0,37.0,1.0,8.0,36.0,44,19.0,5.0,4.0,18.0,26.0,111,1610612758,2008,Regular Season,2008-09
1,20800701,2009-01-31,POR vs. UTA,1610612762,f,L,26.0,22.0,0.542,240,37.0,68.0,0.544,5.0,20.0,0.25,29.0,37.0,1.0,7.0,20.0,27,17.0,6.0,0.0,15.0,22.0,108,1610612757,2008,Regular Season,2008-09




‚úÖ Loaded 4 betting data files!
üîó TABLE RELATIONSHIPS

1Ô∏è‚É£ games ‚Üí teams:
   Teams in games: 30
   Teams in teams table: 30
   ‚úÖ All teams accounted for!

2Ô∏è‚É£ games_details ‚Üí games:
   Unique games in games: 26,622
   Unique games in games_details: 26,523
   ‚ö†Ô∏è  Games without player details: 99

   üìä JOIN Success Rate: 99.6%

3Ô∏è‚É£ betting_data ‚Üí games:
   nba_betting_money_line.csv: 100.0% of betting games found in main games table
   nba_betting_spread.csv: 100.0% of betting games found in main games table
   nba_betting_totals.csv: 100.0% of betting games found in main games table
   nba_games_all.csv: 32.9% of betting games found in main games table
üìã COMPLETE DATA INVENTORY


Unnamed: 0,Table,Description,Rows,Columns,Status
0,games,Game results with scores,26651,22,‚úÖ
1,games_details,Player stats per game,668628,29,‚úÖ
2,teams,Team information,30,14,‚úÖ
3,ranking,Team standings,210342,13,‚úÖ
4,players,Player-season mapping,3922,8,‚úÖ
5,seasons_stats,Player advanced stats,24691,53,‚úÖ



üí∞ Betting Data Files: 4
   ‚úÖ nba_betting_money_line.csv: 125,286 rows
   ‚úÖ nba_betting_spread.csv: 131,690 rows
   ‚úÖ nba_betting_totals.csv: 131,386 rows
   ‚úÖ nba_games_all.csv: 125,624 rows
üéØ KEY INSIGHTS & ISSUES

üìä INSIGHTS:
   ‚úÖ 26,651 NBA games from 2003-10-05 to 2022-12-22
   ‚úÖ 2,687 unique players tracked
   ‚úÖ 30 teams in dataset
   ‚úÖ Betting odds available for analysis

‚ö†Ô∏è  ISSUES TO ADDRESS:
   ‚ö†Ô∏è  1,188 missing values in games table
   ‚ö†Ô∏è  109,690 player-game records with no minutes (DNP)
   ‚ö†Ô∏è  29 duplicate game records

‚úÖ PHASE 1: DATA QUALITY ASSESSMENT COMPLETE!

üìä WHAT YOU HAVE:
   ‚Ä¢ 26,651 NBA games
   ‚Ä¢ 668,628 player-game records
   ‚Ä¢ 30 teams
   ‚Ä¢ 24,691 player-season statistics
   ‚Ä¢ 4 betting data files

üéØ NEXT PHASE: Data Cleaning
   We'll fix:
   1. Missing values (decide: drop or fill?)
   2. Duplicate records
   3. Data format issues
   4. Merge betting odds with games
   5. Create master dataset for mo

In [None]:
import os
print(os.getcwd())

import os
os.makedirs("data/raw", exist_ok=True)
print("Folder created successfully")


import os
print(os.path.exists("data/raw"))
