In [3]:
import pandas as pd
import os

def clean_seasons_stats_csv(input_filepath: str, output_filepath: str) -> None:
    """
    Clean the Seasons_Stats.csv file and write the result to a new CSV.

    The placeholder columns 'blanl' and 'blank2' are dropped when present.
    Every column from the integer list that exists in the file is parsed as
    numeric (unparseable values become missing) and cast to pandas' nullable
    'Int64' dtype, so whole numbers are written as '1950' instead of '1950.0'.

    A column that still contains fractional values after parsing cannot be
    cast to an integer dtype safely. Such a column is left unchanged and
    reported, so one bad column does not prevent the cleaned file from being
    written.

    Errors are reported on stdout instead of being raised.

    Args:
        input_filepath (str): The path to the original Seasons_Stats.csv file.
        output_filepath (str): The path where the cleaned CSV will be saved.
    """
    print(f"Loading data from: {input_filepath}")

    # Only the read step is guarded by FileNotFoundError, so a failure while
    # writing the output can never be reported as a missing *input* file.
    try:
        df = pd.read_csv(input_filepath)
    except FileNotFoundError:
        print(f"Error: The file at {input_filepath} was not found.")
        return
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return

    try:
        print("Original columns:", df.columns.tolist())

        # Drop the unnecessary placeholder columns ('blanl' is the spelling
        # used in the dataset header).
        columns_to_drop = ['blanl', 'blank2']
        existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]

        if existing_columns_to_drop:
            df.drop(existing_columns_to_drop, axis=1, inplace=True)
            print(f"Dropped columns: {existing_columns_to_drop}")
        else:
            print("No columns to drop. The file may already be clean.")

        # List of columns that should be integers.
        # This fixes the "invalid input syntax for type integer: '1950.0'" error.
        int_columns = [
            'row_id', 'Year', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', '3P', '3PA',
            '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
            'BLK', 'TOV', 'PF', 'PTS', 'player_index', 'year_start',
            'year_end', 'weight', 'born'
        ]

        for col in int_columns:
            if col not in df.columns:
                continue

            # Replace any non-numeric values with NaN before converting.
            numeric_values = pd.to_numeric(df[col], errors='coerce')

            # 'Int64' (capital I) is the nullable integer dtype, so NaN values
            # survive the cast. pandas refuses the cast when a value has a
            # fractional part; keep the original column in that case.
            try:
                df[col] = numeric_values.astype('Int64')
            except (TypeError, ValueError):
                print(f"Skipped column '{col}': it contains non-integer values.")
                continue

            print(f"Converted column '{col}' to integer type.")

        # Display the new, clean columns.
        print("Cleaned columns:", df.columns.tolist())

        # Save the cleaned DataFrame to a new CSV file.
        df.to_csv(output_filepath, index=False)
        print(f"Successfully saved cleaned data to: {output_filepath}")
    except Exception as e:
        # Best-effort script: report the problem instead of raising.
        print(f"An unexpected error occurred: {e}")

# --- Main script execution ---

# Location of the raw Seasons_Stats.csv export to be cleaned.
input_file = r"C:\Users\sc\Desktop\SQL Projects\Project - 5\Seasons_Stats.csv"

# Location where the cleaned copy is written (same folder, new file name).
output_file = r"C:\Users\sc\Desktop\SQL Projects\Project - 5\Seasons_Stats_cleaned.csv"

# Run the cleaning step on the paths defined above.
clean_seasons_stats_csv(
    input_filepath=input_file,
    output_filepath=output_file,
)


Loading data from: C:\Users\sc\Desktop\SQL Projects\Project - 5\Seasons_Stats.csv
Original columns: ['Unnamed: 0', 'Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2', 'OBPM', 'DBPM', 'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
Dropped columns: ['blanl', 'blank2']
Converted column 'Year' to integer type.
Converted column 'Age' to integer type.
Converted column 'G' to integer type.
Converted column 'GS' to integer type.
Converted column 'MP' to integer type.
Converted column 'FG' to integer type.
Converted column 'FGA' to integer type.
Converted column '3P' to integer type.
Converted column '3PA' to integer type.
Converted column '2P' to integer type.
Converted column '2PA' to integer type.
Converted column 'FT' to integer type