### Set up the Notebook

In [None]:
# Import Necessary Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Import Data Visualizations Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)


- Note: I've picked the dataset `/kaggle/input/draft-picks-from-each-university`. It has 10 CSV files, one per university, and I'll merge them all together into a single DataFrame. (The cell below currently lists 9 of those files.)

In [None]:
# CSV exports to merge, one per university
file_paths = [
    "/kaggle/input/draft-picks-from-each-university/Draft_Picks_From_Duke.csv",
    "/kaggle/input/draft-picks-from-each-university/Draft_Picks_From_Kentucky.csv",
    "/kaggle/input/draft-picks-from-each-university/Draft_Picks_From_Okstate.csv",
    "/kaggle/input/draft-picks-from-each-university/Draft_Picks_From_arizona.csv",
    "/kaggle/input/draft-picks-from-each-university/Draft_Picks_From_georgia.csv",
    "/kaggle/input/draft-picks-from-each-university/Draft_Picks_From_kansas.csv",
    "/kaggle/input/draft-picks-from-each-university/Draft_Picks_From_lsu.csv",
    "/kaggle/input/draft-picks-from-each-university/Draft_Picks_From_memphis.csv",
    "/kaggle/input/draft-picks-from-each-university/Draft_Picks_From_washington.csv",
]

# Read each CSV into its own DataFrame, in the same order as file_paths
dataframes = [pd.read_csv(csv_path) for csv_path in file_paths]

# Stack the per-file frames into one table; ignore_index=True renumbers rows 0..n-1
data = pd.concat(dataframes, ignore_index=True)

# Now, 'data' contains the merged data from all the CSV files


#### Initial Inspection of the DataFrame

In [None]:
# Preview the first 10 rows of the merged data.
# The label is printed explicitly; the table itself is rendered by the notebook
# because `data.head(10)` is the last expression in the cell.
print("First 10 rows")
data.head(10)

In [None]:
# List the column labels of the merged DataFrame
data.columns

In [None]:
# Display the last rows (tail() with no argument returns the final 5)
data.tail()

In [None]:
# Report the dimensions of the merged data
num_rows = len(data.index)
num_columns = len(data.columns)
print(f"Number of Rows: {num_rows}")
print(f"Number of Columns: {num_columns}")

In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts, dtypes and memory usage
data.info()

#### Summary Statistics

In [None]:
# Summary statistics for every column; include="all" adds the non-numeric columns
# (count / unique / top / freq) alongside the numeric ones.
data.describe(include="all")

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns only,
# which is describe()'s default selection.
data.describe()

#### Calculate the percentage of Missing values in the DataFrame

In [None]:
# Per-column count of missing entries, and the row count they are measured against
missing_values = data.isnull().sum()
total_values = len(data.index)

# Share of rows that are missing in each column, expressed as a percentage
percentage_missing = missing_values.div(total_values).mul(100)

# One row per column, ordered from the most to the least incomplete
missing_data_summary = pd.DataFrame({
    'Column': missing_values.index,
    'Missing Values': missing_values.to_numpy(),
    'Percentage Missing': percentage_missing.to_numpy(),
}).sort_values(by='Percentage Missing', ascending=False)

print(missing_data_summary)


#### Total Percentage of Missing Values in the DataFrame

In [None]:
# Number of cells in the whole DataFrame (rows x columns)
total_values = data.size

# Number of those cells that are missing, summed over every column
per_column_missing = data.isnull().sum()
total_missing = per_column_missing.sum()

# Overall share of missing cells, as a percentage
total_percentage_missing = total_missing / total_values * 100

print(f"Total Percentage of Missing Values: {total_percentage_missing:.2f}%")

#### Count Missing values

In [None]:
# Number of missing entries in each column
data.isnull().sum()

In [None]:
# Total number of missing entries across the whole DataFrame
# (the first sum() is per column, the second adds those column totals together)
data.isnull().sum().sum()

In [None]:
# Boolean mask with the same shape as `data`: True where a value is present, False where it is missing
data.notnull()

#### Data Cleaning

In [None]:
# Replace missing values with zeros, except in percentage columns.
# A missing percentage (column name ending in '%', e.g. 'FG%') is left as NaN so
# that mean()/describe() skip it instead of averaging it in as a 0% value.
zero_fill_columns = [col for col in data.columns if not str(col).endswith('%')]
data[zero_fill_columns] = data[zero_fill_columns].fillna(0)

#### Unique Values

In [None]:
# Count how many rows each distinct 'Player' value appears in (most frequent first);
# a count above 1 means the same name occurs on several rows.
data['Player'].value_counts()

In [None]:
# Correlation between the 'MP' and 'FGA' columns (Series.corr defaults to Pearson)
data['MP'].corr(data['FGA'])

In [None]:
# Histogram of the 'FGA' column (pandas' default of 10 bins)
data['FGA'].hist()

In [None]:
# Box plot of the 'PTS' column to visualize its distribution and spot outliers
data['PTS'].plot(kind='box')

#### Duplicates

In [None]:
# Boolean Series marking rows that exactly repeat an earlier row
# (the first occurrence of each row is marked False).
data.duplicated()

In [None]:
# Show only the rows flagged as exact repeats of an earlier row
data[data.duplicated()]

#### Career Statistics:

- Who has the highest career points (PTS)?
 

In [None]:
# Highest value in the 'PTS' column, computed once and reused for the lookup
max_career_pts = data['PTS'].max()

# Names on the rows that reach that maximum; the first one is reported
players_at_max = data.loc[data['PTS'] == max_career_pts, 'Player']
max_career_pts_player = players_at_max.iloc[0]

print(f"{max_career_pts_player} has the highest career points with {max_career_pts} points.")

#### Efficiency Metrics:


- What is the average field goal percentage (FG%) for all players?


In [None]:
# Mean of the 'FG%' column across all rows.
# The `.2%` format multiplies by 100, so this assumes FG% is stored as a
# fraction (e.g. 0.45 rather than 45) — TODO confirm against the data.
avg_fg_percentage = data['FG%'].mean()
print(f"Average FG% for all players: {avg_fg_percentage:.2%}")

#### Top Performers:


- Who are the top 5 players with the highest points per game (PTS/G)?

In [None]:
# Keep only the name and points-per-game columns, then take the five highest PTS/G rows
top_pts_per_game_players = data[['Player', 'PTS/G']].nlargest(5, 'PTS/G')
print("Top 5 Players by PTS/G:")
print(top_pts_per_game_players)

#### Distribution of Statistics:


- What is the distribution of points (PTS) across all players?


In [None]:
import matplotlib.pyplot as plt

# Draw on the current axes (created on demand), using 20 bins for the 'PTS' column
ax = plt.gca()
ax.hist(data['PTS'], bins=20)
ax.set_xlabel('PTS')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Points (PTS)')
plt.show()

#### Player Comparison:


- Compare the career statistics of two specific players (the cell below uses 'Elton Brand' and 'Grant Hill *'; replace them with any other names, written exactly as they appear in the `Player` column):

In [None]:
# Select the rows for the two players being compared. The match is an exact
# string comparison against the 'Player' column, so each name must be written
# exactly as it appears in the data (including any trailing marker such as ' *').
player1_stats = data[data['Player'] == 'Elton Brand']
player2_stats = data[data['Player'] == 'Grant Hill *']

# Stack both selections so the notebook renders them together for comparison
pd.concat([player1_stats, player2_stats])

#### Player Career Duration:

- Calculate the career duration for each player (From - To) and analyze the distribution.

In [None]:
# Career length as the difference between the 'To' and 'From' columns,
# stored on the DataFrame as a new 'Career Duration' column
data['Career Duration'] = data['To'].sub(data['From'])

# How many rows fall on each distinct duration value (most common first)
career_duration_distribution = data['Career Duration'].value_counts()
print(career_duration_distribution)

#### Data Visualization!

In [None]:
 # Interactive scatter plot to visualize the relationship between 'PTS' and 'AST'
fig1 = px.scatter(
    data_frame=data,
    x='PTS',
    y='AST',
    title='Scatter Plot of PTS vs. AST',
)
fig1.show()

In [None]:
# Interactive histogram showing how the 'PTS' values are distributed
fig2 = px.histogram(
    data_frame=data,
    x='PTS',
    title='Histogram of PTS',
)
fig2.show()

In [None]:
# Interactive grid of pairwise scatter plots for the selected stat columns
matrix_columns = ['PTS', 'AST', 'TRB', 'STL']
fig5 = px.scatter_matrix(
    data_frame=data,
    dimensions=matrix_columns,
    title='Scatter Matrix',
)
fig5.show()

In [None]:
# Same pairwise scatter grid, with points colored by the 'Year' column
matrix_columns_by_year = ['PTS', 'AST', 'TRB', 'STL']
fig7 = px.scatter_matrix(
    data_frame=data,
    dimensions=matrix_columns_by_year,
    color='Year',
    title='Scatter Matrix by Year',
)
fig7.show()

In [None]:
# Interactive bar chart of the 10 rows with the highest 'PTS', one color per player
import plotly.express as px

top_10_pts_players = data[['Player', 'PTS']].nlargest(10, 'PTS')
fig8 = px.bar(
    data_frame=top_10_pts_players,
    x='Player',
    y='PTS',
    title='Top 10 Players by PTS',
    color='Player',
)
fig8.show()


In [None]:
import plotly.express as px

# Interactive bar chart of the 10 rows with the highest 'AST', one color per player
top_10_ast_players = data[['Player', 'AST']].nlargest(10, 'AST')
fig13 = px.bar(
    data_frame=top_10_ast_players,
    x='Player',
    y='AST',
    title='Top 10 Players by AST',
    color='Player',
)
fig13.show()
