# Load Dataset

In [None]:
import os
import pandas as pd

# Make the Drive datasets folder the working directory, then read the NBA
# CSV from it by its bare file name.
DATASETS_DIR = '/content/drive/MyDrive/Colab/Datasets/'
os.chdir(DATASETS_DIR)
df = pd.read_csv('nba.csv')

## Data Preprocessing

In [None]:
# Clean the loaded frame in one pass: discard every row that has at least one
# missing value, then discard rows that exactly repeat an earlier row.
# (To inspect missing values per column beforehand, run `df.isnull().sum()`.)
df = df.dropna(how='any', axis=0).drop_duplicates()

# 80/20 split: draw 80% of the rows for training with a fixed seed so the
# draw is repeatable; every row whose label was not drawn becomes test data.
df_train = df.sample(frac=0.8, random_state=10)
in_training = df.index.isin(df_train.index)
df_test = df.loc[~in_training]

# Preview the first five held-out rows (swap in `df_train` to preview training rows)
df_test.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
11,Isaiah Thomas,Boston Celtics,4.0,PG,27.0,5-9,185.0,Washington,6912869.0
14,Tyler Zeller,Boston Celtics,44.0,C,26.0,7-0,253.0,North Carolina,2616975.0
17,Wayne Ellington,Brooklyn Nets,21.0,SG,28.0,6-4,200.0,North Carolina,1500000.0
19,Jarrett Jack,Brooklyn Nets,2.0,PG,32.0,6-3,200.0,Georgia Tech,6300000.0


### Statistical Description


In [None]:
# Summary statistics for the training and test frames, printed one after the other
train_summary = df_train.describe()
test_summary = df_test.describe()
print(f"{train_summary}\n {test_summary}")

           Number         Age      Weight        Salary
count  291.000000  291.000000  291.000000  2.910000e+02
mean    17.054983   26.635739  219.883162  4.705779e+06
std     15.540506    4.205656   24.722880  5.172349e+06
min      0.000000   19.000000  165.000000  5.572200e+04
25%      5.000000   24.000000  200.000000  1.000000e+06
50%     12.000000   26.000000  220.000000  2.637720e+06
75%     25.000000   29.000000  240.000000  6.269338e+06
max     99.000000   40.000000  279.000000  2.287500e+07
           Number        Age      Weight        Salary
count  73.000000  73.000000   73.000000  7.300000e+01
mean   15.931507  26.534247  219.397260  4.279611e+06
std    12.638487   4.371887   25.239816  4.924068e+06
min     0.000000  20.000000  161.000000  1.111960e+05
25%     6.000000  23.000000  200.000000  1.074169e+06
50%    13.000000  26.000000  219.000000  2.239800e+06
75%    21.000000  29.000000  239.000000  5.675000e+06
max    50.000000  40.000000  275.000000  2.009306e+07




*   Count: The number of non-missing values in each column; here it shows that the training and test data contain 291 and 73 rows respectively.
*   Mean: The average value of each numeric column.


*   Std: The standard deviation, which measures the dispersion of the data points around the mean.
*   Min: Displays the minimum value across each column


*   25%, 50% and 75%: These are the quartiles of the data set; 50% is the median, while 25% and 75% are the 1st and 3rd quartiles respectively.
*   Max: Shows the maximum value across each column







## Correlation Analysis

In [None]:
def _team_salaries(frame, team):
    """Return the 'Salary' values of the rows of `frame` whose 'Team' equals `team`.

    Rows and column are selected in a single `.loc` call instead of the
    chained `frame.loc[mask]['Salary']` form. The original row labels are kept.
    """
    return frame.loc[frame['Team'] == team, 'Salary']


def _as_salary_column(salaries, column_name):
    """Wrap `salaries` in a one-column DataFrame whose column is `column_name`.

    The index is replaced by a fresh 0..n-1 range so that the frames built
    for different teams can be lined up side by side by row position.
    """
    return pd.DataFrame({column_name: salaries}).reset_index(drop=True)


# Find Teams: training-set salaries for each of the three teams
df_boston = _team_salaries(df_train, 'Boston Celtics')
df_brooklyn = _team_salaries(df_train, 'Brooklyn Nets')
df_newyork = _team_salaries(df_train, 'New York Knicks')

# Rename Columns and Reset Indexes: one position-indexed column per team
df_boston_salaries = _as_salary_column(df_boston, "Boston Salaries")
df_brooklyn_salaries = _as_salary_column(df_brooklyn, "Brooklyn Salaries")
df_newyork_salaries = _as_salary_column(df_newyork, "New York Salaries")

# Concatenate the three columns side by side; rows are matched on the fresh
# 0..n-1 index, i.e. purely by position within each team's list.
df_salaries = pd.concat(
    [df_boston_salaries, df_brooklyn_salaries, df_newyork_salaries],
    axis=1,
)

# Drop Duplicates if any (reassigned rather than mutated in place, so
# re-running the cell always starts from the freshly concatenated frame)
df_salaries = df_salaries.drop_duplicates()

# Correlation Analysis
# NOTE(review): the values in one row belong to unrelated players; they share
# a row only because they occupy the same position after the index reset, and
# that position comes from the row order of the sampled `df_train`. The
# coefficients below therefore depend on row order - confirm that this
# positional pairing is the intended comparison before interpreting them.
print("\n Correlation Analysis between the three teams salaries")
df_salaries.corr()


 Correlation Analysis between the three teams salaries


Unnamed: 0,Boston Salaries,Brooklyn Salaries,New York Salaries
Boston Salaries,1.0,0.548442,0.733512
Brooklyn Salaries,0.548442,1.0,0.945886
New York Salaries,0.733512,0.945886,1.0
