In [1]:
# Importing the dataset (Kaggle source) from our computer.
import pandas as pd
df = pd.read_csv('nba-stats-salary-rating.csv')

In [2]:
# Dataset pre-processing step. We keep only the varibales which contain important information for our analysis.
df['Salaries'] = df['Salaries'].str.replace('$','')
df['Salaries'] = df['Salaries'].str.replace(',','')
df['Salaries'] = pd.to_numeric(df['Salaries'])
df.drop(['Unnamed: 0', 'Player', 'Tm', 'G', 'GS', 'ORB', 'DRB', 'FG', 'FGA', 'Pos', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'eFG%'], axis = 1, inplace = True)
df.rename(columns = {'MP':'Minutes_Played', 'FG%':'Fieldgoal_Percentage', '3P%':'Threepoint_Percentage', '2P%':'Twopoint_Percentage', 'FT%':'Freethrow_Percentage', 'TRB':'Total_Rebounds', 'AST':'Asists', 'STL':'Steals', 'BLK':'Blocks', 'TOV':'Turnovers', 'PF':'Personal_Fouls', 'PTS':'Points'}, inplace = True)
df['Fieldgoal_Percentage'] = df['Fieldgoal_Percentage']*100
df['Threepoint_Percentage'] = df['Threepoint_Percentage']*100
df['Twopoint_Percentage'] = df['Twopoint_Percentage']*100
df['Freethrow_Percentage'] = df['Freethrow_Percentage']*100
df

Unnamed: 0,Ratings,Salaries,Age,Minutes_Played,Fieldgoal_Percentage,Threepoint_Percentage,Twopoint_Percentage,Freethrow_Percentage,Total_Rebounds,Asists,Steals,Blocks,Turnovers,Personal_Fouls,Points
0,97,37436858,35,34.9,49.8,34.9,57.0,69.7,7.9,10.6,1.2,0.5,4.0,1.8,25.7
1,97,32742000,28,32.2,46.9,36.6,51.0,88.9,7.3,5.0,1.8,0.6,2.7,1.9,26.9
2,96,25842697,25,30.9,54.7,30.6,62.2,63.3,13.7,5.8,1.0,1.0,3.7,3.0,29.6
3,96,38199000,30,36.7,43.5,35.2,53.8,86.1,6.4,7.4,1.7,0.9,4.5,3.4,34.4
4,95,40231758,31,27.8,40.2,24.5,63.6,100.0,5.2,6.6,1.0,0.4,3.2,2.2,20.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440,68,522738,24,12.6,40.0,29.9,48.5,93.3,2.1,1.5,0.4,0.1,0.9,1.3,6.1
441,68,79568,25,8.3,36.4,25.0,66.7,66.7,0.4,0.1,0.3,0.0,0.3,1.6,1.8
442,68,79568,22,1.7,,,,,0.3,0.3,0.0,0.0,0.0,0.0,0.0
443,68,898310,19,6.5,40.0,0.0,66.7,100.0,0.0,0.5,0.0,0.0,0.3,0.5,2.5


In [3]:
# We can see the data type of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Ratings                445 non-null    int64  
 1   Salaries               445 non-null    int64  
 2   Age                    445 non-null    int64  
 3   Minutes_Played         445 non-null    float64
 4   Fieldgoal_Percentage   444 non-null    float64
 5   Threepoint_Percentage  425 non-null    float64
 6   Twopoint_Percentage    440 non-null    float64
 7   Freethrow_Percentage   437 non-null    float64
 8   Total_Rebounds         445 non-null    float64
 9   Asists                 445 non-null    float64
 10  Steals                 445 non-null    float64
 11  Blocks                 445 non-null    float64
 12  Turnovers              445 non-null    float64
 13  Personal_Fouls         445 non-null    float64
 14  Points                 445 non-null    float64
dtypes: flo

In [4]:
# Handling missing values step.
df.isnull().sum()

Ratings                   0
Salaries                  0
Age                       0
Minutes_Played            0
Fieldgoal_Percentage      1
Threepoint_Percentage    20
Twopoint_Percentage       5
Freethrow_Percentage      8
Total_Rebounds            0
Asists                    0
Steals                    0
Blocks                    0
Turnovers                 0
Personal_Fouls            0
Points                    0
dtype: int64

In [5]:
# Fill the missing values with the mean of each column.
df['Fieldgoal_Percentage'].fillna(int(df['Fieldgoal_Percentage'].mean()), inplace = True)
df['Threepoint_Percentage'].fillna(int(df['Threepoint_Percentage'].mean()), inplace = True)
df['Twopoint_Percentage'].fillna(int(df['Twopoint_Percentage'].mean()), inplace = True)
df['Freethrow_Percentage'].fillna(int(df['Freethrow_Percentage'].mean()), inplace = True)

In [6]:
# Indeed there are no missing values now.
df.isnull().sum()

Ratings                  0
Salaries                 0
Age                      0
Minutes_Played           0
Fieldgoal_Percentage     0
Threepoint_Percentage    0
Twopoint_Percentage      0
Freethrow_Percentage     0
Total_Rebounds           0
Asists                   0
Steals                   0
Blocks                   0
Turnovers                0
Personal_Fouls           0
Points                   0
dtype: int64

In [7]:
# Splitting our dataset into train and test set in order to perfrom the Machine Learning model.
from sklearn.model_selection import train_test_split
X = df.drop('Salaries', axis = 1)
y = df.iloc[:, 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .8)

In [8]:
# We utilize a simple linear regression model
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.predict(X_test)

array([ 1758356.95916533, 12575635.46604124,  -134499.97848286,
        8226981.45375681, -2461982.52230784, 15925591.03062804,
       10224163.76179139,  3366232.94944562,  7581213.85396183,
       12962900.30026697,  7647406.02905659, 13199227.08827509,
        8691173.45685692, 12632269.62385233, -1179618.73801902,
       18672276.99618761, 13622838.0875452 ,  2308649.86353765,
       -2214833.11001204,  5105602.55294898,  5058144.42920403,
        7862697.34501329, 10438549.875103  , -1399107.82405333,
          84174.0825462 , 12275431.66064784, 18533154.69019945,
        -387945.44150293,  8974493.2334785 ,  5809938.09754315,
        8433465.53759043,  -230512.96320127, 11985939.47052383,
        8962654.41636229,  7548710.10186614,  6584987.60455705,
       15009866.32879741,  6076361.14283407,  5730940.77535066,
        8534203.18861747,  6583541.1275066 ,  6170897.52757628,
       14749014.22672088,  8574817.45160988, 13029171.6560102 ,
       -1518652.19372076, 12882252.18637

In [9]:
# Save the model to disk.
import pickle
pickle.dump(reg, open('model.pkl', 'wb'))