In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## Read in Salary data

In [None]:
# Root of the baseballdatabank checkout. Set the BASEBALL_DATA_DIR environment
# variable to run this notebook on another machine; the default keeps the
# location the notebook was originally written against.
data_root = Path(os.environ.get(
    "BASEBALL_DATA_DIR",
    "C:/Users/arist/Documents/CS6830/baseballdatabank-2022.2/baseballdatabank-2022.2",
))

# Salary table (contrib/Salaries.csv).
df1 = pd.read_csv(data_root / "contrib" / "Salaries.csv")
df1

## Read in Batting data

In [None]:
# Root of the baseballdatabank checkout. Set the BASEBALL_DATA_DIR environment
# variable to run this notebook on another machine; the default keeps the
# location the notebook was originally written against.
data_root = Path(os.environ.get(
    "BASEBALL_DATA_DIR",
    "C:/Users/arist/Documents/CS6830/baseballdatabank-2022.2/baseballdatabank-2022.2",
))

# Batting table (core/Batting.csv).
df2 = pd.read_csv(data_root / "core" / "Batting.csv")
df2

## Merge data on playerID

In [None]:
# Both inputs carry a yearID column (the merge produces yearID_x / yearID_y),
# so joining on playerID alone pairs every salary year of a player with every
# batting season of that player. Keep only the rows where the two years agree
# so each salary is compared with the batting line from the same season.
# The join key stays playerID so the suffixed columns used by later cells
# (yearID_x = left/salary frame, yearID_y = right/batting frame) still exist.
df3 = pd.merge(df1, df2, on="playerID")
df3 = df3[df3["yearID_x"] == df3["yearID_y"]]
df3

## limit to recent years (2012-2016 based on salary yearID)

In [None]:
# Keep rows whose left-hand (salary) year is later than 2011, then list the
# available columns for reference.
is_recent = df3["yearID_x"] > 2011
temp = df3[is_recent]
temp.columns

## Limit dataset to rows with more than 10 games played (G > 10)

In [None]:
# Discard rows with 10 or fewer games played.
temp = temp[temp["G"] > 10]

## visualize salary distributions

In [None]:
# Distribution of the salary column, grouped into 15 bins.
sns.histplot(
    data=temp,
    x="salary",
    bins=15,
)

## Select features for regressions

In [None]:
# Batting columns used as predictors in the regressions below; the order here
# is the order of the coefficients / importances reported by the models.
feature_cols = [
    "G", "AB", "R", "H",
    "2B", "3B", "HR", "RBI",
    "SB", "CS", "BB", "SO",
    "IBB", "HBP", "SH", "SF",
    "GIDP",
]

## Setup for regressions

In [None]:
# The models below are fit before the notebook's later dropna() cell, and
# LinearRegression raises on NaN input, so drop rows with a missing predictor
# or target here. Only the modelling inputs are affected; `temp` is unchanged.
model_rows = temp.dropna(subset=feature_cols + ["salary"])

# Predictor matrix (one column per entry in feature_cols) and salary target.
X = model_rows.loc[:, feature_cols]
y = model_rows["salary"]

# Preview the target instead of printing the whole Series.
y.head()


## Decision tree regressor

In [None]:
from sklearn import tree

# Fit an unrestricted regression tree of salary on the batting features.
# random_state is fixed so that re-running the notebook yields the same tree
# (scikit-learn permutes the candidate features at each split).
clf = tree.DecisionTreeRegressor(random_state=0)
clf = clf.fit(X, y)

In [None]:
# The tree was grown without a depth limit, so drawing every node is slow and
# unreadable. Show only the top three levels (the same cut-off used for the
# graphviz export) and label the splits with the feature names.
fig, ax = plt.subplots(figsize=(16, 8))
tree.plot_tree(
    clf,
    max_depth=3,
    feature_names=feature_cols,
    filled=True,
    rounded=True,
    ax=ax,
)
plt.show()

In [None]:
import graphviz

# Export the top three levels of the fitted tree as DOT source (returned as a
# string because out_file is None), labelling nodes with the feature names.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    max_depth=3,
    feature_names=feature_cols,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Wrap the DOT source and render it to disk under the name "salary_batting".
graph = graphviz.Source(dot_data)
graph.render("salary_batting")

## Linear regression

In [None]:
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares of salary on all batting features and treat the
# fitted coefficients as a rough importance score.
# NOTE(review): the features are not standardised here, so coefficient sizes
# depend on each column's scale — confirm this is the intended comparison.
model = LinearRegression()
model.fit(X, y)
importance = model.coef_

# Summarize feature importance, labelled by feature name rather than by
# position so the printout can be read without looking up feature_cols.
for name, score in zip(feature_cols, importance):
    print(f"Feature: {name}, Score: {score:.5f}")

# Plot feature importance.
fig, ax = plt.subplots()
positions = list(range(len(importance)))
p1 = ax.bar(positions, importance)
ax.set_ylabel('Importance')
ax.set_title('Linear regression of Salary on batting stats')
ax.set_xticks(positions, labels=feature_cols)

# plt.show() works with the notebook's inline backend; fig.show() expects a
# GUI backend and only emits a warning there.
plt.show()

In [None]:
# NOTE(review): disabled experiment, kept for reference only. scikit-learn's
# LogisticRegression is a classifier, while the other models in this notebook
# treat salary as a regression target — presumably why this cell was
# abandoned. Confirm before re-enabling, or delete the cell.
# from sklearn.linear_model import LogisticRegression

# model = LogisticRegression()
# # fit the model
# model.fit(X, y)
# # get importance
# importance = model.coef_[0]
# # summarize feature importance
# for i,v in enumerate(importance):
#     print('Feature: %0d, Score: %.5f' % (i,v))
# # plot feature importance
# pyplot.bar([x for x in range(len(importance))], importance)
# pyplot.show()

## Random Forest regression

In [None]:

from sklearn.ensemble import RandomForestRegressor

# Define the model. The seed is fixed because a random forest is stochastic
# (bootstrap samples and feature sub-sampling); without it the importances
# change on every run.
model = RandomForestRegressor(random_state=0)
# Fit the model.
model.fit(X, y)
# Importances as reported by the fitted forest (one value per feature).
importance = model.feature_importances_

# Summarize feature importance, labelled by feature name rather than by
# position so the printout can be read without looking up feature_cols.
for name, score in zip(feature_cols, importance):
    print(f"Feature: {name}, Score: {score:.5f}")

# Plot feature importance.
fig, ax = plt.subplots()
positions = list(range(len(importance)))
p1 = ax.bar(positions, importance)
ax.set_ylabel('Importance')
ax.set_title('Random Forest regression of Salary on batting stats')
ax.set_xticks(positions, labels=feature_cols)

# plt.show() works with the notebook's inline backend; fig.show() expects a
# GUI backend and only emits a warning there.
plt.show()

In [None]:
# Root of the baseballdatabank checkout. Set the BASEBALL_DATA_DIR environment
# variable to run this notebook on another machine; the default keeps the
# location the notebook was originally written against.
data_root = Path(os.environ.get(
    "BASEBALL_DATA_DIR",
    "C:/Users/arist/Documents/CS6830/baseballdatabank-2022.2/baseballdatabank-2022.2",
))

# NOTE(review): the variable is called `people` but the file loaded is
# core/BattingPost.csv — confirm which table was intended. The name is kept
# so any cell that refers to it keeps working.
people = pd.read_csv(data_root / "core" / "BattingPost.csv")
people.columns

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Only the last expression of a cell is displayed, so the original bare
# `temp[...]` lookup was silently discarded. Report how many rows contain a
# missing value before removing them.
rows_with_na = temp[temp.isna().any(axis=1)]
print(f"Dropping {len(rows_with_na)} of {len(temp)} rows with missing values")
temp = temp.dropna()

## Scatterplot distribution of Salary vs. RBI's

In [None]:
# Plot a 10% random sample to keep the scatter readable. The sample is seeded
# so the figure is identical on every run.
sns.scatterplot(
    x='RBI',
    y='salary',
    data=temp.sample(frac=0.1, random_state=0),
    s=4,
)

## Lineplot distribution of Salary vs. RBI's

In [None]:
# Salary as a function of RBI, drawn with seaborn's default line plot settings.
sns.lineplot(
    data=temp,
    x="RBI",
    y="salary",
)

## Lineplot distribution of Salary vs. Games played

In [None]:
# Salary as a function of games played (G), drawn with seaborn's default line
# plot settings.
sns.lineplot(
    data=temp,
    x="G",
    y="salary",
)

## Lineplot distribution of Salary vs. Hits

In [None]:
# Salary as a function of hits (H), drawn with seaborn's default line plot
# settings.
sns.lineplot(
    data=temp,
    x="H",
    y="salary",
)

## Limit the data frame to rows with more than 10 RBIs in a season

In [None]:
# Discard rows with 10 or fewer RBI.
temp = temp[temp["RBI"] > 10]

## Histogram of RBI's after limiting

In [None]:
import matplotlib as mpl

sns.set_theme(style="ticks")

f, ax = plt.subplots(figsize=(7, 5))
sns.despine(f)

# Histogram of RBI on the axes created above. The original call also passed
# multiple="stack" and palette="light:m_r"; both only apply when a `hue`
# variable is given, and seaborn warns that the palette is being ignored, so
# they are omitted here. The bars drawn are unchanged.
sns.histplot(
    temp,
    x="RBI",
    edgecolor=".3",
    linewidth=.5,
    ax=ax,
)
# Plain numeric tick labels on the x axis.
ax.xaxis.set_major_formatter(mpl.ticker.ScalarFormatter())

## Lineplot of salary vs RBI's after limiting

In [None]:
# Salary as a function of RBI again, now on the RBI-filtered frame.
sns.lineplot(
    data=temp,
    x="RBI",
    y="salary",
)

## Linear Regression of Salary on RBI's, for prediction of salary

In [None]:
# Single-predictor setup: `data` stays a one-column DataFrame (double
# brackets) because scikit-learn expects a 2-D feature matrix; `target` is the
# salary Series.
feature_name, target_name = "RBI", "salary"
data = temp[[feature_name]]
target = temp[target_name]

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares of the target on the single selected feature.
linear_regression = LinearRegression()
linear_regression.fit(X=data, y=target)

In [None]:
# The model was fit on one feature, so coef_ holds exactly one value: the
# change in predicted salary per additional RBI.
[RBI_salary_slope] = linear_regression.coef_
RBI_salary_slope

In [None]:
# Constant term of the fitted line (predicted salary when the feature is 0).
intercept = linear_regression.intercept_
intercept

In [None]:
import numpy as np

# 300 evenly spaced RBI values spanning the observed range, and the salary
# the fitted line predicts at each of them.
RBI_range = np.linspace(start=data.min(), stop=data.max(), num=300)
predicted_salary = intercept + RBI_range * RBI_salary_slope

In [None]:
# Overlay the fitted line on a 10% random sample of the observations. The
# sample is seeded so the figure is identical on every run.
sns.scatterplot(
    x='RBI',
    y='salary',
    data=temp.sample(frac=0.1, random_state=0),
    s=4,
    alpha=0.5,
)
plt.plot(RBI_range, predicted_salary)
_ = plt.title("Linear Regression of salary by RBI")

## linear regression of salary on hits

In [None]:
# Simple linear regression of salary on a single batting stat, plotted over a
# sample of the observations. Only `feature_name` needs to change to reuse
# this cell for another column.
feature_name = "H"
target_name = "salary"
data, target = temp[[feature_name]], temp[target_name]

linear_regression = LinearRegression()
linear_regression.fit(data, target)
slope = linear_regression.coef_[0]
intercept = linear_regression.intercept_

# Fitted line evaluated on 300 evenly spaced values across the observed range.
_range = np.linspace(data.min(), data.max(), num=300)
predicted_salary = (slope * _range + intercept)

# The 10% sample is seeded so the figure is identical on every run.
sns.scatterplot(
    x=feature_name,
    y=target_name,
    data=temp.sample(frac=0.1, random_state=0),
    s=4,
    alpha=0.5,
)
plt.plot(_range, predicted_salary)
_ = plt.title(f"Linear Regression of salary by {feature_name}")

## linear regression of salary on Homeruns

In [None]:
# Simple linear regression of salary on a single batting stat, plotted over a
# sample of the observations. Only `feature_name` needs to change to reuse
# this cell for another column.
feature_name = "HR"
target_name = "salary"
data, target = temp[[feature_name]], temp[target_name]

linear_regression = LinearRegression()
linear_regression.fit(data, target)
slope = linear_regression.coef_[0]
intercept = linear_regression.intercept_

# Fitted line evaluated on 300 evenly spaced values across the observed range.
_range = np.linspace(data.min(), data.max(), num=300)
predicted_salary = (slope * _range + intercept)

# The 10% sample is seeded so the figure is identical on every run.
sns.scatterplot(
    x=feature_name,
    y=target_name,
    data=temp.sample(frac=0.1, random_state=0),
    s=4,
    alpha=0.5,
)
plt.plot(_range, predicted_salary)
_ = plt.title(f"Linear Regression of salary by {feature_name}")

## Linear regression of salary on intentional walks (IBB)

In [None]:
# Simple linear regression of salary on a single batting stat, plotted over a
# sample of the observations. Only `feature_name` needs to change to reuse
# this cell for another column.
feature_name = "IBB"
target_name = "salary"
data, target = temp[[feature_name]], temp[target_name]

linear_regression = LinearRegression()
linear_regression.fit(data, target)
slope = linear_regression.coef_[0]
intercept = linear_regression.intercept_

# Fitted line evaluated on 300 evenly spaced values across the observed range.
_range = np.linspace(data.min(), data.max(), num=300)
predicted_salary = (slope * _range + intercept)

# The 10% sample is seeded so the figure is identical on every run.
sns.scatterplot(
    x=feature_name,
    y=target_name,
    data=temp.sample(frac=0.1, random_state=0),
    s=4,
    alpha=0.5,
)
plt.plot(_range, predicted_salary)
_ = plt.title(f"Linear Regression of salary by {feature_name}")

## Scatterplot matrix of batting stats and salary, for viewing correlations with salary and possible covariance

In [None]:
# Pairwise scatterplots of salary against a handful of batting stats, to eyeball
# their relationship with salary and with each other.
batting_correlations = temp[['salary', 'H', 'RBI', 'IBB', 'HR', 'G']]

# Plot a 10% random sample to keep the grid readable; the sample is seeded so
# the figure is identical on every run.
g = sns.pairplot(
    batting_correlations.sample(frac=0.1, random_state=0),
    plot_kws={'s': 5},
)
g.fig.suptitle("Scatterplot matrix of salary and batting stats", y=1, fontsize=20)