In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hittingstats/2018_MLB_Hitting_Stats.txt
/kaggle/input/pitchingstats/2018_MLB_Pitching_Stats_II.txt


In [2]:
import math
import matplotlib.pyplot as plt
import seaborn as sb

# Whitespace-delimited text files with the hitting and pitching tables.
path1 = '/kaggle/input/hittingstats/2018_MLB_Hitting_Stats.txt'
path2 = '/kaggle/input/pitchingstats/2018_MLB_Pitching_Stats_II.txt'

# sep=r'\s+' splits on runs of whitespace; it is the supported spelling of the
# deprecated delim_whitespace=True and parses the files the same way.
hd_df = pd.read_csv(path1, sep=r'\s+')
pd_df = pd.read_csv(path2, sep=r'\s+')

# Small hand-made sample used to exercise the regression helpers below.
x = [1.0,2.0,4.0,6.0,5.0,6.0,9.0,8.0,11.0,12.0]
y = [14.0,10.0,12.0,9.0,8.0,6.0,4.0,3.0,3.0,10.0]

In [3]:
def corcoeff(xd, yd):
    """Return the Pearson correlation coefficient of two paired samples.

    Uses the computational form

        r = (n*Sxy - Sx*Sy) / (sqrt(n*Sxx - Sx**2) * sqrt(n*Syy - Sy**2))

    where Sx, Sy, Sxx, Syy and Sxy are the sums of x, y, x**2, y**2 and x*y.

    Parameters
    ----------
    xd, yd : sequence of numbers
        Paired observations. Both must have the same length.

    Returns
    -------
    float
        The correlation coefficient.

    Raises
    ------
    ValueError
        If the inputs differ in length, or if either input has no spread
        (constant or empty data), in which case the coefficient is undefined.
    """
    n = len(xd)
    if n != len(yd):
        # Without this check the sums below would mix differently sized
        # samples and silently produce a meaningless number.
        raise ValueError(
            f"corcoeff requires inputs of equal length, got {n} and {len(yd)}"
        )

    sum_x = sum(xd)
    sum_y = sum(yd)
    sum_xy = sum(a * b for a, b in zip(xd, yd))

    top = (sum_xy * n) - (sum_x * sum_y)
    spread_x = n * sum(val ** 2 for val in xd) - sum_x ** 2
    spread_y = n * sum(val ** 2 for val in yd) - sum_y ** 2

    # Zero spread means a constant (or empty) sample; float rounding can even
    # push the value slightly below zero, which would make sqrt() fail.
    if spread_x <= 0 or spread_y <= 0:
        raise ValueError("corcoeff is undefined when an input has zero variance")

    bottom = math.sqrt(spread_x) * math.sqrt(spread_y)
    return top / bottom
    

def sigma_xy(xd, yd):
    """Return the sum of the element-wise products xd[i] * yd[i].

    Iterates over the indices of ``xd``; ``yd`` is indexed with the same
    positions, so it must be at least as long as ``xd``.
    """
    return sum(xd[idx] * yd[idx] for idx in range(len(xd)))

def least_sqrs(xd, yd):
    """Fit the line y = m*x + b to paired data by ordinary least squares.

    Solves the 2x2 normal equations

        [ sum(x**2)  sum(x) ] [m]   [ sum(x*y) ]
        [ sum(x)     n      ] [b] = [ sum(y)   ]

    Parameters
    ----------
    xd, yd : sequence of numbers
        Paired observations. Both must have the same length.

    Returns
    -------
    numpy.ndarray
        Array of shape (2,) holding ``[slope, intercept]``.

    Raises
    ------
    ValueError
        If the inputs differ in length.
    numpy.linalg.LinAlgError
        If the system is singular (for example all x values are identical).
    """
    if len(xd) != len(yd):
        raise ValueError(
            f"least_sqrs requires inputs of equal length, got {len(xd)} and {len(yd)}"
        )

    sum_x = sum(xd)
    normal_matrix = np.array(
        [[sum(val ** 2 for val in xd), sum_x], [sum_x, len(xd)]],
        dtype=float,
    )
    rhs = np.array(
        [sum(a * b for a, b in zip(xd, yd)), sum(yd)],
        dtype=float,
    )
    # Solve the system directly instead of forming the inverse explicitly:
    # same answer, fewer operations and better numerical behaviour.
    return np.linalg.solve(normal_matrix, rhs)

def scatter_plot(data1, data2, slope, y_int):
    """Scatter ``data2`` against ``data1`` and overlay the line y = slope*x + y_int.

    Parameters
    ----------
    data1, data2 : sequence of numbers
        X and Y coordinates of the points to scatter.
    slope, y_int : number
        Slope and intercept of the line to draw.

    The line is drawn between the smallest and largest x value and is
    annotated with its equation at its right-hand end. The figure is shown
    immediately; nothing is returned.
    """
    x_ends = [min(data1), max(data1)]
    y_ends = [(slope * x_val) + y_int for x_val in x_ends]

    plt.plot(x_ends, y_ends, '-r')
    plt.scatter(data1, data2)
    plt.title('SCATTER PLOT')
    plt.xlabel('X')
    plt.ylabel('Y')
    # Three significant digits keep fractional slopes readable (rounding to an
    # integer turned e.g. 0.37 into 0); the '+' flag writes the intercept's own
    # sign so a negative intercept no longer renders as "+-".
    plt.text(x_ends[1], y_ends[1], f'Y={slope:.3g}*X{y_int:+.3g}', color='g')
    plt.show()
    
    
def mean(data):
    """Return the arithmetic mean of ``data`` (sum divided by length)."""
    return sum(data) / len(data)
    
def median(data):
    """Return the median of ``data`` without modifying the input.

    Parameters
    ----------
    data : iterable of numbers
        Values in any order.

    Returns
    -------
    number
        The middle value for an odd count, or the average of the two middle
        values for an even count.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    # Sort a copy: sorting in place would reorder the caller's list and break
    # any pairing it has with another list (e.g. x/y observations).
    ordered = sorted(data)
    count = len(ordered)
    if count == 0:
        raise ValueError("median requires at least one data point")

    mid = count // 2
    if count % 2 == 0:
        return (ordered[mid] + ordered[mid - 1]) / 2
    return ordered[mid]

def variance(data):
    """Return the population variance of ``data``.

    This is the mean of the squared deviations from the mean, dividing by
    ``len(data)`` (not ``len(data) - 1``).

    Raises
    ------
    ZeroDivisionError
        If ``data`` is empty.
    """
    count = len(data)
    # Compute the mean once; evaluating it inside the comprehension would
    # re-sum the whole list for every element.
    centre = sum(data) / count
    return sum((val - centre) ** 2 for val in data) / count

def stand_dev(data):
    """Return the standard deviation of ``data``: the square root of ``variance(data)``."""
    return math.sqrt(variance(data))

def residuals(xd, yd, n=2):
    """Fit a least-squares line to the data and summarise its residuals.

    Parameters
    ----------
    xd, yd : sequence of numbers
        Paired observations.
    n : int, optional
        Accepted but not used by this function.

    Returns
    -------
    tuple
        ``(residual_list, mean_of_residuals, std_of_residuals)`` where each
        residual is the observed y minus the fitted y at the same position.
    """
    slope, intercept = least_sqrs(xd, yd)
    fitted = [x_val * slope + intercept for x_val in xd]
    diffs = [observed - fitted[pos] for pos, observed in enumerate(yd)]
    return diffs, mean(diffs), stand_dev(diffs)

def scatter_plot_er(data1, data2, slope, y_int, std, n=2):
    """Scatter the data with a fitted line and a +/- n*std band around it.

    Parameters
    ----------
    data1, data2 : sequence of numbers
        X and Y coordinates of the points to scatter.
    slope, y_int : number
        Slope and intercept of the solid line to draw.
    std : number
        Spread used for the band; the dashed lines are offset vertically by
        ``n * std`` above and below the solid line.
    n : number, optional
        Band half-width as a multiple of ``std`` (default 2).

    The lines are drawn between the smallest and largest x value and the solid
    line is annotated with its equation. The figure is shown immediately;
    nothing is returned.
    """
    x_ends = [min(data1), max(data1)]
    band = n * std
    centre = [(slope * x_val) + y_int for x_val in x_ends]
    upper = [y_val + band for y_val in centre]
    lower = [y_val - band for y_val in centre]

    plt.plot(x_ends, centre, '-r')
    plt.plot(x_ends, upper, '--r')
    plt.plot(x_ends, lower, '--r')
    plt.scatter(data1, data2)
    plt.title('SCATTER PLOT')
    plt.xlabel('X')
    plt.ylabel('Y')
    # Three significant digits keep fractional slopes readable (rounding to an
    # integer turned e.g. 0.37 into 0); the '+' flag writes the intercept's own
    # sign so a negative intercept no longer renders as "+-".
    plt.text(x_ends[1], centre[1], f'Y={slope:.3g}*X{y_int:+.3g}', color='g')
    plt.show()
    
def remove_point(xd, yd, slope, y_int, std, n=2):
    """Drop every point lying outside the +/- n*std band around a line.

    A point ``(x, y)`` is removed when ``y`` is strictly above
    ``slope*x + y_int + n*std`` or strictly below ``slope*x + y_int - n*std``.

    Parameters
    ----------
    xd, yd : list of numbers
        Paired observations of equal length. Both lists are updated in place.
    slope, y_int : number
        Slope and intercept of the reference line.
    std : number
        Spread used to size the band.
    n : number, optional
        Band half-width as a multiple of ``std`` (default 2).

    Returns
    -------
    tuple
        The same ``(xd, yd)`` list objects, now holding only the kept points.

    Raises
    ------
    ValueError
        If ``xd`` and ``yd`` differ in length.
    """
    if len(xd) != len(yd):
        raise ValueError(
            f"remove_point requires inputs of equal length, got {len(xd)} and {len(yd)}"
        )

    band = n * std
    kept_x = []
    kept_y = []
    # Collect survivors first and write them back afterwards. Popping while
    # walking the indices skips the element after each removal and runs past
    # the end of the shortened lists.
    for x_val, y_val in zip(xd, yd):
        fitted = (slope * x_val) + y_int
        if y_val > fitted + band or y_val < fitted - band:
            continue
        kept_x.append(x_val)
        kept_y.append(y_val)

    # Slice assignment keeps the caller's list objects, matching the in-place
    # behaviour callers already rely on.
    xd[:] = kept_x
    yd[:] = kept_y
    return xd, yd

In [4]:
def rmse(res):
    """Return the root-mean-square of the values in ``res``.

    Squares every value, averages the squares, and takes the square root.
    """
    squared = [item ** 2 for item in res]
    return math.sqrt(sum(squared) / len(squared))

# Fit a least-squares line to the sample x/y lists; residuals() returns the
# per-point residuals, their mean, and their standard deviation.
tres, tmres, tstdres = residuals(x, y)
# Root-mean-square of those residuals.
rmset = rmse(tres)


In [5]:
# Hitting table (hd_df): pull the columns of interest out as plain Python lists.
# Hitting correlation coefficients
obp_list = hd_df['OBP'].values.tolist()
slg_list = hd_df[ 'SLG'].values.tolist()
avg_list = hd_df['AVG'].values.tolist()
rbi_list = hd_df['RBI'].values.tolist()
# WPCT is presumably the winning percentage -- confirm against the data source.
wpct_list = hd_df['WPCT'].values.tolist()

# Correlation of WPCT with each hitting column.
wpct_obp = corcoeff(wpct_list, obp_list)
wpct_slg = corcoeff(wpct_list, slg_list)
wpct_avg = corcoeff(wpct_list, avg_list)
wpct_rbi = corcoeff(wpct_list, rbi_list)

# Pitching correlation coefficients
era_list = pd_df['ERA'].values.tolist()
so_list = pd_df['SO'].values.tolist()
er_list = pd_df['ER'].values.tolist()
hr_list = pd_df['HR'].values.tolist()
# The pitching table carries its own WPCT column, read separately from hd_df's.
pwpct_list = pd_df['WPCT'].values.tolist()

# Correlation of WPCT with each pitching column.
wpct_era = corcoeff(pwpct_list, era_list)
wpct_so = corcoeff(pwpct_list, so_list)
wpct_er = corcoeff(pwpct_list, er_list)
wpct_hr = corcoeff(pwpct_list, hr_list)
# New data frame for the correlations: one row per (hitting stat, pitching stat)
# pair; rows are aligned by position only, the paired stats are unrelated.
# NOTE(review): the column label 'hitting' is lowercase while 'Pitching' is capitalised.

coeeff_dict = {'H Name': ['OBP', 'SLG', 'AVG', "RBI"],
               'hitting': [wpct_obp, wpct_slg, wpct_avg, wpct_rbi],
               'P Name': ['ERA', 'SO', 'ER', 'HR'],
               'Pitching': [wpct_era, wpct_so, wpct_er, wpct_hr]
           }
ndf = pd.DataFrame(coeeff_dict)
# Bare last expression so the notebook renders the table.
ndf


Unnamed: 0,H Name,hitting,P Name,Pitching
0,OBP,0.798343,ERA,-0.875797
1,SLG,0.749826,SO,0.736197
2,AVG,0.68859,ER,-0.874808
3,RBI,0.823533,HR,-0.656634
