By: Denver Chernin <br>
Goal: Determine if playing in college creates a better MLB player, then classify new players as college players or not based on their statistics (wOBA, OPS, etc.). The exact statistics to use have yet to be determined.

In [1]:
#Installing necessary package for data access
import sys
!conda install --yes --prefix {sys.prefix} pandasql

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - pandasql


The following NEW packages will be INSTALLED:

  pandasql           conda-forge/noarch::pandasql-0.7.3-pyhd8ed1ab_0

The following packages will be UPDATED:

  ca-certificates                      2021.5.30-ha878542_0 --> 2021.10.8-ha878542_0
  certifi                          2021.5.30-py39hf3d152e_0 --> 2021.10.8-py39hf3d152e_0
  openssl                                 1.1.1k-h7f98852_1 --> 1.1.1l-h7f98852_0


Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [2]:
#Importing necessary libraries
import pandas as p #dataframes to hold tables
from pandasql import sqldf #sql to dataframe conversion for data access

In [3]:
#Helper function to make sql queries of data frames cleaner
def pysqldf(q):
    """Run the SQL query string `q` through pandasql and return the result.

    The notebook's global namespace is handed to `sqldf` as the lookup
    environment, so a query can refer to any DataFrame defined at the top
    level of the notebook by its variable name.
    """
    return sqldf(q, globals())

In [4]:
#Reading in every source table in one pass: the names on the left line up one-to-one,
#in order, with the CSV paths listed below
df_college, df_batting, df_pitchers, df_people, df_weights = (
    p.read_csv(csv_path)
    for csv_path in (
        '../data/CollegePlaying.csv', #the college data
        '../data/Batting.csv',        #the batting data of all players
        '../data/Pitching.csv',       #the pitching data for all pitchers
        '../data/People.csv',         #the people data (translates playerIDs into first and last names)
        '../data/wOBA_Weights.csv',   #the wOBA (weighted on base average) weights for every year: the linear
                                      #equation for wOBA stays the same every year, but the weights do not
    )
)

In [5]:
#Grabbing the unique playerIDs that exist in the df_college table (AKA obtaining what players played in college)
#DISTINCT already collapses duplicate ids, so no GROUP BY is needed; ORDER BY makes the row order explicit
df_college_playerids = pysqldf("SELECT DISTINCT playerID FROM df_college ORDER BY playerID")

In [6]:
#Previewing the first rows of the college player ids (head() shows 5 rows by default)
df_college_playerids.head()

Unnamed: 0,playerID
0,aardsda01
1,abadan01
2,abbeybe01
3,abbotje01
4,abbotji01


In [7]:
#Grabbing all of the batting statistics for each player that went to college
#The subquery names playerID explicitly so the IN comparison keeps working even if
#df_college_playerids ever carries more than one column
df_batters_college = pysqldf(
    "SELECT * FROM df_batting "
    "WHERE df_batting.playerID IN (SELECT playerID FROM df_college_playerids)"
)

In [8]:
#Previewing the first rows of the college batters (head() shows 5 rows by default)
df_batters_college.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,bellast01,1871,1,TRO,,29,128,26,32,3,...,23.0,4.0,4.0,9,2.0,,,,,2.0
1,mackde01,1871,1,RC1,,25,122,34,30,7,...,17.0,12.0,0.0,8,7.0,,,,,0.0
2,bellast01,1872,1,TRO,,23,115,22,30,4,...,17.0,1.0,0.0,0,0.0,,,,,2.0
3,fletcge01,1872,1,BR1,,2,8,1,2,0,...,1.0,0.0,0.0,0,1.0,,,,,0.0
4,mackde01,1872,1,PH1,,47,205,68,59,9,...,34.0,9.0,5.0,23,9.0,,,,,1.0


In [9]:
#Grabbing all of the batting statistics for each player that DIDNT go to college
#NULL ids are filtered out of the subquery: a single NULL inside a NOT IN list makes the
#comparison evaluate to NULL for every non-matching row, which would silently return zero rows
df_batters_no_college = pysqldf(
    "SELECT * FROM df_batting "
    "WHERE df_batting.playerID NOT IN "
    "(SELECT playerID FROM df_college_playerids WHERE playerID IS NOT NULL)"
)

In [10]:
#Previewing the first rows of the non-college batters (head() shows 5 rows by default)
df_batters_no_college.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,0.0
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,0.0
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,1.0
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,0.0
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,0.0


In [34]:
#Counting the rows of df_batters_no_college with G >= 50; the query returns a one-row frame
#whose single column C holds the count
over50_no_college_count = pysqldf("SELECT COUNT(*) as C from df_batters_no_college where G >= 50")
print(f"Number of games played >= 50, no college: {over50_no_college_count['C'].iloc[0]}")

Number of games played >= 50, no college: 26856


In [35]:
#Counting the rows of df_batters_college with G >= 50; the query returns a one-row frame
#whose single column C holds the count
over50_college_count = pysqldf("SELECT COUNT(*) as C from df_batters_college where G >= 50")
print(f"Number of games played >= 50, college: {over50_college_count['C'].iloc[0]}")

Number of games played >= 50, college: 13889
