# Introduction

This notebook takes the DataFrames from INSERT and INSERT and builds a new DataFrame for modeling. The process cleans the game dates, counts wins and losses, and computes the win percentage for each team in each year. It also creates new features from each team's aggregated stats. The final DataFrame has TEAM, YEAR, TEAM STATS and WIN PERCENTAGE.

# Imports

In [1]:
import pandas as pd

# Load Needed DF's

## `MLB_5_seasons`
The last 5 seasons for MLB, starting at 2022, skipping 2020 due to covid, ending at 2017.

In [2]:
# load the player-level batting table covering the five seasons
# NOTE(review): read_pickle unpickles the file, so only load pickles this project produced
MLB_5_seasons = pd.read_pickle("../pickled_tables/MLB_5_seasons_df.pkl")

# view
MLB_5_seasons

Unnamed: 0,Team,Games Played,At Bats,Runs,Hits,Doubles,Triples,Home Runs,Runs Batted In,Walks,Strikeouts,Stolen Bases,Caught Stealing,Batting Average,On-Base Percentage,Slugging Percentage,On-Base Plus Slugging,Year,Player Name,Position
0,NYM,2,2,1,1,0,0,1,3,0,0,0,0,0.500,0.500,2.000,2.500,2022,Khalil Lee,CF
1,ATL,1,4,0,3,2,0,0,3,0,0,0,0,0.750,0.750,1.250,2.000,2022,Chadwick Tromp,C
2,LAD,4,13,6,6,2,0,1,3,2,7,0,0,0.462,0.563,0.846,1.409,2022,James Outman,LF
3,TOR,8,9,0,6,0,0,0,3,1,1,0,1,0.667,0.700,0.667,1.367,2022,Otto Lopez,SS
4,NYY,47,128,28,39,9,0,15,37,19,35,0,0,0.305,0.412,0.727,1.139,2022,Matt Carpenter,DH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5949,BAL,9,1,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2017,Gabriel Ynoa,P
5950,CWS,22,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2017,Michael Ynoa,P
5951,KC,14,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2017,Chris Young,P
5952,MIA,53,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2017,Brad Ziegler,P


## Load df's with regular season games

In [3]:
# load the five regular-season game tables in one pass (there is no 2020 table)
games_2017, games_2018, games_2019, games_2021, games_2022 = map(
    pd.read_pickle,
    [
        "../pickled_tables/2017_df.pkl",
        "../pickled_tables/2018_df.pkl",
        "../pickled_tables/2019_df.pkl",
        "../pickled_tables/2021_df.pkl",
        "../pickled_tables/2022_df.pkl",
    ],
)

# preview the 2017 table
games_2017

Unnamed: 0,Day,Month,Date,Away,Home,Win,W Score,Lose,L Score
0,Sunday,Apr,2,NYY,TB,TB,7,NYY,3
1,Sunday,Apr,2,SF,AZ,AZ,6,SF,5
2,Sunday,Apr,2,CHC,STL,STL,4,CHC,3
3,Monday,Apr,3,MIA,WSH,WSH,4,MIA,2
4,Monday,Apr,3,ATL,NYM,NYM,6,ATL,0
...,...,...,...,...,...,...,...,...,...
2425,Sunday,Oct,1,CWS,CLE,CLE,3,CWS,1
2426,Sunday,Oct,1,ATL,MIA,ATL,8,MIA,5
2427,Sunday,Oct,1,AZ,KC,AZ,14,KC,2
2428,Sunday,Oct,1,MIL,STL,MIL,6,STL,1


# Functions
To streamline the workflow I put most of the df transformations into functions

## `make_year_string`
This function simply makes it easier to input a year and ensures that a valid year is given.

In [4]:
def make_year_string(year):
    """Normalise *year* to a string and check that it is a supported season.

    Parameters
    ----------
    year : int or str
        Season to validate, e.g. ``2022`` or ``'2022'``.

    Returns
    -------
    str or None
        The year as a string when it is one of the supported seasons.
        Otherwise the error message is printed and ``None`` is returned;
        no exception is raised, so callers must check the result.
    """
    valid_years = ('2022', '2021', '2019', '2018', '2017')

    # isinstance also accepts int subclasses, unlike an exact type comparison
    if isinstance(year, int):
        year = str(year)

    if year in valid_years:
        return year

    print("ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']")
    return None

### Examples

In [5]:
# 2020 is not a supported season: the message is printed and None is returned
make_year_string(2020)

ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']


In [6]:
# an int year is accepted and comes back as a string
make_year_string(2022)

'2022'

## `get_team_to_agg`
("Get team to aggregate")

This function selects a team and a year and keeps only the players with more than 5 at bats.


In [7]:
def get_team_to_agg(team, year):
    """Return one team's player rows for a season, limited to regular hitters.

    Parameters
    ----------
    team : str
        Value to match in the ``Team`` column, e.g. ``'SF'``.
    year : int or str
        Season; checked with ``make_year_string``.

    Returns
    -------
    pandas.DataFrame or None
        Rows of ``MLB_5_seasons`` for that team and year whose ``At Bats``
        is greater than 5, or ``None`` when the year is not valid.
    """
    # make_year_string hands back None for an unsupported year
    season = make_year_string(year)
    if season not in ['2022', '2021', '2019', '2018', '2017']:
        return None

    # one combined mask instead of filtering by year and then by team
    in_season = MLB_5_seasons['Year'] == season
    on_team = MLB_5_seasons['Team'] == team
    roster = MLB_5_seasons[in_season & on_team]

    return roster[roster['At Bats'] > 5]

### Examples

In [8]:
# invalid season: the message is printed and the function returns None
get_team_to_agg('SF', 2020)

ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']


In [9]:
# the 'SF' rows for 2022, limited to players with more than 5 at bats
get_team_to_agg('SF', 2022)

Unnamed: 0,Team,Games Played,At Bats,Runs,Hits,Doubles,Triples,Home Runs,Runs Batted In,Walks,Strikeouts,Stolen Bases,Caught Stealing,Batting Average,On-Base Percentage,Slugging Percentage,On-Base Plus Slugging,Year,Player Name,Position
30,SF,134,380,57,104,19,3,23,70,42,100,3,2,0.274,0.353,0.521,0.874,2022,Joc Pederson,LF
68,SF,3,8,1,3,0,0,0,0,1,0,0,0,0.375,0.444,0.375,0.819,2022,Austin Dean,LF
78,SF,36,98,14,26,6,1,4,12,10,29,1,1,0.265,0.342,0.469,0.811,2022,Jason Vosler,3B
111,SF,52,156,21,36,6,1,9,24,18,58,0,1,0.231,0.331,0.455,0.786,2022,David Villar,3B
118,SF,125,277,49,73,15,2,7,34,40,89,12,1,0.264,0.366,0.408,0.774,2022,Austin Slater,CF
134,SF,89,266,31,65,13,0,14,42,27,83,0,0,0.244,0.315,0.451,0.766,2022,Evan Longoria,3B
142,SF,115,318,46,79,16,1,12,35,39,122,1,1,0.248,0.34,0.418,0.758,2022,J.D. Davis,3B
210,SF,140,488,71,127,22,2,14,62,33,89,21,6,0.26,0.322,0.4,0.722,2022,Thairo Estrada,2B
229,SF,151,525,72,120,28,1,19,71,59,103,0,0,0.229,0.316,0.394,0.71,2022,Wilmer Flores,2B
262,SF,148,485,73,104,31,2,17,57,61,141,5,1,0.214,0.305,0.392,0.697,2022,Mike Yastrzemski,RF


## `extra_stats`
Create stats for teams by year. Use `get_team_to_agg` to select a team by year and create a dataframe with team stats for that season.

In [11]:
def extra_stats(team, year):
    """Aggregate one team's batting stats for a season into a single-row frame.

    The roster comes from ``get_team_to_agg`` (players with more than 5 at
    bats). It is fetched once and reused for every total and mean.

    Parameters
    ----------
    team : str
        Team abbreviation as stored in the ``Team`` column, e.g. ``'SF'``.
    year : int or str
        Season to aggregate; checked with ``make_year_string``.

    Returns
    -------
    pandas.DataFrame or None
        One row holding ``team``, ``year``, the team totals (``team_*``),
        the per-player means (``*_mean``) and the RBI conversion rates.
        The row is built column-wise, so numeric columns keep a numeric
        dtype instead of all being stored as generic objects.
        ``None`` is returned for an invalid year, or if a ``ValueError``
        is raised while aggregating.
    """
    # ensure year is a valid string (None comes back for anything else)
    year = make_year_string(year)
    if year not in ['2022', '2021', '2019', '2018', '2017']:
        return None

    try:
        # filter the roster once instead of once per statistic
        players = get_team_to_agg(team, year)
        n = len(players)  # number of players with more than 5 at bats

        # totals - team as a whole
        ab_sum = players['At Bats'].sum()
        hits_sum = players['Hits'].sum()
        avg_sum = players['Batting Average'].sum()
        rbi_sum = players['Runs Batted In'].sum()
        obp_sum = players['On-Base Percentage'].sum()
        slg_sum = players['Slugging Percentage'].sum()
        ops_sum = players['On-Base Plus Slugging'].sum()

        # averages - per player (players with more than 5 At Bats)
        ab_mean = players['At Bats'].mean()
        hits_mean = players['Hits'].mean()
        avg_mean = players['Batting Average'].mean()
        rbi_mean = players['Runs Batted In'].mean()
        obp_mean = players['On-Base Percentage'].mean()
        slg_mean = players['Slugging Percentage'].mean()
        ops_mean = players['On-Base Plus Slugging'].mean()

        # bonus
        # whole team
        hits_conversion_rate_total = rbi_sum / hits_sum
        at_bats_conversion_rate_total = rbi_sum / ab_sum

        # per player
        # NOTE(review): these are the team rates divided by the player count,
        # not the mean of each player's own rate - confirm that is intended
        hits_conversion_rate_mean = hits_conversion_rate_total / n
        at_bats_conversion_rate_mean = at_bats_conversion_rate_total / n

        row = {
            'team': team,
            'year': year,
            'team_ab': ab_sum,
            'team_hits': hits_sum,
            'team_avg': avg_sum,
            'team_rbi': rbi_sum,
            'team_obp': obp_sum,
            'team_slg': slg_sum,
            'team_ops': ops_sum,
            # teams total rbi conversion from hits
            'team_hits_conversion_rate': hits_conversion_rate_total,
            # teams total rbi conversion from at bats
            'team_at_bats_conversion_rate': at_bats_conversion_rate_total,
            'ab_mean': ab_mean,
            'hits_mean': hits_mean,
            'avg_mean': avg_mean,
            'rbi_mean': rbi_mean,
            'obp_mean': obp_mean,
            'slg_mean': slg_mean,
            'ops_mean': ops_mean,
            # per player average rbi conversion from hits
            'hits_conversion_rate_mean': hits_conversion_rate_mean,
            # per player average rbi conversion from at bats
            'at_bats_conversion_rate_mean': at_bats_conversion_rate_mean,
        }

        # A list holding one record gives one row with a dtype inferred per
        # column; building a single column and transposing it would turn
        # every column into object dtype.
        return pd.DataFrame([row], columns=list(row))

    except ValueError:
        # NOTE(review): kept from the original as a best-effort guard; no line
        # above is known to raise ValueError. Callers receive None.
        return None

### Examples

In [12]:
# invalid season: the message is printed and None is returned
extra_stats('SF', 2020)

ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']


In [13]:
# single-row frame of aggregated 2022 stats for 'SF'
extra_stats('SF', 2022)

Unnamed: 0,team,year,team_ab,team_hits,team_avg,team_rbi,team_obp,team_slg,team_ops,team_hits_conversion_rate,team_at_bats_conversion_rate,ab_mean,hits_mean,avg_mean,rbi_mean,obp_mean,slg_mean,ops_mean,hits_conversion_rate_mean,at_bats_conversion_rate_mean
0,SF,2022,5148,1207,5.69,639,7.931,9.103,17.034,0.529412,0.124126,190.667,44.7037,0.210741,23.6667,0.293741,0.337148,0.630889,0.0196078,0.00459725


## `clean_dates`
Takes regular season games df and converts game day info to pandas date time format.

In [14]:
def clean_dates(year):
    """Return a copy of a season's games table with a datetime ``Date`` column.

    The year is combined with the table's ``Month`` and ``Date`` columns and
    parsed with ``pd.to_datetime``. The result replaces ``Date``, and the
    ``Month`` and ``Day`` columns are dropped. The loaded games table itself
    is not modified.

    Parameters
    ----------
    year : int or str
        Season to clean; checked with ``make_year_string``.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned copy, or ``None`` when the year is not valid.
    """
    # ensure year is a valid string (None comes back for anything else)
    year = make_year_string(year)
    if year not in ['2022', '2021', '2019', '2018', '2017']:
        return None

    games_by_year = {
        '2017': games_2017,
        '2018': games_2018,
        '2019': games_2019,
        '2021': games_2021,
        '2022': games_2022,
    }
    new_df = games_by_year[year].copy()

    # Build the "year-month-day" strings for the whole column at once.
    # astype(str) keeps this working if the day of month was loaded as a number.
    date_strings = year + "-" + new_df['Month'].astype(str) + "-" + new_df['Date'].astype(str)

    # convert to pandas datetimes and overwrite the current 'Date' column
    new_df['Date'] = pd.to_datetime(date_strings)

    # 'Month' and 'Day' are now redundant
    return new_df.drop(columns=['Month', 'Day'])

### Examples

In [15]:
# invalid season: the message is printed and None is returned
clean_dates(2020)

ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']


In [16]:
# 2022 games with 'Date' as a datetime column and 'Month'/'Day' removed
clean_dates(2022)

Unnamed: 0,Date,Away,Home,Win,W Score,Lose,L Score
0,2022-04-07,MIL,CHC,CHC,5,MIL,4
1,2022-04-07,CLE,KC,KC,3,CLE,1
2,2022-04-07,PIT,STL,STL,9,PIT,0
3,2022-04-07,NYM,WSH,NYM,5,WSH,1
4,2022-04-07,CIN,ATL,CIN,6,ATL,3
...,...,...,...,...,...,...,...
2426,2022-10-05,PHI,HOU,HOU,3,PHI,2
2427,2022-10-05,MIN,CWS,MIN,10,CWS,1
2428,2022-10-05,ATL,MIA,MIA,12,ATL,9
2429,2022-10-05,AZ,MIL,AZ,4,MIL,2


## `win_percentage`
Creates a new df from the regular season games df with each team's annual win rate, in a column labelled with the year.

In [17]:
def win_percentage(year):
    """Compute each team's share of games won in one season.

    Parameters
    ----------
    year : int or str
        Season to summarise; passed to ``clean_dates``.

    Returns
    -------
    pandas.DataFrame or None
        A frame indexed by team abbreviation with a single column, labelled
        with ``year`` exactly as it was passed in, holding wins divided by
        games played, rounded to 3 decimals. Only teams that also appear in
        ``MLB_5_seasons['Team']`` are kept. ``None`` when ``clean_dates``
        does not return a DataFrame (invalid year).
    """
    year_df = clean_dates(year)
    if not isinstance(year_df, pd.DataFrame):
        return None

    # teams present in the batting table; built once instead of per loop pass
    known_teams = set(MLB_5_seasons['Team'].unique())

    # games won / lost per team (counts of non-null 'Date' in each group)
    wins_by_team = year_df.groupby('Win')['Date'].count()
    losses_by_team = year_df.groupby('Lose')['Date'].count()

    # only teams in the batting table, dropping countries, ex. USA COL PRU
    team_wins = {team: count for team, count in wins_by_team.items()
                 if team in known_teams}
    team_loses = {team: count for team, count in losses_by_team.items()
                  if team in known_teams}

    # Use the union of both tables so that a team with no losses does not
    # raise KeyError and a team with no wins still gets a 0.0 row.
    annual_percent_wins = {}
    for team in sorted(set(team_wins) | set(team_loses)):
        won = team_wins.get(team, 0)
        lost = team_loses.get(team, 0)
        annual_percent_wins[team] = round(won / (won + lost), 3)

    return pd.DataFrame.from_dict(annual_percent_wins, orient='index', columns=[year])

### Examples

In [18]:
# invalid season: the message is printed and None is returned
win_percentage(2020)

ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']


In [19]:
# win rate per team for 2022, in a single column labelled with the year
win_percentage(2022)

Unnamed: 0,2022
ATL,0.623
AZ,0.457
BAL,0.512
BOS,0.481
CHC,0.457
CIN,0.383
CLE,0.568
COL,0.42
CWS,0.5
DET,0.407


# Create modeling DF

## Create `agg_stats_df`
5 seasons of team stats. Each row holds one team's stats for one year (identified by team name and year).

In [20]:
# distinct team abbreviations and seasons found in the player table;
# the team/season loops in the following cells iterate over these
mlb_teams = MLB_5_seasons['Team'].unique()
years = MLB_5_seasons['Year'].unique()

In [21]:
# one single-row stats frame per (season, team) combination
team_stats_dfs = []
for year in years:
    for team in mlb_teams:
        team_stats = extra_stats(team, year)
        # extra_stats returns None when it cannot build a row; skip those explicitly
        if team_stats is not None:
            team_stats_dfs.append(team_stats)

In [22]:
# stack the per-team frames into one table with a fresh 0..n-1 index
agg_stats_df = pd.concat(team_stats_dfs, ignore_index=True)   
agg_stats_df

Unnamed: 0,team,year,team_ab,team_hits,team_avg,team_rbi,team_obp,team_slg,team_ops,team_hits_conversion_rate,team_at_bats_conversion_rate,ab_mean,hits_mean,avg_mean,rbi_mean,obp_mean,slg_mean,ops_mean,hits_conversion_rate_mean,at_bats_conversion_rate_mean
0,NYM,2022,6044,1539,4.883,824,6.46,8.046,14.506,0.535413,0.136334,251.833,64.125,0.203458,34.3333,0.269167,0.33525,0.604417,0.0223089,0.00568056
1,ATL,2022,5920,1469,4.644,784,6.004,7.648,13.652,0.533696,0.132432,281.905,69.9524,0.221143,37.3333,0.285905,0.36419,0.650095,0.0254141,0.00630631
2,LAD,2022,5690,1439,4.324,832,5.606,7.495,13.101,0.578179,0.146221,316.111,79.9444,0.240222,46.2222,0.311444,0.416389,0.727833,0.0321211,0.00812341
3,TOR,2022,6133,1598,4.603,816,5.856,7.021,12.877,0.510638,0.133051,360.765,94,0.270765,48,0.344471,0.413,0.757471,0.0300375,0.00782651
4,NYY,2022,5683,1422,4.318,792,5.715,7.354,13.069,0.556962,0.139363,299.105,74.8421,0.227263,41.6842,0.300789,0.387053,0.687842,0.0293138,0.0073349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,MIL,2017,5779,1450,5.214,741,6.906,8.341,15.247,0.511034,0.128223,222.269,55.7692,0.200538,28.5,0.265615,0.320808,0.586423,0.0196552,0.00493165
146,TB,2017,5911,1421,5.65,723,7.389,9.456,16.845,0.508797,0.122314,236.44,56.84,0.226,28.92,0.29556,0.37824,0.6738,0.0203519,0.00489257
147,CHC,2017,5870,1485,5.452,835,7.356,8.85,16.206,0.56229,0.142249,217.407,55,0.201926,30.9259,0.272444,0.327778,0.600222,0.0208255,0.00526847
148,BAL,2017,5824,1520,5.616,738,6.658,9.289,15.947,0.485526,0.126717,253.217,66.087,0.244174,32.087,0.289478,0.40387,0.693348,0.0211098,0.00550944


In [23]:
# view the column labels as a plain list so none are elided by the frame display
list(agg_stats_df.columns)

['team',
 'year',
 'team_ab',
 'team_hits',
 'team_avg',
 'team_rbi',
 'team_obp',
 'team_slg',
 'team_ops',
 'team_hits_conversion_rate',
 'team_at_bats_conversion_rate',
 'ab_mean',
 'hits_mean',
 'avg_mean',
 'rbi_mean',
 'obp_mean',
 'slg_mean',
 'ops_mean',
 'hits_conversion_rate_mean',
 'at_bats_conversion_rate_mean']

## Create 5 seasons of win percentages

In [24]:
# one single-column frame of win rates per season, in the same order as `years`
join_these = [win_percentage(season) for season in years]

# line the seasons up side by side, indexed by team
win_percent_by_year = pd.concat(join_these, axis=1)
win_percent_by_year

Unnamed: 0,2022,2021,2019,2018,2017
ATL,0.623,0.543,0.599,0.556,0.444
AZ,0.457,0.325,0.525,0.506,0.574
BAL,0.512,0.321,0.333,0.29,0.463
BOS,0.481,0.568,0.521,0.667,0.574
CHC,0.457,0.438,0.519,0.585,0.568
CIN,0.383,0.506,0.463,0.414,0.42
CLE,0.568,0.494,0.574,0.562,0.63
COL,0.42,0.457,0.438,0.558,0.537
CWS,0.5,0.571,0.451,0.383,0.414
DET,0.407,0.475,0.29,0.395,0.395


# Add each team's annual win rate to `agg_stats_df`

## Add new column, fill with zeros

In [25]:
# Start the column as float (0.0, not 0): it is later filled with fractional
# win rates, and writing floats into an integer column forces a dtype upcast.
agg_stats_df['% wins'] = 0.0

agg_stats_df.tail()

Unnamed: 0,team,year,team_ab,team_hits,team_avg,team_rbi,team_obp,team_slg,team_ops,team_hits_conversion_rate,...,ab_mean,hits_mean,avg_mean,rbi_mean,obp_mean,slg_mean,ops_mean,hits_conversion_rate_mean,at_bats_conversion_rate_mean,% wins
145,MIL,2017,5779,1450,5.214,741,6.906,8.341,15.247,0.511034,...,222.269,55.7692,0.200538,28.5,0.265615,0.320808,0.586423,0.0196552,0.00493165,0
146,TB,2017,5911,1421,5.65,723,7.389,9.456,16.845,0.508797,...,236.44,56.84,0.226,28.92,0.29556,0.37824,0.6738,0.0203519,0.00489257,0
147,CHC,2017,5870,1485,5.452,835,7.356,8.85,16.206,0.56229,...,217.407,55.0,0.201926,30.9259,0.272444,0.327778,0.600222,0.0208255,0.00526847,0
148,BAL,2017,5824,1520,5.616,738,6.658,9.289,15.947,0.485526,...,253.217,66.087,0.244174,32.087,0.289478,0.40387,0.693348,0.0211098,0.00550944,0
149,DET,2017,4701,1190,4.703,537,5.837,7.014,12.851,0.451261,...,235.05,59.5,0.23515,26.85,0.29185,0.3507,0.64255,0.022563,0.00571155,0


## Fill with appropriate values

In [26]:
# Look up each row's (team, year) pair in the team x season win-rate table and
# assign the whole column at once, instead of writing cell by cell while
# iterating over the frame. A missing team/year still raises KeyError.
agg_stats_df['% wins'] = [
    win_percent_by_year.loc[team, year]
    for team, year in zip(agg_stats_df['team'], agg_stats_df['year'])
]

agg_stats_df

Unnamed: 0,team,year,team_ab,team_hits,team_avg,team_rbi,team_obp,team_slg,team_ops,team_hits_conversion_rate,...,ab_mean,hits_mean,avg_mean,rbi_mean,obp_mean,slg_mean,ops_mean,hits_conversion_rate_mean,at_bats_conversion_rate_mean,% wins
0,NYM,2022,6044,1539,4.883,824,6.46,8.046,14.506,0.535413,...,251.833,64.125,0.203458,34.3333,0.269167,0.33525,0.604417,0.0223089,0.00568056,0.623
1,ATL,2022,5920,1469,4.644,784,6.004,7.648,13.652,0.533696,...,281.905,69.9524,0.221143,37.3333,0.285905,0.36419,0.650095,0.0254141,0.00630631,0.623
2,LAD,2022,5690,1439,4.324,832,5.606,7.495,13.101,0.578179,...,316.111,79.9444,0.240222,46.2222,0.311444,0.416389,0.727833,0.0321211,0.00812341,0.685
3,TOR,2022,6133,1598,4.603,816,5.856,7.021,12.877,0.510638,...,360.765,94,0.270765,48,0.344471,0.413,0.757471,0.0300375,0.00782651,0.568
4,NYY,2022,5683,1422,4.318,792,5.715,7.354,13.069,0.556962,...,299.105,74.8421,0.227263,41.6842,0.300789,0.387053,0.687842,0.0293138,0.0073349,0.611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,MIL,2017,5779,1450,5.214,741,6.906,8.341,15.247,0.511034,...,222.269,55.7692,0.200538,28.5,0.265615,0.320808,0.586423,0.0196552,0.00493165,0.531
146,TB,2017,5911,1421,5.65,723,7.389,9.456,16.845,0.508797,...,236.44,56.84,0.226,28.92,0.29556,0.37824,0.6738,0.0203519,0.00489257,0.494
147,CHC,2017,5870,1485,5.452,835,7.356,8.85,16.206,0.56229,...,217.407,55,0.201926,30.9259,0.272444,0.327778,0.600222,0.0208255,0.00526847,0.568
148,BAL,2017,5824,1520,5.616,738,6.658,9.289,15.947,0.485526,...,253.217,66.087,0.244174,32.087,0.289478,0.40387,0.693348,0.0211098,0.00550944,0.463


# Save FINAL MODELING DF

In [27]:
# write the finished table (stats plus '% wins') to disk
pd.to_pickle(agg_stats_df, "../pickled_tables/MODELING_DF.pkl")

In [28]:
# read the file straight back - presumably to confirm the save round-trips
modeling_df = pd.read_pickle("../pickled_tables/MODELING_DF.pkl")

In [29]:
# display the reloaded table
modeling_df

Unnamed: 0,team,year,team_ab,team_hits,team_avg,team_rbi,team_obp,team_slg,team_ops,team_hits_conversion_rate,...,ab_mean,hits_mean,avg_mean,rbi_mean,obp_mean,slg_mean,ops_mean,hits_conversion_rate_mean,at_bats_conversion_rate_mean,% wins
0,NYM,2022,6044,1539,4.883,824,6.46,8.046,14.506,0.535413,...,251.833,64.125,0.203458,34.3333,0.269167,0.33525,0.604417,0.0223089,0.00568056,0.623
1,ATL,2022,5920,1469,4.644,784,6.004,7.648,13.652,0.533696,...,281.905,69.9524,0.221143,37.3333,0.285905,0.36419,0.650095,0.0254141,0.00630631,0.623
2,LAD,2022,5690,1439,4.324,832,5.606,7.495,13.101,0.578179,...,316.111,79.9444,0.240222,46.2222,0.311444,0.416389,0.727833,0.0321211,0.00812341,0.685
3,TOR,2022,6133,1598,4.603,816,5.856,7.021,12.877,0.510638,...,360.765,94,0.270765,48,0.344471,0.413,0.757471,0.0300375,0.00782651,0.568
4,NYY,2022,5683,1422,4.318,792,5.715,7.354,13.069,0.556962,...,299.105,74.8421,0.227263,41.6842,0.300789,0.387053,0.687842,0.0293138,0.0073349,0.611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,MIL,2017,5779,1450,5.214,741,6.906,8.341,15.247,0.511034,...,222.269,55.7692,0.200538,28.5,0.265615,0.320808,0.586423,0.0196552,0.00493165,0.531
146,TB,2017,5911,1421,5.65,723,7.389,9.456,16.845,0.508797,...,236.44,56.84,0.226,28.92,0.29556,0.37824,0.6738,0.0203519,0.00489257,0.494
147,CHC,2017,5870,1485,5.452,835,7.356,8.85,16.206,0.56229,...,217.407,55,0.201926,30.9259,0.272444,0.327778,0.600222,0.0208255,0.00526847,0.568
148,BAL,2017,5824,1520,5.616,738,6.658,9.289,15.947,0.485526,...,253.217,66.087,0.244174,32.087,0.289478,0.40387,0.693348,0.0211098,0.00550944,0.463


In [30]:
# column labels of the reloaded table, now including '% wins'
list(modeling_df.columns)

['team',
 'year',
 'team_ab',
 'team_hits',
 'team_avg',
 'team_rbi',
 'team_obp',
 'team_slg',
 'team_ops',
 'team_hits_conversion_rate',
 'team_at_bats_conversion_rate',
 'ab_mean',
 'hits_mean',
 'avg_mean',
 'rbi_mean',
 'obp_mean',
 'slg_mean',
 'ops_mean',
 'hits_conversion_rate_mean',
 'at_bats_conversion_rate_mean',
 '% wins']

In [31]:
# (rows, columns) of the reloaded table
modeling_df.shape

(150, 21)