# Introduction

This notebook takes the DataFrames produced by INSERT and INSERT and builds a new DataFrame to be used for modeling. Along the way it cleans the game dates, counts wins and losses, and computes the win percentage for each team for each year. It also creates new features from each team's aggregated player stats. The final DataFrame has TEAM, YEAR, TEAM STATS, and WIN PERCENTAGE.

# Imports

In [1]:
import pandas as pd

# Load Needed DF's

## `MLB_5_seasons`
The last 5 seasons for MLB, starting at 2022, skipping 2020 due to covid, ending at 2017.

In [2]:
# load the pickled player-level batting table (one row per player per season)
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by this project.
MLB_5_seasons = pd.read_pickle("../pickled_tables/MLB_5_seasons_df.pkl")

# view
MLB_5_seasons

Unnamed: 0,Team,Games Played,At Bats,Runs,Hits,Doubles,Triples,Home Runs,Runs Batted In,Walks,Strikeouts,Stolen Bases,Caught Stealing,Batting Average,On-Base Percentage,Slugging Percentage,On-Base Plus Slugging,Year,Player Name,Position
0,NYM,2,2,1,1,0,0,1,3,0,0,0,0,0.500,0.500,2.000,2.500,2022,Khalil Lee,CF
1,ATL,1,4,0,3,2,0,0,3,0,0,0,0,0.750,0.750,1.250,2.000,2022,Chadwick Tromp,C
2,LAD,4,13,6,6,2,0,1,3,2,7,0,0,0.462,0.563,0.846,1.409,2022,James Outman,LF
3,TOR,8,9,0,6,0,0,0,3,1,1,0,1,0.667,0.700,0.667,1.367,2022,Otto Lopez,SS
4,NYY,47,128,28,39,9,0,15,37,19,35,0,0,0.305,0.412,0.727,1.139,2022,Matt Carpenter,DH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5949,BAL,9,1,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2017,Gabriel Ynoa,P
5950,CWS,22,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2017,Michael Ynoa,P
5951,KC,14,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2017,Chris Young,P
5952,MIA,53,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2017,Brad Ziegler,P


## Load df's with regular season games

In [3]:
# load one game-results table per season (2020 is not loaded)
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by this project.
games_2017 = pd.read_pickle("../pickled_tables/2017_df.pkl")
games_2018 = pd.read_pickle("../pickled_tables/2018_df.pkl")
games_2019 = pd.read_pickle("../pickled_tables/2019_df.pkl")
games_2021 = pd.read_pickle("../pickled_tables/2021_df.pkl")
games_2022 = pd.read_pickle("../pickled_tables/2022_df.pkl")

# view one season as a sample; clean_dates and win_percentage read these
# tables through the games_<year> globals
games_2017

Unnamed: 0,Day,Month,Date,Away,Home,Win,W Score,Lose,L Score
0,Sunday,Apr,2,NYY,TB,TB,7,NYY,3
1,Sunday,Apr,2,SF,AZ,AZ,6,SF,5
2,Sunday,Apr,2,CHC,STL,STL,4,CHC,3
3,Monday,Apr,3,MIA,WSH,WSH,4,MIA,2
4,Monday,Apr,3,ATL,NYM,NYM,6,ATL,0
...,...,...,...,...,...,...,...,...,...
2425,Sunday,Oct,1,CWS,CLE,CLE,3,CWS,1
2426,Sunday,Oct,1,ATL,MIA,ATL,8,MIA,5
2427,Sunday,Oct,1,AZ,KC,AZ,14,KC,2
2428,Sunday,Oct,1,MIL,STL,MIL,6,STL,1


# Functions
To streamline the workflow I put most of the df transformations into functions

## `make_year_string`
This function simply makes it easier to input a year and ensures that the year is a valid one.

In [4]:
def make_year_string(year):
    """Normalise ``year`` to a string and check that it is a supported season.

    Parameters
    ----------
    year : int or str
        Season to validate. Any integer (built-in ``int`` or a NumPy integer
        such as the values pandas hands back) is converted with ``str``.

    Returns
    -------
    str or None
        The year as a string when it is one of the supported seasons.
        Otherwise a message is printed and ``None`` is returned. The other
        functions in this notebook rely on that ``None`` (no exception is
        raised), so the contract is kept as is.
    """
    # Local import keeps this cell self-contained.
    import numbers

    valid_years = ['2022', '2021', '2019', '2018', '2017']

    # isinstance against numbers.Integral (rather than ``type(year) == int``)
    # also accepts NumPy integer scalars, which would otherwise be rejected.
    if isinstance(year, numbers.Integral):
        year = str(year)

    if year not in valid_years:
        print("ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']")
        return None

    return year

### Examples

In [5]:
make_year_string(2020)

ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']


In [6]:
make_year_string(2022)

'2022'

## `get_team_to_agg`
("Get team to aggregate")

This function selects a team and a year and keeps only the players with more than 5 at bats (players with 5 or fewer at bats are filtered out).


In [7]:
def get_team_to_agg(team, year):
    """Return one team's players for one season, keeping those with more than 5 at bats.

    Reads the module-level ``MLB_5_seasons`` table. ``year`` may be an int or
    a string; it is normalised by ``make_year_string``. Returns ``None`` when
    the year is not one of the supported seasons.
    """
    season = make_year_string(year)

    # make_year_string hands back None for an unsupported season
    if season not in ['2022', '2021', '2019', '2018', '2017']:
        return None

    # narrow down step by step: season -> team -> at-bat threshold
    in_season = MLB_5_seasons[MLB_5_seasons['Year'] == season]
    roster = in_season[in_season['Team'] == team]
    enough_at_bats = roster['At Bats'] > 5

    return roster[enough_at_bats]

### Examples

In [8]:
get_team_to_agg('SF', 2020)

ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']


In [9]:
get_team_to_agg('SF', 2022)

Unnamed: 0,Team,Games Played,At Bats,Runs,Hits,Doubles,Triples,Home Runs,Runs Batted In,Walks,Strikeouts,Stolen Bases,Caught Stealing,Batting Average,On-Base Percentage,Slugging Percentage,On-Base Plus Slugging,Year,Player Name,Position
30,SF,134,380,57,104,19,3,23,70,42,100,3,2,0.274,0.353,0.521,0.874,2022,Joc Pederson,LF
68,SF,3,8,1,3,0,0,0,0,1,0,0,0,0.375,0.444,0.375,0.819,2022,Austin Dean,LF
78,SF,36,98,14,26,6,1,4,12,10,29,1,1,0.265,0.342,0.469,0.811,2022,Jason Vosler,3B
111,SF,52,156,21,36,6,1,9,24,18,58,0,1,0.231,0.331,0.455,0.786,2022,David Villar,3B
118,SF,125,277,49,73,15,2,7,34,40,89,12,1,0.264,0.366,0.408,0.774,2022,Austin Slater,CF
134,SF,89,266,31,65,13,0,14,42,27,83,0,0,0.244,0.315,0.451,0.766,2022,Evan Longoria,3B
142,SF,115,318,46,79,16,1,12,35,39,122,1,1,0.248,0.34,0.418,0.758,2022,J.D. Davis,3B
210,SF,140,488,71,127,22,2,14,62,33,89,21,6,0.26,0.322,0.4,0.722,2022,Thairo Estrada,2B
229,SF,151,525,72,120,28,1,19,71,59,103,0,0,0.229,0.316,0.394,0.71,2022,Wilmer Flores,2B
262,SF,148,485,73,104,31,2,17,57,61,141,5,1,0.214,0.305,0.392,0.697,2022,Mike Yastrzemski,RF


## `extra_stats`
Create stats for teams by year. Use `get_team_to_agg` to select a team by year and create a dataframe with team stats for that season.

In [10]:
def extra_stats(team, year):
    """Build a one-row DataFrame of season batting aggregates for one team.

    The players are selected with ``get_team_to_agg`` (same team, same season,
    more than 5 at bats). For every counting stat the row holds the sum over
    those players and the mean over those players.

    Parameters
    ----------
    team : str
        Team abbreviation as it appears in the 'Team' column.
    year : int or str
        Season; normalised and validated by ``make_year_string``.

    Returns
    -------
    pandas.DataFrame or None
        One row with the columns 'Team', 'Year', '<stat> Sum' for each stat,
        then 'Mean <stat>' for each stat. ``None`` when the year is not
        supported or when a ``ValueError`` occurs while aggregating.
    """
    # Counting stats only. Ratio stats (AVG, OBP, SLG, OPS) are left out
    # because summing or averaging them across players is not meaningful.
    stat_columns = ['Games Played',
                    'At Bats',
                    'Runs',
                    'Hits',
                    'Doubles',
                    'Triples',
                    'Home Runs',
                    'Runs Batted In',
                    'Walks',
                    'Strikeouts',
                    'Stolen Bases',
                    'Caught Stealing']

    # ensure year is a valid string (None for an unsupported season)
    year = make_year_string(year)
    if year not in ['2022', '2021', '2019', '2018', '2017']:
        return None

    try:
        # Filter the roster once and reuse it for every aggregate, instead
        # of re-running the same filter for each individual stat.
        roster = get_team_to_agg(team, year)

        # cumulative totals and per-player averages, both over the players
        # with more than 5 at bats
        sums = [roster[col].sum() for col in stat_columns]
        means = [roster[col].mean() for col in stat_columns]

        convert = [team, year] + sums + means
        cols = (['Team', 'Year']
                + [col + ' Sum' for col in stat_columns]
                + ['Mean ' + col for col in stat_columns])

        # Built as a single column and transposed, exactly as before, so the
        # resulting frame keeps the same (object) column dtypes.
        df = pd.DataFrame(convert).T
        df.columns = cols

        return df

    except ValueError as err:
        # NOTE(review): it is not clear from this notebook which call is
        # expected to raise ValueError. The original swallowed it silently;
        # the None result is kept but the failure is now reported.
        print("ValueError: could not aggregate stats for "
              + str(team) + " " + year + ": " + str(err))
        return None

### Examples

In [11]:
extra_stats('SF', 2020)

ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']


In [12]:
extra_stats('SF', 2022)

Unnamed: 0,Team,Year,Games Played Sum,At Bats Sum,Runs Sum,Hits Sum,Doubles Sum,Triples Sum,Home Runs Sum,Runs Batted In Sum,...,Mean Runs,Mean Hits,Mean Doubles,Mean Triples,Mean Home Runs,Mean Runs Batted In,Mean Walks,Mean Strikeouts,Mean Stolen Bases,Mean Caught Stealing
0,SF,2022,1732,5148,678,1207,250,19,171,639,...,25.1111,44.7037,9.25926,0.703704,6.33333,23.6667,20.1111,51.4815,2.18519,0.62963


## `clean_dates`
Takes regular season games df and converts game day info to pandas date time format.

In [13]:
def clean_dates(year):
    """Return a copy of one season's games table with a real datetime 'Date' column.

    The season's table is taken from the module-level ``games_<year>`` frames.
    The 'Month' and 'Date' columns are combined with the year into a date
    string, parsed with ``pd.to_datetime`` and written over 'Date'; the
    'Month' and 'Day' columns are then dropped. The original table is left
    untouched.

    Parameters
    ----------
    year : int or str
        Season; normalised and validated by ``make_year_string``.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned copy, or ``None`` when the year is not supported.
    """
    # ensure year is a valid string (None for an unsupported season)
    year = make_year_string(year)
    if year not in ['2022', '2021', '2019', '2018', '2017']:
        return None

    games_by_year = {
        '2017': games_2017,
        '2018': games_2018,
        '2019': games_2019,
        '2021': games_2021,
        '2022': games_2022,
    }

    # work on a copy so the loaded table is never modified
    new_df = games_by_year[year].copy()

    # --- add a pandas datetime column ---
    # Build "<year>-<month>-<day>" for every row in one vectorised step
    # rather than looping over rows. astype(str) keeps this working whether
    # the day-of-month is stored as text or as a number.
    date_strings = (year + "-"
                    + new_df['Month'].astype(str) + "-"
                    + new_df['Date'].astype(str))

    # convert with pd.to_datetime and overwrite the current 'Date' column
    new_df['Date'] = pd.to_datetime(date_strings)

    # drop 'Month' and 'Day' columns, now redundant
    new_df.drop(['Month', 'Day'], axis=1, inplace=True)

    return new_df

### Examples

In [14]:
clean_dates(2020)

ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']


In [15]:
clean_dates(2022)

Unnamed: 0,Date,Away,Home,Win,W Score,Lose,L Score
0,2022-04-07,MIL,CHC,CHC,5,MIL,4
1,2022-04-07,CLE,KC,KC,3,CLE,1
2,2022-04-07,PIT,STL,STL,9,PIT,0
3,2022-04-07,NYM,WSH,NYM,5,WSH,1
4,2022-04-07,CIN,ATL,CIN,6,ATL,3
...,...,...,...,...,...,...,...
2426,2022-10-05,PHI,HOU,HOU,3,PHI,2
2427,2022-10-05,MIN,CWS,MIN,10,CWS,1
2428,2022-10-05,ATL,MIA,MIA,12,ATL,9
2429,2022-10-05,AZ,MIL,AZ,4,MIL,2


## `win_percentage`
Creates a new df from the regular season games df with each team's annual win rate for the given year.

In [16]:
def win_percentage(year):
    """Return each MLB team's win percentage for one season.

    Wins and losses are counted from the 'Win' and 'Lose' columns of the
    table returned by ``clean_dates``. Only teams that appear in
    ``MLB_5_seasons['Team']`` are kept (this drops non-MLB entries such as
    countries).

    Parameters
    ----------
    year : int or str
        Season; validated inside ``clean_dates``.

    Returns
    -------
    pandas.DataFrame or None
        Index is the team abbreviation, the single column is labelled with
        ``year`` exactly as it was passed in, and the values are win
        percentages rounded to 2 decimals. ``None`` when the year is not
        supported.
    """
    year_df = clean_dates(year)

    # clean_dates returns None for an unsupported year
    if not isinstance(year_df, pd.DataFrame):
        return None

    # Build the membership set once instead of recomputing .unique() for
    # every team inside the loops.
    mlb_teams = set(MLB_5_seasons['Team'].unique())

    # number of wins / losses per team, restricted to MLB teams
    all_wins = dict(year_df.groupby('Win')['Date'].count())
    all_loses = dict(year_df.groupby('Lose')['Date'].count())
    team_wins = {team: n for team, n in all_wins.items() if team in mlb_teams}
    team_loses = {team: n for team, n in all_loses.items() if team in mlb_teams}

    # Walk over every team seen on either side. A team missing from one of
    # the dicts simply has a count of 0 there: previously a team with no
    # losses raised KeyError and a team with no wins was silently left out.
    annual_percent_wins = {}
    for team in sorted(set(team_wins) | set(team_loses)):
        wins = team_wins.get(team, 0)
        loses = team_loses.get(team, 0)
        annual_percent_wins[team] = round((wins / (wins + loses)) * 100, 2)

    # NOTE(review): the column label is the caller's ``year`` value, not the
    # normalised string, so an int argument gives an int label - confirm
    # that callers always look the column up with the same type.
    per_win_without_yr = pd.DataFrame.from_dict(annual_percent_wins,
                                                orient='index',
                                                columns=[year])

    return per_win_without_yr

### Examples

In [17]:
win_percentage(2020)

ValueError: Enter a valid year ['2022', '2021', '2019', '2018', '2017']


In [18]:
win_percentage(2022)

Unnamed: 0,2022
ATL,62.35
AZ,45.68
BAL,51.23
BOS,48.15
CHC,45.68
CIN,38.27
CLE,56.79
COL,41.98
CWS,50.0
DET,40.74


# Create modeling DF

## Create `agg_stats_df`
5 seasons of team stats. Each row holds one team's stats for one year (identified by team name and year).

In [19]:
# every distinct team abbreviation and season label present in the player table;
# these drive the loops that build the per-team, per-year rows below
mlb_teams = MLB_5_seasons['Team'].unique()
years = MLB_5_seasons['Year'].unique()

In [20]:
# One aggregate row per (year, team) pair. extra_stats does its own
# filtering of MLB_5_seasons, so no per-year / per-team slicing is needed
# here (the slices that used to be built in this loop were never used).
team_stats_dfs = []
for year in years:
    for team in mlb_teams:
        team_stats = extra_stats(team, year)
        team_stats_dfs.append(team_stats)

In [21]:
# stack the one-row frames into a single table with a fresh 0..n-1 index
# NOTE: pd.concat silently drops None entries, so any (team, year) pair for
# which extra_stats returned None is simply missing from the result
agg_stats_df = pd.concat(team_stats_dfs, ignore_index=True)   
agg_stats_df

Unnamed: 0,Team,Year,Games Played Sum,At Bats Sum,Runs Sum,Hits Sum,Doubles Sum,Triples Sum,Home Runs Sum,Runs Batted In Sum,...,Mean Runs,Mean Hits,Mean Doubles,Mean Triples,Mean Home Runs,Mean Runs Batted In,Mean Walks,Mean Strikeouts,Mean Stolen Bases,Mean Caught Stealing
0,NYM,2022,1845,6044,844,1539,295,29,201,824,...,35.1667,64.125,12.2917,1.20833,8.375,34.3333,24.4167,57.25,2.66667,1.04167
1,ATL,2022,1705,5920,818,1469,311,12,246,784,...,38.9524,69.9524,14.8095,0.571429,11.7143,37.3333,24.619,77.5238,4.33333,1.52381
2,LAD,2022,1661,5690,866,1439,324,31,221,832,...,48.1111,79.9444,18,1.72222,12.2778,46.2222,35.3889,80.8333,5.55556,1
3,TOR,2022,1851,6133,840,1598,344,14,205,816,...,49.4118,94,20.2353,0.823529,12.0588,48,31.7059,78.1765,4.94118,2.41176
4,NYY,2022,1729,5683,841,1422,239,12,249,792,...,44.2632,74.8421,12.5789,0.631579,13.1053,41.6842,33.1053,71.6842,6.05263,1.94737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,MIL,2017,1959,5779,777,1450,286,24,236,741,...,29.8846,55.7692,11,0.923077,9.07692,28.5,22.4231,62.1923,4.76923,1.61538
146,TB,2017,1902,5911,753,1421,260,30,247,723,...,30.12,56.84,10.4,1.2,9.88,28.92,23.84,66.84,3.6,1.48
147,CHC,2017,2056,5870,865,1485,285,30,241,835,...,32.037,55,10.5556,1.11111,8.92593,30.9259,24.7407,56.6296,2.48148,1.37037
148,BAL,2017,1728,5824,763,1520,270,15,243,738,...,33.1739,66.087,11.7391,0.652174,10.5652,32.087,17.5652,64.6522,1.6087,0.73913


In [22]:
# view as a list for easy reading
list(agg_stats_df.columns)

['Team',
 'Year',
 'Games Played Sum',
 'At Bats Sum',
 'Runs Sum',
 'Hits Sum',
 'Doubles Sum',
 'Triples Sum',
 'Home Runs Sum',
 'Runs Batted In Sum',
 'Walks Sum',
 'Strikeouts Sum',
 'Stolen Bases Sum',
 'Caught Stealing Sum',
 'Mean Games Played',
 'Mean At Bats',
 'Mean Runs',
 'Mean Hits',
 'Mean Doubles',
 'Mean Triples',
 'Mean Home Runs',
 'Mean Runs Batted In',
 'Mean Walks',
 'Mean Strikeouts',
 'Mean Stolen Bases',
 'Mean Caught Stealing']

## Create 5 seasons of win percentages

In [23]:
# one single-column frame of win percentages per season ...
join_these = [win_percentage(season) for season in years]

# ... placed side by side: teams as the index, one column per season
win_percent_by_year = pd.concat(join_these, axis=1)
win_percent_by_year

Unnamed: 0,2022,2021,2019,2018,2017
ATL,62.35,54.32,59.88,55.56,44.44
AZ,45.68,32.52,52.47,50.62,57.41
BAL,51.23,32.1,33.33,29.01,46.3
BOS,48.15,56.79,52.15,66.67,57.41
CHC,45.68,43.83,51.85,58.54,56.79
CIN,38.27,50.61,46.3,41.36,41.98
CLE,56.79,49.38,57.41,56.17,62.96
COL,41.98,45.68,43.83,55.83,53.7
CWS,50.0,57.06,45.06,38.27,41.36
DET,40.74,47.53,29.01,39.51,39.51


# Add teams annual win rate to `agg_stats_df`

## Add new column, fill with zeros

In [24]:
# Placeholder column that is overwritten with the real win percentages.
# Start from a float (0.0, not 0) so that writing float percentages into it
# later does not force a dtype change on the column.
agg_stats_df['% wins'] = 0.0

agg_stats_df.tail()

Unnamed: 0,Team,Year,Games Played Sum,At Bats Sum,Runs Sum,Hits Sum,Doubles Sum,Triples Sum,Home Runs Sum,Runs Batted In Sum,...,Mean Hits,Mean Doubles,Mean Triples,Mean Home Runs,Mean Runs Batted In,Mean Walks,Mean Strikeouts,Mean Stolen Bases,Mean Caught Stealing,% wins
145,MIL,2017,1959,5779,777,1450,286,24,236,741,...,55.7692,11.0,0.923077,9.07692,28.5,22.4231,62.1923,4.76923,1.61538,0
146,TB,2017,1902,5911,753,1421,260,30,247,723,...,56.84,10.4,1.2,9.88,28.92,23.84,66.84,3.6,1.48,0
147,CHC,2017,2056,5870,865,1485,285,30,241,835,...,55.0,10.5556,1.11111,8.92593,30.9259,24.7407,56.6296,2.48148,1.37037,0
148,BAL,2017,1728,5824,763,1520,270,15,243,738,...,66.087,11.7391,0.652174,10.5652,32.087,17.5652,64.6522,1.6087,0.73913,0
149,DET,2017,1475,4701,588,1190,230,33,133,537,...,59.5,11.5,1.65,6.65,26.85,18.75,52.05,2.65,1.4,0


## Fill with appropriate values

In [25]:
# Look up every row's (team, year) pair in the wide win-percentage table
# (teams on the index, seasons as columns) and assign the whole column in
# one step, instead of writing one cell at a time inside an iterrows loop.
agg_stats_df['% wins'] = [
    win_percent_by_year.loc[team, year]
    for team, year in zip(agg_stats_df['Team'], agg_stats_df['Year'])
]

agg_stats_df

Unnamed: 0,Team,Year,Games Played Sum,At Bats Sum,Runs Sum,Hits Sum,Doubles Sum,Triples Sum,Home Runs Sum,Runs Batted In Sum,...,Mean Hits,Mean Doubles,Mean Triples,Mean Home Runs,Mean Runs Batted In,Mean Walks,Mean Strikeouts,Mean Stolen Bases,Mean Caught Stealing,% wins
0,NYM,2022,1845,6044,844,1539,295,29,201,824,...,64.125,12.2917,1.20833,8.375,34.3333,24.4167,57.25,2.66667,1.04167,62.35
1,ATL,2022,1705,5920,818,1469,311,12,246,784,...,69.9524,14.8095,0.571429,11.7143,37.3333,24.619,77.5238,4.33333,1.52381,62.35
2,LAD,2022,1661,5690,866,1439,324,31,221,832,...,79.9444,18,1.72222,12.2778,46.2222,35.3889,80.8333,5.55556,1,68.52
3,TOR,2022,1851,6133,840,1598,344,14,205,816,...,94,20.2353,0.823529,12.0588,48,31.7059,78.1765,4.94118,2.41176,56.79
4,NYY,2022,1729,5683,841,1422,239,12,249,792,...,74.8421,12.5789,0.631579,13.1053,41.6842,33.1053,71.6842,6.05263,1.94737,61.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,MIL,2017,1959,5779,777,1450,286,24,236,741,...,55.7692,11,0.923077,9.07692,28.5,22.4231,62.1923,4.76923,1.61538,53.09
146,TB,2017,1902,5911,753,1421,260,30,247,723,...,56.84,10.4,1.2,9.88,28.92,23.84,66.84,3.6,1.48,49.38
147,CHC,2017,2056,5870,865,1485,285,30,241,835,...,55,10.5556,1.11111,8.92593,30.9259,24.7407,56.6296,2.48148,1.37037,56.79
148,BAL,2017,1728,5824,763,1520,270,15,243,738,...,66.087,11.7391,0.652174,10.5652,32.087,17.5652,64.6522,1.6087,0.73913,46.30


# Save FINAL MODELING DF

In [26]:
# persist the finished table (team stats plus '% wins') for the modeling step
pd.to_pickle(agg_stats_df, "../pickled_tables/MODELING_DF.pkl")

In [27]:
# read the file back to check that the saved table round-trips
modeling_df = pd.read_pickle("../pickled_tables/MODELING_DF.pkl")

In [28]:
modeling_df

Unnamed: 0,Team,Year,Games Played Sum,At Bats Sum,Runs Sum,Hits Sum,Doubles Sum,Triples Sum,Home Runs Sum,Runs Batted In Sum,...,Mean Hits,Mean Doubles,Mean Triples,Mean Home Runs,Mean Runs Batted In,Mean Walks,Mean Strikeouts,Mean Stolen Bases,Mean Caught Stealing,% wins
0,NYM,2022,1845,6044,844,1539,295,29,201,824,...,64.125,12.2917,1.20833,8.375,34.3333,24.4167,57.25,2.66667,1.04167,62.35
1,ATL,2022,1705,5920,818,1469,311,12,246,784,...,69.9524,14.8095,0.571429,11.7143,37.3333,24.619,77.5238,4.33333,1.52381,62.35
2,LAD,2022,1661,5690,866,1439,324,31,221,832,...,79.9444,18,1.72222,12.2778,46.2222,35.3889,80.8333,5.55556,1,68.52
3,TOR,2022,1851,6133,840,1598,344,14,205,816,...,94,20.2353,0.823529,12.0588,48,31.7059,78.1765,4.94118,2.41176,56.79
4,NYY,2022,1729,5683,841,1422,239,12,249,792,...,74.8421,12.5789,0.631579,13.1053,41.6842,33.1053,71.6842,6.05263,1.94737,61.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,MIL,2017,1959,5779,777,1450,286,24,236,741,...,55.7692,11,0.923077,9.07692,28.5,22.4231,62.1923,4.76923,1.61538,53.09
146,TB,2017,1902,5911,753,1421,260,30,247,723,...,56.84,10.4,1.2,9.88,28.92,23.84,66.84,3.6,1.48,49.38
147,CHC,2017,2056,5870,865,1485,285,30,241,835,...,55,10.5556,1.11111,8.92593,30.9259,24.7407,56.6296,2.48148,1.37037,56.79
148,BAL,2017,1728,5824,763,1520,270,15,243,738,...,66.087,11.7391,0.652174,10.5652,32.087,17.5652,64.6522,1.6087,0.73913,46.30


In [29]:
list(modeling_df.columns)

['Team',
 'Year',
 'Games Played Sum',
 'At Bats Sum',
 'Runs Sum',
 'Hits Sum',
 'Doubles Sum',
 'Triples Sum',
 'Home Runs Sum',
 'Runs Batted In Sum',
 'Walks Sum',
 'Strikeouts Sum',
 'Stolen Bases Sum',
 'Caught Stealing Sum',
 'Mean Games Played',
 'Mean At Bats',
 'Mean Runs',
 'Mean Hits',
 'Mean Doubles',
 'Mean Triples',
 'Mean Home Runs',
 'Mean Runs Batted In',
 'Mean Walks',
 'Mean Strikeouts',
 'Mean Stolen Bases',
 'Mean Caught Stealing',
 '% wins']

In [30]:
modeling_df.shape

(150, 27)

In [7]:
games_2022

Unnamed: 0,Day,Month,Date,Away,Home,Win,W Score,Lose,L Score
0,Thursday,Apr,7,MIL,CHC,CHC,5,MIL,4
1,Thursday,Apr,7,CLE,KC,KC,3,CLE,1
2,Thursday,Apr,7,PIT,STL,STL,9,PIT,0
3,Thursday,Apr,7,NYM,WSH,NYM,5,WSH,1
4,Thursday,Apr,7,CIN,ATL,CIN,6,ATL,3
...,...,...,...,...,...,...,...,...,...
2426,Wednesday,Oct,5,PHI,HOU,HOU,3,PHI,2
2427,Wednesday,Oct,5,MIN,CWS,MIN,10,CWS,1
2428,Wednesday,Oct,5,ATL,MIA,MIA,12,ATL,9
2429,Wednesday,Oct,5,AZ,MIL,AZ,4,MIL,2


In [13]:
MLB_5_seasons[MLB_5_seasons["Year"]=="2022"]

Unnamed: 0,Team,Games Played,At Bats,Runs,Hits,Doubles,Triples,Home Runs,Runs Batted In,Walks,Strikeouts,Stolen Bases,Caught Stealing,Batting Average,On-Base Percentage,Slugging Percentage,On-Base Plus Slugging,Year,Player Name,Position
0,NYM,2,2,1,1,0,0,1,3,0,0,0,0,0.500,0.500,2.000,2.500,2022,Khalil Lee,CF
1,ATL,1,4,0,3,2,0,0,3,0,0,0,0,0.750,0.750,1.250,2.000,2022,Chadwick Tromp,C
2,LAD,4,13,6,6,2,0,1,3,2,7,0,0,0.462,0.563,0.846,1.409,2022,James Outman,LF
3,TOR,8,9,0,6,0,0,0,3,1,1,0,1,0.667,0.700,0.667,1.367,2022,Otto Lopez,SS
4,NYY,47,128,28,39,9,0,15,37,19,35,0,0,0.305,0.412,0.727,1.139,2022,Matt Carpenter,DH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
789,AZ,13,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2022,Luke Weaver,P
790,AZ,29,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2022,J.B. Wendelken,P
791,LAA,5,11,0,0,0,0,0,0,0,5,0,0,0.000,0.000,0.000,0.000,2022,Aaron Whitefield,RF
792,NYM,30,0,0,0,0,0,0,0,0,0,0,0,0.000,0.000,0.000,0.000,2022,Trevor Williams,P
