In [None]:
import pandas as pd
from pandas import DataFrame, Series

#### Reading .csv files to analyze data

In [None]:
# Load columns 0, 13 and 14 of the master table, indexed by playerID.
master_data = pd.read_csv(
    'data/master.csv',
    usecols=[0, 13, 14],
    index_col='playerID',
)
master_data.head()

In [None]:
# Load columns 3 and 4 of the salaries table, indexed by playerID.
salary_data = pd.read_csv(
    'data/salaries.csv',
    usecols=[3, 4],
    index_col='playerID',
)
salary_data.head()

In [None]:
# Load the selected batting columns (by position), indexed by playerID.
batting_data = pd.read_csv(
    'data/batting.csv',
    usecols=[0, 1, 5, 6, 8, 15, 18, 20],
    index_col='playerID',
)
batting_data.head()

In [None]:
# Replace every missing value in the batting table with 0.
batting_no_nan = batting_data.fillna(value=0)
batting_no_nan.head()

In [None]:
# Load columns 0 and 5 of the fielding table, indexed by playerID.
fielding_data = pd.read_csv(
    'data/fielding.csv',
    usecols=[0, 5],
    index_col='playerID',
)
fielding_data.head()

In [None]:
# Inner-join master and fielding rows that share the same playerID index.
name_position = pd.merge(
    master_data,
    fielding_data,
    how='inner',
    left_index=True,
    right_index=True,
)

In [None]:
# Inner-join the salary rows onto the name/position table by playerID index.
name_position_salary = pd.merge(
    name_position,
    salary_data,
    how='inner',
    left_index=True,
    right_index=True,
)


In [None]:
# Inner-join the NaN-free batting rows by playerID index to get the full table.
total_info = pd.merge(
    name_position_salary,
    batting_no_nan,
    how='inner',
    left_index=True,
    right_index=True,
)
total_info.head()

### OBP = (H + BB + HBP) / (AB + BB + HBP + SF)

H - Hits;    BB - Base on Balls;    HBP - Hit by Pitch;    AB - At Bats;    SF - Sacrifice flies

In [None]:
# OBP = (H + BB + HBP) / (AB + BB + HBP + SF), computed row by row.
total_info['OBP'] = (
    total_info['H'] + total_info['BB'] + total_info['HBP']
).div(
    total_info['AB'] + total_info['BB'] + total_info['HBP'] + total_info['SF']
)
total_info.head()

### Filtering DataFrame 
For the data analysis there's no need to have players with no salary and players whose OBP is either 1 or 0.

In [None]:
# Discard rows whose OBP is exactly 0.
total_info = total_info[total_info['OBP'].ne(0)]
total_info.head()

In [None]:
# Discard rows whose OBP is exactly 1.
total_info = total_info[~total_info['OBP'].eq(1)]
total_info.head()

In [None]:
# Discard rows with a salary of 0.
total_info = total_info[total_info['salary'].ne(0)]
total_info.head()

In [None]:
# Drop every row that still has a missing value in any column.
total_info = total_info.dropna(axis=0, how='any')
total_info.head()

##### We will consider players that played after 2014

In [None]:
# Keep only rows whose yearID is strictly greater than 2014.
total_info = total_info[total_info['yearID'] > 2014]
total_info.head()

In [None]:
# Order rows from highest to lowest OBP.
total_info = total_info.sort_values('OBP', ascending=False)
total_info.head()

#### The roster doesn't include Designated Hitter (DH) and Outfielder (OF) positions

In [None]:
# Remove rows whose position is 'DH' or 'OF' in a single pass.
total_info = total_info[~total_info['POS'].isin(['DH', 'OF'])]
total_info.head()

In [None]:
# Collapse rows that are identical across all columns, keeping the first one.
total_info = total_info.drop_duplicates(keep='first')
total_info.head()

In [None]:
# OBP obtained per unit of salary; higher means more OBP for the money.
total_info['OBP by salary'] = total_info['OBP'].div(total_info['salary'])
total_info.head()

In [None]:
# Order rows from highest to lowest OBP-per-salary ratio.
total_info = total_info.sort_values('OBP by salary', ascending=False)
total_info.head()

In [None]:
# Keep only the first row for each distinct salary value.
# NOTE(review): rows of different players that share a salary are collapsed
# too — confirm this is the intended way to remove repeated players.
total_info = total_info.drop_duplicates(subset='salary', keep='first')
total_info.head()

In [None]:
def roster(total_info, positions=None):
    """Pick the first listed player for each roster position.

    Rows are taken in the frame's current order, so the caller controls
    which player counts as "first" by sorting the frame beforehand.

    Args:
        total_info: DataFrame with a 'POS' column holding position codes.
        positions: Optional iterable of position codes to fill. When omitted,
            the nine positions '1B', '2B', '3B', 'RF', 'LF', 'CF', 'P', 'C'
            and 'SS' are used, in that order.

    Returns:
        A list with one DataFrame per requested position, in the requested
        order. Each frame holds at most one row and is empty when no row
        of ``total_info`` has that position.
    """
    if positions is None:
        positions = ('1B', '2B', '3B', 'RF', 'LF', 'CF', 'P', 'C', 'SS')
    # head(1) yields an empty frame rather than raising when a position is absent.
    return [total_info[total_info['POS'] == pos].head(1) for pos in positions]

In [None]:
# Take one row per roster position, then stack the picks into a single table.
player_list = roster(total_info)
final_roster = pd.concat(objs=player_list, axis=0)
final_roster

#### Below are the minimum salary and maximum OBP for each position

In [None]:
# Lowest salary among the remaining rows for each position.
total_info.groupby('POS')['salary'].min()

In [None]:
# Highest OBP among the remaining rows for each position.
total_info.groupby('POS')['OBP'].max()