In [46]:
all_rosters_path = "../../data/Rosters/all_rosters.csv"
teams_path = "../../data/teams.csv"
first_game_path = "../../data/first_game.csv"
bacon_numbers_path = "../../data/bacon_numbers.csv"
MAX_BACON_NUMBER = 15  # Actual max is 9 right now. 15 adds some padding.

In [47]:
import pandas as pd
from igraph import Graph

In [48]:
all_rosters = pd.read_csv(all_rosters_path, index_col=None)
all_rosters.fillna("?", inplace=True)
all_rosters["full_name"] = all_rosters["first"] + " " + all_rosters["last"]
# print(all_rosters)

In [49]:
all_teams = pd.read_csv(teams_path, index_col=None)
teams = all_teams.groupby("team_id").agg(
    min_year=pd.NamedAgg("year_id", "min"),
    max_year=pd.NamedAgg("year_id", "max"),
    full_name=pd.NamedAgg("name", "first")
)
teams["id"] = teams.index
teams["type"] = "team"
# print(all_teams)

In [50]:
team_names = all_teams.loc[:, ["name", "team_id"]]
team_names = team_names.groupby("team_id").agg(team_name=pd.NamedAgg("name", "first"))

In [51]:
all_rosters["team_id"] = all_rosters["team_id"] + all_rosters["year"].map(str)
all_rosters = all_rosters.merge(team_names, how="left", on="team_id")
all_rosters["team_name"] = all_rosters["year"].map(str) + (" " + all_rosters["team_name"])
# print(all_rosters)

        year player_id     last     first bats throws  team_id position  \
0       1884  berrc102    Berry   Charlie    R      R  ALT1884       2B   
1       1884  browj106    Brown       Jim    ?      ?  ALT1884       OF   
2       1884  carrp101  Carroll       Pat    ?      ?  ALT1884        C   
3       1884  connj102  Connors       Joe    ?      ?  ALT1884       3B   
4       1884  crosc101    Cross  Clarence    ?      ?  ALT1884       3B   
...      ...       ...      ...       ...  ...    ...      ...      ...   
110425  1884  wardj104     Ward      John    ?      ?  WSU1884       OF   
110426  1884  whitw102    White    Warren    ?      ?  WSU1884       3B   
110427  1884  wilej101    Wiley    Joseph    ?      ?  WSU1884       3B   
110428  1884  wiseb101     Wise      Bill    ?      ?  WSU1884        P   
110429  1884  yewee101   Yewell        Ed    ?      ?  WSU1884       3B   

             full_name                   team_name  
0        Charlie Berry  1884 Altoona Mountain 

In [52]:
players = all_rosters.groupby("player_id").agg(
    min_year=pd.NamedAgg("year", "min"),
    max_year=pd.NamedAgg("year", "max"),
    full_name=pd.NamedAgg("full_name", "first")
)
players["id"] = players.index
players["type"] = "player"
# print(players)

In [53]:
first_game_vertex = pd.DataFrame(
    {
        'id': 'first_game',
        'min_year': 1871,
        'max_year': 1871,
        'full_name': 'The First Professional Baseball Game',
        'type': 'metadata'
    },
    index=[list(range(1))]
)
# print(first_game_vertex)

In [54]:
vertices = pd.concat([first_game_vertex, teams, players], ignore_index=True)
# print(vertices)

In [55]:
first_game = pd.read_csv(first_game_path, index_col=None)
# print(first_game)

In [56]:
played_for_team = all_rosters.loc[:, ["year", "player_id", "team_id", "team_name"]]
played_for_team.rename(columns={"player_id":"src", "team_id":"dst"}, inplace=True)
edges = pd.concat([first_game, played_for_team.loc[:, ["src", "dst"]]], ignore_index=True)
# print(played_for_team)

In [57]:
g = Graph.DataFrame(edges, directed=False, vertices=vertices, use_vids=False)
bacon_numbers = g.distances("first_game")

In [58]:
vertices["distance"] = bacon_numbers[0]
player_distances = vertices[(vertices['type'] == 'player')]
player_distances["bacon_number"] = ((player_distances["distance"] - 1) / 2).map(int)
player_bacon_numbers = player_distances.loc[:, ["id", "full_name", "min_year", "max_year", "bacon_number"]]
# print(player_bacon_numbers.sort_values(by="id"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_distances["bacon_number"] = ((player_distances["distance"] - 1) / 2).map(int)


In [59]:
bacon_numbers_and_played_for = player_bacon_numbers.merge(
    played_for_team, "inner",
    left_on="id", right_on="src"
).loc[:, ["id", "full_name", "min_year", "max_year", "bacon_number", "dst", "team_name"]]
# print(bacon_numbers_and_played_for)

In [60]:
player_bacon_paths = bacon_numbers_and_played_for.merge(
    bacon_numbers_and_played_for,
    how="inner",
    on="dst"
)
player_bacon_paths = player_bacon_paths[(player_bacon_paths["id_x"] != player_bacon_paths["id_y"])]
player_bacon_paths = player_bacon_paths[((player_bacon_paths["bacon_number_x"] > player_bacon_paths["bacon_number_y"])
                                         | (player_bacon_paths["bacon_number_x"] == 0))]
player_bacon_paths = player_bacon_paths.groupby("id_x").agg(
    player=pd.NamedAgg("full_name_x", "first"),
    min_year=pd.NamedAgg("min_year_x", "first"),
    max_year=pd.NamedAgg("max_year_x", "first"),
    bacon_number=pd.NamedAgg("bacon_number_x", "first"),
    team=pd.NamedAgg("team_name_x", "first"),
    teammate=pd.NamedAgg("full_name_y", "first"),
    teammate_id=pd.NamedAgg("id_y", "first")
)
player_bacon_paths.loc[player_bacon_paths["bacon_number"] == 0, "teammate_id"] = ""
player_bacon_paths.loc[player_bacon_paths["bacon_number"] == 0, "teammate"] = ""
player_bacon_paths.loc[player_bacon_paths["bacon_number"] == 0, "team"] = ""

teammates = player_bacon_paths.loc[:, ["team", "teammate", "teammate_id"]]
teammates["player_id"] = teammates.index
teammates = teammates.reset_index(drop=True)
teammates.index.names = ["row"]

player_bacon_paths.index.names = ["id"]
player_bacon_paths.rename(columns={"team": "team_1", "teammate": "teammate_1", "teammate_id": "teammate_id_1"}, inplace=True)
# print(player_bacon_paths)

In [61]:
for i in range(1, MAX_BACON_NUMBER):
    player_bacon_paths = player_bacon_paths.merge(
        teammates,
        how="left",
        left_on=f"teammate_id_{i}",
        right_on="player_id"
    )
    player_bacon_paths.rename(columns={
        "teammate": f"teammate_{i + 1}",
        "team": f"team_{i + 1}",
        "teammate_id": f"teammate_id_{i + 1}"
    }, inplace=True)
    player_bacon_paths.drop(columns=["player_id", f"teammate_id_{i}"], inplace=True)
# print(player_bacon_paths)

In [62]:
player_bacon_paths.dropna(axis='columns', how='all').to_csv(bacon_numbers_path, index=False)