### Similarity-Based Player Selection per Tactical Line Using Cosine Matching and Contribution Features


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_best_players_per_line(
    player_file: str,
    team_file: str,
    needs_file: str,
    target_squad: str,
    max_age: int = 30,
    top_n: int = 10
):
    """Rank external candidates per tactical line by similarity to the target squad.

    For each line (DEF, MID, ATT) the target squad's players on that line are
    averaged into one profile vector over that line's contribution features.
    Players from every other squad on the same line are then scored by cosine
    similarity against that profile and the ``top_n`` best are kept.

    Args:
        player_file: CSV of players. The columns ``Squad``, ``Age`` and ``Pos``
            are read for filtering; contribution columns that are missing are
            treated as 0.
        team_file: Unused. Kept so existing callers do not break.
        needs_file: Unused. Kept so existing callers do not break.
        target_squad: Value of ``Squad`` identifying the team to build for.
        max_age: Players older than this are excluded from both the squad
            profile and the candidate pool.
        top_n: Maximum number of candidates returned per line.

    Returns:
        A dict keyed by ``"DEF"``, ``"MID"`` and ``"ATT"``. Each value is a
        DataFrame with the columns ``Player, Age, Squad, Pos, MarketValue,
        Similarity, Role`` followed by every contribution feature, sorted by
        ``Similarity`` descending. The frame is empty (same columns) when the
        target squad or the candidate pool has no eligible player on the line.
    """
    # === Load data ===
    # Only the player file feeds the computation; team_file / needs_file were
    # previously loaded and never used, so they are no longer read.
    players_df = pd.read_csv(player_file)
    players_df.columns = players_df.columns.str.strip()

    # Every contribution feature that may appear in the output, for any role.
    all_contribution_features = [
        "Tkl_Contribution", "TklW_Contribution", "Int_Contribution", "Blocks_Contribution",
        "Clr_Contribution", "Def 3rd_Contribution", "Mid 3rd_Contribution", "Att 3rd_Contribution",
        "PPA_Contribution", "PrgP_Contribution", "xAG_Contribution", "Ast_Contribution",
        "KP_Contribution", "Sh_Contribution", "SoT_Contribution", "Gls_Contribution",
        "xG_Contribution"
    ]

    # === Feature -> line mapping (decides which features drive each line's similarity)
    feature_type_map = {
        "Tkl_Contribution": "def", "TklW_Contribution": "def", "Int_Contribution": "def",
        "Blocks_Contribution": "def", "Clr_Contribution": "def", "Def 3rd_Contribution": "def",
        "Mid 3rd_Contribution": "def", "Att 3rd_Contribution": "def",
        "PPA_Contribution": "mid", "PrgP_Contribution": "mid",
        "xAG_Contribution": "att", "Ast_Contribution": "att", "KP_Contribution": "att",
        "Sh_Contribution": "att", "SoT_Contribution": "att", "Gls_Contribution": "att",
        "xG_Contribution": "att"
    }

    # === Line label -> substring looked for in the "Pos" column
    line_position_keyword = {
        "DEF": "DF",
        "MID": "MF",
        "ATT": "FW"
    }

    # === Features grouped by line
    line_features = {
        "DEF": [f for f, l in feature_type_map.items() if l == "def"],
        "MID": [f for f, l in feature_type_map.items() if l == "mid"],
        "ATT": [f for f, l in feature_type_map.items() if l == "att"]
    }

    # One schema for every return path, so downstream concat / dedup on "Role"
    # sees the same columns whether or not a line produced candidates.
    output_cols = [
        "Player", "Age", "Squad", "Pos", "MarketValue", "Similarity", "Role"
    ] + all_contribution_features

    # Coerce contribution columns once up front: absent columns become 0.0 and
    # unparseable / missing cells become 0, for squad and candidates alike.
    for f in all_contribution_features:
        if f not in players_df.columns:
            players_df[f] = 0.0
        else:
            players_df[f] = pd.to_numeric(players_df[f], errors='coerce').fillna(0)

    # Line-independent row filters, computed once.
    in_target_squad = players_df["Squad"] == target_squad
    within_age = players_df["Age"] <= max_age

    top_players_per_line = {}

    for line, features in line_features.items():
        pos_keyword = line_position_keyword[line]
        eligible = within_age & players_df["Pos"].str.contains(pos_keyword, na=False)

        team_players = players_df[eligible & in_target_squad]
        candidates_df = players_df[eligible & ~in_target_squad].copy()

        if team_players.empty or candidates_df.empty:
            # Nothing to compare: return an empty frame with the full schema.
            top_players_per_line[line] = pd.DataFrame(columns=output_cols)
            continue

        # Squad profile = mean of the line's features over the squad's players.
        team_vector = team_players[features].mean().values.reshape(1, -1)
        feature_matrix = candidates_df[features].values

        # With an all-zero profile or an all-zero candidate matrix there is no
        # direction to compare, so every candidate scores 0.
        if np.linalg.norm(team_vector) == 0 or np.linalg.norm(feature_matrix, axis=1).sum() == 0:
            similarity_scores = np.zeros(len(candidates_df))
        else:
            similarity_scores = cosine_similarity(feature_matrix, team_vector).flatten()

        candidates_df["Similarity"] = similarity_scores

        # Copy the slice so the column back-fill below writes to an independent frame.
        top_candidates = (
            candidates_df.sort_values(by="Similarity", ascending=False).head(top_n).copy()
        )

        # Any metadata column absent from the input file is filled with 0.0 so
        # the column selection below cannot fail.
        for col in output_cols:
            if col not in top_candidates.columns:
                top_candidates[col] = 0.0

        top_players_per_line[line] = top_candidates[output_cols].copy()

    return top_players_per_line

### Role-Based Tactical Needs Ranking Using Cosine Similarity Between Team Deficiencies and Role Profiles


In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def rank_team_role_needs(
    target_team: str,
    needs_file: str = "teams_needs_2023_2024.csv",
    roles_files: dict = None,
    needs_rename_map: dict = None,
    top_n: int = 10
) -> pd.DataFrame:
    """Rank tactical roles by how closely each role profile matches a team's needs.

    The team's need vector is compared with every role vector of a position
    using cosine similarity, and the result is scaled by the team's overall
    need score for that position (``Team_Def_Score`` / ``Team_Mid_Score`` /
    ``Team_Att_Score``).

    Args:
        target_team: Value of ``Squad`` in the needs file.
        needs_file: CSV of team needs. Column names are stripped of
            surrounding whitespace and then renamed through ``needs_rename_map``.
        roles_files: Mapping ``position -> (role CSV path, feature list)``.
            Defaults to the DEF / MID / ATT role-vector files. Each role file
            must hold a ``Role`` column and the listed features.
        needs_rename_map: Mapping from needs-file column names to the
            ``*_Contribution`` names used by the role files.
        top_n: Number of top-ranked roles to return.

    Returns:
        DataFrame with the columns ``Role, Position, Cosine_Similarity,
        Weighted_Similarity``, sorted by ``Weighted_Similarity`` descending
        and limited to ``top_n`` rows. Empty (same columns) when
        ``roles_files`` is empty.

    Raises:
        ValueError: If ``target_team`` is not in the needs file, or a feature
            required for a position is missing from the needs file.
    """
    if roles_files is None:
        roles_files = {
            "DEF": ("role_vectors_def.csv", [
                "Tkl_Contribution", "TklW_Contribution", "Int_Contribution", "Blocks_Contribution", "Clr_Contribution",
                "Def 3rd_Contribution", "Mid 3rd_Contribution", "Att 3rd_Contribution", "xAG_Contribution",
                "KP_Contribution", "PPA_Contribution", "PrgP_Contribution"
            ]),
            "MID": ("role_vectors_mf.csv", [
                "xAG_Contribution", "Ast_Contribution", "KP_Contribution", "PPA_Contribution", "PrgP_Contribution",
                "Tkl_Contribution", "TklW_Contribution", "Int_Contribution", "Mid 3rd_Contribution", "Att 3rd_Contribution"
            ]),
            "ATT": ("role_vectors_fw.csv", [
                "Gls_Contribution", "Sh_Contribution", "SoT_Contribution", "xG_Contribution", "xAG_Contribution",
                "Ast_Contribution", "KP_Contribution", "PPA_Contribution", "PrgP_Contribution",
                "Att 3rd_Contribution", "Mid 3rd_Contribution"
            ])
        }

    if needs_rename_map is None:
        needs_rename_map = {
            "standard_Gls": "Gls_Contribution", "shooting_Sh": "Sh_Contribution", "shooting_SoT": "SoT_Contribution",
            "shooting_xG": "xG_Contribution", "passing_xAG": "xAG_Contribution", "passing_Ast": "Ast_Contribution",
            "passing_KP": "KP_Contribution", "passing_PPA": "PPA_Contribution", "passing_PrgP": "PrgP_Contribution",
            "defense_Blocks": "Blocks_Contribution", "defense_TklW": "TklW_Contribution", "defense_Def 3rd": "Def 3rd_Contribution",
            "defense_Mid 3rd": "Mid 3rd_Contribution", "defense_Att 3rd": "Att 3rd_Contribution",
            "defense_Tkl": "Tkl_Contribution", "defense_Int": "Int_Contribution", "defense_Clr": "Clr_Contribution"
        }

    result_cols = ["Role", "Position", "Cosine_Similarity", "Weighted_Similarity"]

    # Load team needs; strip header whitespace first so both the rename map
    # and the "Squad" lookup match padded column names.
    team_needs_df = pd.read_csv(needs_file)
    team_needs_df.columns = team_needs_df.columns.str.strip()
    team_needs_df = team_needs_df.rename(columns=needs_rename_map)

    matching_rows = team_needs_df[team_needs_df["Squad"] == target_team]
    if matching_rows.empty:
        raise ValueError(f"Team '{target_team}' not found in team needs file.")
    # Keep exactly one row (still 2-D): duplicated squad rows would otherwise
    # be flattened into one over-long vector by reshape(1, -1) below.
    team_row = matching_rows.iloc[[0]]

    # Overall need per line, used to weight the cosine similarities.
    total_needs = {
        "DEF": float(team_row["Team_Def_Score"].iloc[0]),
        "MID": float(team_row["Team_Mid_Score"].iloc[0]),
        "ATT": float(team_row["Team_Att_Score"].iloc[0])
    }

    all_results = []
    for position, (file, features) in roles_files.items():
        role_df = pd.read_csv(file)
        role_df.columns = role_df.columns.str.strip()

        missing_feats = [f for f in features if f not in team_row.columns]
        if missing_feats:
            raise ValueError(f"Missing features in team needs for {position}: {missing_feats}")

        # cosine_similarity rejects NaN input; treat unparseable or missing
        # cells as zero contribution instead of failing the whole ranking.
        team_vector = (
            team_row[features].apply(pd.to_numeric, errors="coerce").fillna(0)
            .to_numpy().reshape(1, -1)
        )
        role_vectors = (
            role_df[features].apply(pd.to_numeric, errors="coerce").fillna(0).to_numpy()
        )

        cos_sims = cosine_similarity(team_vector, role_vectors)[0]
        weighted_sims = cos_sims * total_needs[position]

        result_df = role_df.copy()
        result_df["Cosine_Similarity"] = cos_sims
        result_df["Weighted_Similarity"] = weighted_sims
        result_df["Position"] = position

        all_results.append(result_df[result_cols])

    if not all_results:
        # pd.concat raises on an empty list; return an empty ranking instead.
        return pd.DataFrame(columns=result_cols)

    final_df = pd.concat(all_results, ignore_index=True)
    final_df_sorted = final_df.sort_values(by="Weighted_Similarity", ascending=False)

    # Return a copy so callers can add columns without SettingWithCopyWarning.
    return final_df_sorted.head(top_n).copy()


### Optimization Setup: Building the Player Pool


In [None]:
import pandas as pd
import numpy as np
from pyomo.environ import *
from sklearn.metrics.pairwise import cosine_similarity

# --- Run configuration shared by the later cells ---

# Input data files
player_file = "players_with_roles_all_unique.csv"
team_file = "teams_2023_2024.csv"
needs_file = "teams_needs_2023_2024.csv"

# Recruitment scenario
target_squad = "Almería"      # squad the recommendations are built for
max_age = 30                  # upper age bound applied to the player filter
max_players = 5               # number of players the optimizer must select
budget = 40_000_000           # cap on the summed MarketValue of selected players
top_n_candidates = 200        # candidates kept per tactical line

# --- Candidate pool: one DataFrame per tactical line ---
player_pool = get_best_players_per_line(
    player_file,
    team_file,
    needs_file,
    target_squad,
    max_age=max_age,
    top_n=top_n_candidates,
)


### Step 2: Identify Most Needed Roles and Allocate Recruitment Quotas

This step ranks the most tactically needed player roles for the target squad (`Almería`) using cosine similarity between team needs and predefined role vectors. Based on this ranking, the total number of players to recruit (`max_players`) is proportionally allocated across roles. Quotas are determined by normalizing similarity weights, assigning integer portions first, and distributing the remaining players to roles with the largest fractional needs.


In [None]:
# --- Step 2: Get Ranked Role Needs and Allocate Quotas ---

# Rank the roles the squad needs most; asking for max_players roles keeps the
# number of ranked roles aligned with the number of signings.
ranked_roles_df = rank_team_role_needs(
    target_squad,
    needs_file=needs_file,
    top_n=max_players,
)

# Rescale the weighted similarities so they sum to 1 (proportional shares).
weight_total = ranked_roles_df["Weighted_Similarity"].sum()
ranked_roles_df["Normalized_Weight"] = ranked_roles_df["Weighted_Similarity"] / weight_total

# Largest-remainder allocation, pass 1: every role gets the whole-number part
# of its proportional share.
role_allocations = {}
remaining_players = max_players
for role, weight in zip(ranked_roles_df["Role"], ranked_roles_df["Normalized_Weight"]):
    allocated = int(np.floor(weight * max_players))
    role_allocations[role] = allocated
    remaining_players -= allocated

# Pass 2: the leftover slots go, one each, to the roles with the largest
# fractional part of their share.
ranked_roles_df["Fractional_Extra"] = (ranked_roles_df["Normalized_Weight"] * max_players) % 1
extra_roles = ranked_roles_df.sort_values(by="Fractional_Extra", ascending=False)

bonus_roles = extra_roles["Role"].tolist()[:max(remaining_players, 0)]
for role in bonus_roles:
    role_allocations[role] += 1
    remaining_players -= 1

# Min and max quotas coincide: each role must receive exactly its allocation.
min_role_quota = dict(role_allocations)
max_role_quota = dict(role_allocations)

# --- Output ---
print("Role-Based Quotas based on Team Needs:")
for role, quota_min in min_role_quota.items():
    print(f"{role}: min = {quota_min}, max = {max_role_quota[role]}")


Role-Based Quotas based on Team Needs:
Progressive Defender: min = 1, max = 1
Traditional Balanced Defender: min = 1, max = 1
Ball Winning Defender: min = 1, max = 1
Playmaker Midfielder: min = 1, max = 1
Box-to-Box Midfielder: min = 1, max = 1


### Step 3: Group Features by Tactical Line and Structure Role Quotas

This step organizes contribution features into tactical lines (Defense, Midfield, Attack) based on a predefined mapping (`feature_type_map`). Each line is also linked to a position keyword (e.g., `"DF"` for defenders). These groupings are used for both role identification and optimization.

Additionally, the role quotas computed earlier (`min_role_quota` and `max_role_quota`) are compiled into a reference table, `role_allocations_df`, which includes each role’s tactical line, weighted importance, normalized weight, and the assigned minimum and maximum player counts. This structured format helps validate the alignment between team needs and recruitment targets.


In [None]:
# --- Feature to Line Map (used for grouping features) ---
# Built from three ordered groups; key order matches the feature listing order.
feature_type_map = {
    **dict.fromkeys(
        [
            "Tkl_Contribution", "TklW_Contribution", "Int_Contribution",
            "Blocks_Contribution", "Clr_Contribution", "Def 3rd_Contribution",
            "Mid 3rd_Contribution", "Att 3rd_Contribution",
        ],
        "def",
    ),
    **dict.fromkeys(["PPA_Contribution", "PrgP_Contribution"], "mid"),
    **dict.fromkeys(
        [
            "xAG_Contribution", "Ast_Contribution", "KP_Contribution",
            "Sh_Contribution", "SoT_Contribution", "Gls_Contribution",
            "xG_Contribution",
        ],
        "att",
    ),
}

# --- Line → Position Keywords ---
line_position_keyword = dict(DEF="DF", MID="MF", ATT="FW")

# --- Features Grouped by Line ---
# Each line label, lower-cased, is the tag used in feature_type_map.
line_features = {
    line: [feat for feat, tag in feature_type_map.items() if tag == line.lower()]
    for line in line_position_keyword
}

# --- Role-Based Quotas (from previous step) ---
# min_role_quota / max_role_quota are dictionaries {role: player count}
# computed in Step 2 and reused here unchanged.

# --- Reference table: one row per ranked role with its weights and quotas ---
role_allocations_df = ranked_roles_df[
    ["Role", "Position", "Weighted_Similarity", "Normalized_Weight"]
].assign(
    Min_Quota=lambda table: table["Role"].map(min_role_quota),
    Max_Quota=lambda table: table["Role"].map(max_role_quota),
)

# Preview
print("\nRanked Role Allocation Table:")
print(role_allocations_df)



Ranked Role Allocation Table:
                            Role Position  Weighted_Similarity  \
2           Progressive Defender      DEF            88.952773   
0  Traditional Balanced Defender      DEF            86.059104   
1          Ball Winning Defender      DEF            80.961219   
5           Playmaker Midfielder      MID            80.896506   
3          Box-to-Box Midfielder      MID            80.523710   

   Normalized_Weight  Min_Quota  Max_Quota  
2           0.213115          1          1  
0           0.206182          1          1  
1           0.193969          1          1  
5           0.193814          1          1  
3           0.192920          1          1  


### Step 4: Pyomo Optimization Model for Role-Aware Player Selection

This section formulates a Pyomo-based optimization model to select players that best fit a team's tactical needs while adhering to role quotas, budget limits, and positional diversity.

- **Decision Variables:** Binary variable `x[i]` indicates whether player `i` is selected.
- **Objective Function:** Maximizes the total weighted contribution of selected players, adjusted by:
  - Player's tactical role importance (`n_j`)
  - Similarity between the player and the target squad's profile on the player's tactical line (used as a proxy for team strength)
  - Line-specific need scores (`N_l`)
  - Contribution features per tactical line
- **Constraints:**
  - Squad size must equal the specified maximum number of players.
  - Total cost must not exceed the transfer budget.
  - Role-specific quotas must be met (both minimum and maximum per tactical role).

The model inputs include:
- A merged and filtered player pool
- Role-based tactical needs (`ranked_roles_df`)
- Feature contributions per player
- Budget and age restrictions


In [None]:
from pyomo.environ import *

# --- Combine all players into a single DataFrame for modeling ---
# A player listed on several lines with the same role is kept once (first occurrence).
all_players_df = pd.concat(player_pool.values(), ignore_index=True)
all_players_df = all_players_df.drop_duplicates(subset=["Player", "Squad", "Role"]).reset_index(drop=True)
all_players_df["ID"] = all_players_df.index.astype(str)  # unique string ID per player

# --- Pyomo Model ---
model = ConcreteModel()

# === Sets ===
P = all_players_df["ID"].tolist()
R = list(min_role_quota.keys())  # Specific tactical roles
F = [f for f in feature_type_map if f in all_players_df.columns]  # Contribution features

model.P = Set(initialize=P)
model.R = Set(initialize=R)
model.F = Set(initialize=F)

# === Parameters ===
# Lookup dictionaries keyed by player ID
cost = dict(zip(all_players_df["ID"], all_players_df["MarketValue"]))
role = dict(zip(all_players_df["ID"], all_players_df["Role"]))


def _line_for_position(pos):
    """Return the first line whose keyword occurs in `pos`, else "UNKNOWN"."""
    # A non-string Pos (e.g. NaN) cannot be searched with `in`; treat it as unknown.
    if not isinstance(pos, str):
        return "UNKNOWN"
    return next((line for line, kw in line_position_keyword.items() if kw in pos), "UNKNOWN")


broad_line = dict(zip(all_players_df["ID"], all_players_df["Pos"].map(_line_for_position)))
team_strength = dict(zip(all_players_df["ID"], all_players_df["Similarity"]))  # Use similarity as proxy for source team strength

# Feature contributions: f_ij[player_id][feature_name]
# Built in one pass instead of one boolean-mask scan per (player, feature) pair.
f_ij = all_players_df.set_index("ID")[F].to_dict(orient="index")

# Role importance: normalized need weight of each ranked role
n_j = ranked_roles_df.set_index("Role")["Normalized_Weight"].to_dict()

# Broad line team need scores (optional, fallback = 1 if not found)
N_l = {"DEF": 1, "MID": 1, "ATT": 1}  # Customize if you have line-wise team scores

# Features relevant to each line, resolved once rather than per player.
feats_by_line = {
    line: [j for j in F if feature_type_map[j].upper() == line]
    for line in line_position_keyword
}

# Players available for each quota role.
players_by_role = {r: [i for i in P if role[i] == r] for r in R}

# A positive quota with no candidate can never be met. Without this check the
# quota rule would return a plain Python bool, which Pyomo rejects with an
# error that does not name the underlying cause.
unfillable_roles = [r for r in R if min_role_quota[r] > 0 and not players_by_role[r]]
if unfillable_roles:
    raise ValueError(f"No candidate players available for required roles: {unfillable_roles}")

# NOTE(review): a missing MarketValue would put NaN into the budget constraint — confirm the pool has no gaps.

# === Decision Variables ===
model.x = Var(model.P, domain=Binary)  # x[i] = 1 when player i is selected

# === Objective Function ===
def objective_rule(m):
    """Sum over players of x * similarity * line need * mean weighted line contribution."""
    total = 0
    for i in m.P:
        l_i = broad_line[i]
        relevant_feats = feats_by_line.get(l_i, [])
        if not relevant_feats:
            # Unknown line, or no feature column for this line: contributes nothing.
            continue
        role_weight = n_j.get(role[i], 0)  # roles outside the ranking weigh 0
        contribution_sum = sum(f_ij[i][j] * role_weight for j in relevant_feats)
        total += m.x[i] * team_strength[i] * N_l.get(l_i, 1) * (contribution_sum / len(relevant_feats))
    return total

model.objective = Objective(rule=objective_rule, sense=maximize)

# === Constraints ===

# Squad size: exactly max_players are selected
model.squad_size = Constraint(expr=sum(model.x[i] for i in P) == max_players)

# Budget constraint
model.budget = Constraint(expr=sum(model.x[i] * cost[i] for i in P) <= budget)

# Minimum quota per role
def min_role_constr(m, r):
    members = players_by_role[r]
    if not members:
        # Only reachable with a zero quota (checked above): nothing to constrain.
        return Constraint.Skip
    return sum(m.x[i] for i in members) >= min_role_quota[r]
model.min_quota = Constraint(model.R, rule=min_role_constr)

# Maximum quota per role
def max_role_constr(m, r):
    members = players_by_role[r]
    if not members:
        # No variables for this role, so the upper bound holds trivially.
        return Constraint.Skip
    return sum(m.x[i] for i in members) <= max_role_quota[r]
model.max_quota = Constraint(model.R, rule=max_role_constr)


In [None]:
# Solve the selection model with the Gurobi backend; tee=True streams the
# solver log into the cell output.
# NOTE(review): `results` is not inspected afterwards — consider checking the
# termination condition before reading variable values in the next step.
solver = SolverFactory("gurobi")
results = solver.solve(model, tee=True)


Set parameter Username
Academic license - for non-commercial use only - expires 2025-10-30
Read LP format model from file C:\Users\aliha\AppData\Local\Temp\tmp9d7xg9pn.pyomo.lp
Reading time = 0.01 seconds
x1: 12 rows, 582 columns, 1979 nonzeros
Gurobi Optimizer version 11.0.3 build v11.0.3rc0 (win64 - Windows 11+.0 (26100.2))

CPU model: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz, instruction set [SSE2|AVX|AVX2|AVX512]
Thread count: 4 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 12 rows, 582 columns and 1979 nonzeros
Model fingerprint: 0xe69fee38
Variable types: 0 continuous, 582 integer (582 binary)
Coefficient statistics:
  Matrix range     [1e+00, 2e+08]
  Objective range  [4e+00, 2e+01]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 4e+07]
         Consider reformulating model or setting NumericFocus parameter
         to avoid numerical issues.
Found heuristic solution: objective 26.9967304
Presolve removed 7 rows and 553 col

### Step 5: Extracting and Displaying the Optimized Player Selection

After solving the optimization model, this step identifies and presents the selected players based on the optimized decision variables:

- **Output Includes:**
  - Player name
  - Assigned tactical role
  - Position
  - Age
  - Market value
  - Current squad
- **Summary Statistics:**
  - Total market value of selected players
  - Final value of the optimization objective (reflecting fit and contribution)

This provides a clear view of the recommended transfers aligned with team needs and budget constraints.


In [None]:
# Read back the chosen players. Solvers report binary variables as floats that
# may sit within tolerance of 1 (e.g. 0.9999999), so threshold at 0.5 instead
# of testing exact equality with 1, which could silently drop a selected player.
selected_ids = [i for i in P if value(model.x[i]) > 0.5]
selected_players = all_players_df[all_players_df["ID"].isin(selected_ids)]

print("✅ Selected Players:")
print(selected_players[["Player", "Role", "Pos", "Age", "MarketValue", "Squad"]])

# Summary: total spend and the objective value of the solved model.
print(f"\nTotal Market Value: €{selected_players['MarketValue'].sum():,.2f}")
print(f"Objective Score: {value(model.objective):.4f}")


✅ Selected Players:
                Player                           Role Pos   Age  MarketValue  \
84          David Raum           Progressive Defender  DF  25.0   20000000.0   
128       Jorge Cuenca          Ball Winning Defender  DF  23.0    6000000.0   
173  Ricardo Rodríguez  Traditional Balanced Defender  DF  30.0    3500000.0   
208     Julien Ponceau          Box-to-Box Midfielder  MF  22.0    3500000.0   
366       Kevin Stöger           Playmaker Midfielder  MF  29.0    5000000.0   

          Squad  
84   RB Leipzig  
128  Villarreal  
173      Torino  
208     Lorient  
366      Bochum  

Total Market Value: €38,000,000.00
Objective Score: 59.7387
