In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load 2024-25 GW1 data
df_new = pd.read_csv("players_raw_24-25_postgw20.csv")

# Select relevant features (same as before)
numeric_columns = ['now_cost', 'expected_goals', 'expected_assists', 'total_points', 'form', 'minutes', 'selected_by_percent']

# Ensure all selected columns are numeric and check for coercion
df_new[numeric_columns] = df_new[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Check how many NaN values exist in each column after coercion
print("Missing values per column:")
print(df_new[numeric_columns].isnull().sum())

# Inspect rows that have NaN values
print("\nRows with missing values:")
print(df_new[df_new[numeric_columns].isnull().any(axis=1)])

# Drop rows with missing values after conversion
df_new = df_new.dropna(subset=numeric_columns)

# Check if we still have data left
print("\nData shape after dropping rows with missing values:")
print(df_new[numeric_columns].shape)

# Scale the new data using the same scaler if there are rows left
if df_new.shape[0] > 0:
    scaler = StandardScaler()
    df_new[numeric_columns] = scaler.fit_transform(df_new[numeric_columns])
    print("Scaling complete!")
else:
    print("No data available after dropping missing values.")


Missing values per column:
now_cost               0
expected_goals         0
expected_assists       0
total_points           0
form                   0
minutes                0
selected_by_percent    0
dtype: int64

Rows with missing values:
Empty DataFrame
Columns: [assists, bonus, bps, can_select, can_transact, chance_of_playing_next_round, chance_of_playing_this_round, clean_sheets, clean_sheets_per_90, code, corners_and_indirect_freekicks_order, corners_and_indirect_freekicks_text, cost_change_event, cost_change_event_fall, cost_change_start, cost_change_start_fall, creativity, creativity_rank, creativity_rank_type, direct_freekicks_order, direct_freekicks_text, dreamteam_count, element_type, ep_next, ep_this, event_points, expected_assists, expected_assists_per_90, expected_goal_involvements, expected_goal_involvements_per_90, expected_goals, expected_goals_conceded, expected_goals_conceded_per_90, expected_goals_per_90, first_name, form, form_rank, form_rank_type, goals_conceded,

In [2]:
from sklearn.cluster import KMeans

# Apply K-Means model (use the same number of clusters as before)
kmeans = KMeans(n_clusters=4, random_state=42)  
df_new['cluster'] = kmeans.fit_predict(df_new[numeric_columns])

# Extract differentials (low-ownership, high potential)
differentials_new = df_new[(df_new['cluster'] == 0) & (df_new['selected_by_percent'] < 10)]

print("\nTop Differential Players (K-Means Clustering):")
print(differentials_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (K-Means Clustering):
      web_name  selected_by_percent  expected_goals  expected_assists  \
5     J.Timber             0.590807       -0.073327          0.417750   
15      Thomas            -0.214984        0.309101          0.642094   
19       White             0.015242       -0.404384         -0.244598   
23   Calafiori            -0.132761       -0.153238          0.001112   
25      Merino            -0.330097        0.697236          0.118625   
..         ...                  ...             ...               ...   
697   N.Semedo            -0.297208       -0.158946          0.332286   
700    R.Gomes            -0.330097       -0.267395         -0.127085   
701    S.Bueno            -0.330097       -0.221732         -0.500991   
704       Toti            -0.313652       -0.461463         -0.191183   
706      André            -0.346542       -0.438632         -0.415527   

         form  total_points  
5    1.179737      1.472032  
15   0.887107  

In [3]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest model
model = IsolationForest(contamination=0.1)  
df_new['anomaly'] = model.fit_predict(df_new[numeric_columns])

# Extract anomalies (overperforming differentials)
differentials_anomalies_new = df_new[df_new['anomaly'] == -1]

print("\nTop Differential Players (Isolation Forest):")
print(differentials_anomalies_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (Isolation Forest):
        web_name  selected_by_percent  expected_goals  expected_assists  \
1        G.Jesus             1.166373        1.262316         -0.030937   
2        Gabriel             4.422428        0.902720          0.396384   
3        Havertz             1.067704        3.956435          0.845072   
10      Ødegaard             0.689476        0.982630          2.159086   
11          Raya             4.701989       -0.478587         -0.575772   
..           ...                  ...             ...               ...   
617  Pedro Porro             3.583747        0.252022          1.902693   
624          Son             0.557918        2.192701          3.173975   
644        Bowen             0.261913        2.238364          4.028618   
649    Fabianski             2.481951       -0.478587         -0.586455   
679        Cunha             1.758383        1.838812          3.109877   

         form  total_points  
1    2.642884      0.68

In [5]:
# Combine differentials from both models
final_differentials = pd.concat([differentials_new, differentials_anomalies_new]).drop_duplicates()

# Rank by xG + xA + form
final_differentials['differential_score'] = final_differentials['expected_goals'] + final_differentials['expected_assists'] + final_differentials['form']

# Apply additional filters:
filtered_differentials = final_differentials[
    (final_differentials['selected_by_percent'] < 5) &  # Ownership < 5%
    (final_differentials['expected_goals'] + final_differentials['expected_assists'] > 5.0) #&  # xG + xA > 5.0
    # (final_differentials['form'] > 2.5)  # Form > 2.5
]

# Sort and pick top 10
top_differentials = filtered_differentials.sort_values(by="differential_score", ascending=False).head(10)

print("\n🔥 Best Differentials for 2024-2025 FPL Gameweek 21 🔥")
print(top_differentials[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points', 'differential_score']])



🔥 Best Differentials for 2024-2025 FPL Gameweek 21 🔥
        web_name  selected_by_percent  expected_goals  expected_assists  \
119       Mbeumo             4.965104        3.083130          4.327743   
438      Haaland             4.669099        8.505615          0.652777   
501       Gordon             2.695732        3.157332          3.366270   
644        Bowen             0.261913        2.238364          4.028618   
450        Sávio             0.163245        0.994046          4.178181   
616     Maddison             1.051260        2.072835          4.156815   
435        Foden             0.541473        1.108204          3.964520   
462  B.Fernandes             1.084149        3.077422          3.238073   
63       Watkins             2.481951        5.246416          0.802340   
624          Son             0.557918        2.192701          3.173975   

         form  total_points  differential_score  
119  3.988979      4.047521           11.399852  
438  2.233203      3