In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load 2024-25 GW1 data
df_new = pd.read_csv("players_raw_24-25_postgw2.csv")

# Select relevant features (same as before)
numeric_columns = ['now_cost', 'expected_goals', 'expected_assists', 'total_points', 'form', 'minutes', 'selected_by_percent']

# Ensure all selected columns are numeric and check for coercion
df_new[numeric_columns] = df_new[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Check how many NaN values exist in each column after coercion
print("Missing values per column:")
print(df_new[numeric_columns].isnull().sum())

# Inspect rows that have NaN values
print("\nRows with missing values:")
print(df_new[df_new[numeric_columns].isnull().any(axis=1)])

# Drop rows with missing values after conversion
df_new = df_new.dropna(subset=numeric_columns)

# Check if we still have data left
print("\nData shape after dropping rows with missing values:")
print(df_new[numeric_columns].shape)

# Scale the new data using the same scaler if there are rows left
if df_new.shape[0] > 0:
    scaler = StandardScaler()
    df_new[numeric_columns] = scaler.fit_transform(df_new[numeric_columns])
    print("Scaling complete!")
else:
    print("No data available after dropping missing values.")


Missing values per column:
now_cost               0
expected_goals         0
expected_assists       0
total_points           0
form                   0
minutes                0
selected_by_percent    0
dtype: int64

Rows with missing values:
Empty DataFrame
Columns: [assists, bonus, bps, chance_of_playing_next_round, chance_of_playing_this_round, clean_sheets, clean_sheets_per_90, code, corners_and_indirect_freekicks_order, corners_and_indirect_freekicks_text, cost_change_event, cost_change_event_fall, cost_change_start, cost_change_start_fall, creativity, creativity_rank, creativity_rank_type, direct_freekicks_order, direct_freekicks_text, dreamteam_count, element_type, ep_next, ep_this, event_points, expected_assists, expected_assists_per_90, expected_goal_involvements, expected_goal_involvements_per_90, expected_goals, expected_goals_conceded, expected_goals_conceded_per_90, expected_goals_per_90, first_name, form, form_rank, form_rank_type, goals_conceded, goals_conceded_per_90, go

In [4]:
from sklearn.cluster import KMeans

# Apply K-Means model (use the same number of clusters as before)
kmeans = KMeans(n_clusters=4, random_state=42)  
df_new['cluster'] = kmeans.fit_predict(df_new[numeric_columns])

# Extract differentials (low-ownership, high potential)
differentials_new = df_new[(df_new['cluster'] == 0) & (df_new['selected_by_percent'] < 10)]

print("\nTop Differential Players (K-Means Clustering):")
print(differentials_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (K-Means Clustering):
         web_name  selected_by_percent  expected_goals  expected_assists  \
0    Fábio Vieira            -0.392484       -0.377167         -0.387639   
1         G.Jesus            -0.146150       -0.377167         -0.387639   
4            Hein            -0.376062       -0.377167         -0.387639   
6        Jorginho            -0.343218       -0.377167         -0.387639   
7          Kiwior            -0.359640       -0.377167         -0.387639   
..            ...                  ...             ...               ...   
619      N.Semedo            -0.392484       -0.377167         -0.387639   
620    Pedro Lima            -0.146150       -0.377167         -0.387639   
621       Podence            -0.376062       -0.210041         -0.325478   
623       S.Bueno            -0.376062       -0.377167         -0.387639   
624       Sarabia            -0.359640        0.040649         -0.263317   

         form  total_points  
0   -0.65

In [5]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest model
model = IsolationForest(contamination=0.1)  
df_new['anomaly'] = model.fit_predict(df_new[numeric_columns])

# Extract anomalies (overperforming differentials)
differentials_anomalies_new = df_new[df_new['anomaly'] == -1]

print("\nTop Differential Players (Isolation Forest):")
print(differentials_anomalies_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (Isolation Forest):
       web_name  selected_by_percent  expected_goals  expected_assists  \
3       Havertz             2.415724        2.380416          0.047488   
8    Martinelli             0.050917       -0.042914          3.342026   
12     Ødegaard             1.824522        0.792717          3.714992   
14         Raya             3.992262       -0.377167         -0.387639   
16         Saka             5.223933        1.294095          2.533932   
..          ...                  ...             ...               ...   
580       Kudus             0.822764        0.291338          3.528509   
582   L.Paquetá            -0.014772        3.466736         -0.014673   
585      Souček             0.083761        4.845528          0.979905   
601       Cunha            -0.014772        3.759207          0.979905   
618    Mosquera            -0.244684        0.667372         -0.263317   

         form  total_points  
3    2.767939      2.781561  
8    

In [7]:
# Combine differentials from both models
final_differentials = pd.concat([differentials_new, differentials_anomalies_new]).drop_duplicates()

# Rank by xG + xA + form
final_differentials['differential_score'] = final_differentials['expected_goals'] + final_differentials['expected_assists'] + final_differentials['form']

# Apply additional filters:
filtered_differentials = final_differentials[
    (final_differentials['selected_by_percent'] < 5) &  # Ownership < 5%
    (final_differentials['expected_goals'] + final_differentials['expected_assists'] > 5.0) #&  # xG + xA > 5.0
    # (final_differentials['form'] > 2.5)  # Form > 2.5
]

# Sort and pick top 10
top_differentials = filtered_differentials.sort_values(by="differential_score", ascending=False).head(10)

print("\n🔥 Best Differentials for 2024-2025 FPL Gameweek 3 🔥")
print(top_differentials[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points', 'differential_score']])


🔥 Best Differentials for 2024-2025 FPL Gameweek 3 🔥
             web_name  selected_by_percent  expected_goals  expected_assists  \
79            Semenyo            -0.064039        2.589324          7.506818   
347  Alexander-Arnold             4.632731        0.291338          9.123007   
379         De Bruyne             1.364699        3.299610          5.641986   
362         Luis Díaz             0.986986        5.179780          3.031220   
547          Maddison             0.477896        1.419440          8.066268   
555               Son             1.249743        4.511275          1.787999   
142            Mitoma             0.592852        3.675644          3.466348   
403       B.Fernandes             2.300768        4.887309          4.336603   
154           Welbeck             0.921297        5.096217         -0.014673   
352          Diogo J.             2.859125        6.057193          0.233972   

         form  total_points  differential_score  
79   2.523469   