In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load 2024-25 GW1 data
df_new = pd.read_csv("players_raw_24-25_postgw4.csv")

# Select relevant features (same as before)
numeric_columns = ['now_cost', 'expected_goals', 'expected_assists', 'total_points', 'form', 'minutes', 'selected_by_percent']

# Ensure all selected columns are numeric and check for coercion
df_new[numeric_columns] = df_new[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Check how many NaN values exist in each column after coercion
print("Missing values per column:")
print(df_new[numeric_columns].isnull().sum())

# Inspect rows that have NaN values
print("\nRows with missing values:")
print(df_new[df_new[numeric_columns].isnull().any(axis=1)])

# Drop rows with missing values after conversion
df_new = df_new.dropna(subset=numeric_columns)

# Check if we still have data left
print("\nData shape after dropping rows with missing values:")
print(df_new[numeric_columns].shape)

# Scale the new data using the same scaler if there are rows left
if df_new.shape[0] > 0:
    scaler = StandardScaler()
    df_new[numeric_columns] = scaler.fit_transform(df_new[numeric_columns])
    print("Scaling complete!")
else:
    print("No data available after dropping missing values.")


Missing values per column:
now_cost               0
expected_goals         0
expected_assists       0
total_points           0
form                   0
minutes                0
selected_by_percent    0
dtype: int64

Rows with missing values:
Empty DataFrame
Columns: [assists, bonus, bps, chance_of_playing_next_round, chance_of_playing_this_round, clean_sheets, clean_sheets_per_90, code, corners_and_indirect_freekicks_order, corners_and_indirect_freekicks_text, cost_change_event, cost_change_event_fall, cost_change_start, cost_change_start_fall, creativity, creativity_rank, creativity_rank_type, direct_freekicks_order, direct_freekicks_text, dreamteam_count, element_type, ep_next, ep_this, event_points, expected_assists, expected_assists_per_90, expected_goal_involvements, expected_goal_involvements_per_90, expected_goals, expected_goals_conceded, expected_goals_conceded_per_90, expected_goals_per_90, first_name, form, form_rank, form_rank_type, goals_conceded, goals_conceded_per_90, go

In [3]:
from sklearn.cluster import KMeans

# Apply K-Means model (use the same number of clusters as before)
kmeans = KMeans(n_clusters=4, random_state=42)  
df_new['cluster'] = kmeans.fit_predict(df_new[numeric_columns])

# Extract differentials (low-ownership, high potential)
differentials_new = df_new[(df_new['cluster'] == 0) & (df_new['selected_by_percent'] < 10)]

print("\nTop Differential Players (K-Means Clustering):")
print(differentials_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (K-Means Clustering):
     web_name  selected_by_percent  expected_goals  expected_assists  \
5    J.Timber            -0.195275       -0.240273         -0.271337   
12       Rice             0.120662        0.045797          0.246313   
18   Trossard            -0.078877        0.212671          0.024463   
21       Neto            -0.095506       -0.407147         -0.456212   
31     Bailey            -0.128762        0.188832          0.431188   
..        ...                  ...             ...               ...   
632   Doherty            -0.344930       -0.216434         -0.160412   
639  Hee Chan            -0.278417       -0.407147          0.098413   
642   J.Gomes            -0.361558       -0.097238         -0.197387   
647  Mosquera            -0.195275        0.379546         -0.382262   
655      Toti            -0.344930       -0.407147          0.431188   

         form  total_points  
5    1.133299      1.173179  
12  -0.248892     -0.272996

In [4]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest model
model = IsolationForest(contamination=0.1)  
df_new['anomaly'] = model.fit_predict(df_new[numeric_columns])

# Extract anomalies (overperforming differentials)
differentials_anomalies_new = df_new[df_new['anomaly'] == -1]

print("\nTop Differential Players (Isolation Forest):")
print(differentials_anomalies_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (Isolation Forest):
      web_name  selected_by_percent  expected_goals  expected_assists  \
2      Gabriel             1.667095        0.808651         -0.271337   
3      Havertz             2.515139        3.335604         -0.012512   
10    Ødegaard             0.719282        0.689455          2.279940   
11        Raya             4.244483       -0.407147         -0.419237   
13        Saka             4.610306        2.930338          3.315241   
..         ...                  ...             ...               ...   
598      Bowen             0.636140        0.689455          1.873215   
603  Fabianski             2.165945       -0.407147         -0.456212   
607      Kudus             0.536370        0.308028          2.242965   
612     Souček             0.153919        3.049533          0.653039   
630      Cunha             0.004264        2.954177          1.688340   

         form  total_points  
2    3.436953      3.487058  
3    2.745857    

In [None]:
# Combine differentials from both models
final_differentials = pd.concat([differentials_new, differentials_anomalies_new]).drop_duplicates()

# Rank by xG + xA + form
final_differentials['differential_score'] = final_differentials['expected_goals'] + final_differentials['expected_assists'] + final_differentials['form']

# Apply additional filters:
filtered_differentials = final_differentials[
    (final_differentials['selected_by_percent'] < 5) &  # Ownership < 5%
    (final_differentials['expected_goals'] + final_differentials['expected_assists'] > 5.0) #&  # xG + xA > 5.0
    # (final_differentials['form'] > 2.5)  # Form > 2.5
]

# Sort and pick top 10
top_differentials = filtered_differentials.sort_values(by="differential_score", ascending=False).head(10)

print("\n🔥 Best Differentials for 2024-2025 FPL Gameweek 5 🔥")
print(top_differentials[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points', 'differential_score']])



🔥 Best Differentials for 2024-2025 FPL Gameweek 3 🔥
             web_name  selected_by_percent  expected_goals  expected_assists  \
403         De Bruyne             1.450927        2.477393          7.567370   
259            McNeil            -0.211904       -0.144916          8.491746   
385         Luis Díaz             3.213528        4.456045          1.725315   
13               Saka             4.610306        2.930338          3.315241   
111            Mbeumo             1.051848        2.262841          4.202642   
86            Semenyo             0.636140        2.715785          4.461467   
122             Wissa             0.952078        5.385773          0.542113   
370  Alexander-Arnold             4.892987        0.021958          6.458119   
61            Watkins             3.579351        6.172466         -0.160412   
575          Maddison             0.320202        0.784812          6.125344   

         form  total_points  differential_score  
403  1.881987   