In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load 2024-25 GW1 data
df_new = pd.read_csv("players_raw_24-25_pregw1.csv")

# Select relevant features (same as before)
numeric_columns = ['now_cost', 'expected_goals', 'expected_assists', 'total_points', 'form', 'minutes', 'selected_by_percent']

# Ensure all selected columns are numeric and check for coercion
df_new[numeric_columns] = df_new[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Check how many NaN values exist in each column after coercion
print("Missing values per column:")
print(df_new[numeric_columns].isnull().sum())

# Inspect rows that have NaN values
print("\nRows with missing values:")
print(df_new[df_new[numeric_columns].isnull().any(axis=1)])

# Drop rows with missing values after conversion
df_new = df_new.dropna(subset=numeric_columns)

# Check if we still have data left
print("\nData shape after dropping rows with missing values:")
print(df_new[numeric_columns].shape)

# Scale the new data using the same scaler if there are rows left
if df_new.shape[0] > 0:
    scaler = StandardScaler()
    df_new[numeric_columns] = scaler.fit_transform(df_new[numeric_columns])
    print("Scaling complete!")
else:
    print("No data available after dropping missing values.")


Missing values per column:
now_cost               0
expected_goals         0
expected_assists       0
total_points           0
form                   0
minutes                0
selected_by_percent    0
dtype: int64

Rows with missing values:
Empty DataFrame
Columns: [assists, bonus, bps, chance_of_playing_next_round, chance_of_playing_this_round, clean_sheets, clean_sheets_per_90, code, corners_and_indirect_freekicks_order, corners_and_indirect_freekicks_text, cost_change_event, cost_change_event_fall, cost_change_start, cost_change_start_fall, creativity, creativity_rank, creativity_rank_type, direct_freekicks_order, direct_freekicks_text, dreamteam_count, element_type, ep_next, ep_this, event_points, expected_assists, expected_assists_per_90, expected_goal_involvements, expected_goal_involvements_per_90, expected_goals, expected_goals_conceded, expected_goals_conceded_per_90, expected_goals_per_90, first_name, form, form_rank, form_rank_type, goals_conceded, goals_conceded_per_90, go

In [20]:
from sklearn.cluster import KMeans

# Apply K-Means model (use the same number of clusters as before)
kmeans = KMeans(n_clusters=4, random_state=42)  
df_new['cluster'] = kmeans.fit_predict(df_new[numeric_columns])

# Extract differentials (low-ownership, high potential)
differentials_new = df_new[(df_new['cluster'] == 0) & (df_new['selected_by_percent'] < 10)]

print("\nTop Differential Players (K-Means Clustering):")
print(differentials_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (K-Means Clustering):
          web_name  selected_by_percent  expected_goals  expected_assists  \
0     Fábio Vieira            -0.382977       -0.162670         -0.272152   
4             Hein            -0.351956       -0.532800         -0.628112   
5         J.Timber            -0.103791       -0.532800         -0.616989   
9           Nelson            -0.382977       -0.479501         -0.388951   
11         Nwaneri            -0.367466       -0.532800         -0.600303   
..             ...                  ...             ...               ...   
577     Pedro Lima            -0.196853       -0.532800         -0.628112   
578        Podence            -0.367466       -0.532800         -0.628112   
579        R.Gomes            -0.367466       -0.532800         -0.628112   
580        S.Bueno            -0.367466       -0.532800         -0.622551   
582  Strand Larsen            -0.367466       -0.532800         -0.628112   

     form  total_points  
0

In [21]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest model
model = IsolationForest(contamination=0.1)  
df_new['anomaly'] = model.fit_predict(df_new[numeric_columns])

# Extract anomalies (overperforming differentials)
differentials_anomalies_new = df_new[df_new['anomaly'] == -1]

print("\nTop Differential Players (Isolation Forest):")
print(differentials_anomalies_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (Isolation Forest):
             web_name  selected_by_percent  expected_goals  expected_assists  \
2             Gabriel             1.447238        0.793747         -0.266590   
3             Havertz             1.803974        3.127049          1.529900   
12           Ødegaard             2.238263        1.696865          5.022766   
14               Raya             3.137859       -0.532800         -0.605865   
15               Rice             0.470089        0.512448          2.202889   
16               Saka             3.711740        4.027206          5.434346   
17             Saliba             5.960732       -0.029423         -0.127542   
23              White             1.462748       -0.156748          1.407539   
28             Bailey             0.314986        1.329696          3.359761   
34              Diaby            -0.305425        1.658371          1.730128   
44              Konsa             2.362345        0.225227         -0.0663

In [24]:
# Combine differentials from both models
final_differentials = pd.concat([differentials_new, differentials_anomalies_new]).drop_duplicates()

# Rank by xG + xA + form
final_differentials['differential_score'] = final_differentials['expected_goals'] + final_differentials['expected_assists'] + final_differentials['form']

# Apply additional filters:
filtered_differentials = final_differentials[
    (final_differentials['selected_by_percent'] < 5) &  # Ownership < 5%
    (final_differentials['expected_goals'] + final_differentials['expected_assists'] > 5.0) #&  # xG + xA > 5.0
    # (final_differentials['form'] > 2.5)  # Form > 2.5
]

# Sort and pick top 10
top_differentials = filtered_differentials.sort_values(by="differential_score", ascending=False).head(10)

print("\n🔥 Best Differentials for 2024-2025 FPL Gameweek 1 🔥")
print(top_differentials[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points', 'differential_score']])


🔥 Best Differentials for 2024-2025 FPL Gameweek 1 🔥
        web_name  selected_by_percent  expected_goals  expected_assists  form  \
16          Saka             3.711740        4.027206          5.434346   0.0   
338      M.Salah             4.937053        5.652818          3.526618   0.0   
377  B.Fernandes             1.803974        2.348294          5.061699   0.0   
12      Ødegaard             2.238263        1.696865          5.022766   0.0   
519          Son             1.385197        2.869438          3.749094   0.0   
362    J.Alvarez             0.097842        3.337283          2.864753   0.0   
326       Darwin             0.826826        4.272972          1.802433   0.0   
183    N.Jackson             0.113353        4.838531          1.040454   0.0   
84       Solanke             1.199073        5.282688          0.545446   0.0   
358        Foden             4.254600        2.478580          3.237400   0.0   

     total_points  differential_score  
16       3.5173