In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load 2024-25 GW1 data
df_new = pd.read_csv("players_raw_24-25_postgw23.csv")

# Select relevant features (same as before)
numeric_columns = ['now_cost', 'expected_goals', 'expected_assists', 'total_points', 'form', 'minutes', 'selected_by_percent']

# Ensure all selected columns are numeric and check for coercion
df_new[numeric_columns] = df_new[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Check how many NaN values exist in each column after coercion
print("Missing values per column:")
print(df_new[numeric_columns].isnull().sum())

# Inspect rows that have NaN values
print("\nRows with missing values:")
print(df_new[df_new[numeric_columns].isnull().any(axis=1)])

# Drop rows with missing values after conversion
df_new = df_new.dropna(subset=numeric_columns)

# Check if we still have data left
print("\nData shape after dropping rows with missing values:")
print(df_new[numeric_columns].shape)

# Scale the new data using the same scaler if there are rows left
if df_new.shape[0] > 0:
    scaler = StandardScaler()
    df_new[numeric_columns] = scaler.fit_transform(df_new[numeric_columns])
    print("Scaling complete!")
else:
    print("No data available after dropping missing values.")


Missing values per column:
now_cost               0
expected_goals         0
expected_assists       0
total_points           0
form                   0
minutes                0
selected_by_percent    0
dtype: int64

Rows with missing values:
Empty DataFrame
Columns: [assists, bonus, bps, can_select, can_transact, chance_of_playing_next_round, chance_of_playing_this_round, clean_sheets, clean_sheets_per_90, code, corners_and_indirect_freekicks_order, corners_and_indirect_freekicks_text, cost_change_event, cost_change_event_fall, cost_change_start, cost_change_start_fall, creativity, creativity_rank, creativity_rank_type, direct_freekicks_order, direct_freekicks_text, dreamteam_count, element_type, ep_next, ep_this, event_points, expected_assists, expected_assists_per_90, expected_goal_involvements, expected_goal_involvements_per_90, expected_goals, expected_goals_conceded, expected_goals_conceded_per_90, expected_goals_per_90, first_name, form, form_rank, form_rank_type, goals_conceded,

In [6]:
from sklearn.cluster import KMeans

# Apply K-Means model (use the same number of clusters as before)
kmeans = KMeans(n_clusters=4, random_state=42)  
df_new['cluster'] = kmeans.fit_predict(df_new[numeric_columns])

# Extract differentials (low-ownership, high potential)
differentials_new = df_new[(df_new['cluster'] == 0) & (df_new['selected_by_percent'] < 10)]

print("\nTop Differential Players (K-Means Clustering):")
print(differentials_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (K-Means Clustering):
          web_name  selected_by_percent  expected_goals  expected_assists  \
1          G.Jesus            -0.112276        1.102373         -0.065861   
5         J.Timber             0.376087       -0.099882          0.409541   
15          Thomas            -0.222551        0.295732          0.952859   
23       Calafiori            -0.143783       -0.120434         -0.027053   
25          Merino            -0.317073        0.686208          0.108776   
..             ...                  ...             ...               ...   
740       N.Semedo            -0.301319       -0.166674          0.380435   
744        S.Bueno            -0.317073       -0.233466         -0.492754   
746  Strand Larsen            -0.049261        2.689967         -0.036755   
747           Toti            -0.301319       -0.449256         -0.211393   
749          André            -0.332827       -0.428704         -0.356924   

         form  total_points

In [7]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest model
model = IsolationForest(contamination=0.1)  
df_new['anomaly'] = model.fit_predict(df_new[numeric_columns])

# Extract anomalies (overperforming differentials)
differentials_anomalies_new = df_new[df_new['anomaly'] == -1]

print("\nTop Differential Players (Isolation Forest):")
print(differentials_anomalies_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (Isolation Forest):
          web_name  selected_by_percent  expected_goals  expected_assists  \
2          Gabriel             4.093943        0.824930          0.409541   
3          Havertz             0.895956        4.108012          0.768519   
8       Martinelli             0.218550        1.523677          2.243237   
10        Ødegaard             0.439101        1.066408          2.621619   
11            Raya             3.999421       -0.464669         -0.560668   
..             ...                  ...             ...               ...   
662            Son             0.439101        2.186458          3.067915   
686          Bowen             0.092521        1.980945          3.620935   
712         Potter            -0.238305       -0.464669         -0.570370   
722          Cunha             0.927464        2.022047          3.029107   
746  Strand Larsen            -0.049261        2.689967         -0.036755   

         form  total_points  

In [8]:
# Combine differentials from both models
final_differentials = pd.concat([differentials_new, differentials_anomalies_new]).drop_duplicates()

# Rank by xG + xA + form
final_differentials['differential_score'] = final_differentials['expected_goals'] + final_differentials['expected_assists'] + final_differentials['form']

# Apply additional filters:
filtered_differentials = final_differentials[
    (final_differentials['selected_by_percent'] < 5) &  # Ownership < 5%
    (final_differentials['expected_goals'] + final_differentials['expected_assists'] > 5.0) #&  # xG + xA > 5.0
    # (final_differentials['form'] > 2.5)  # Form > 2.5
]

# Sort and pick top 10
top_differentials = filtered_differentials.sort_values(by="differential_score", ascending=False).head(10)

print("\n🔥 Best Differentials for 2024-2025 FPL Gameweek 24 🔥")
print(top_differentials[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points', 'differential_score']])



🔥 Best Differentials for 2024-2025 FPL Gameweek 24 🔥
             web_name  selected_by_percent  expected_goals  expected_assists  \
464           Haaland             4.156957        8.665280          0.661796   
461             Foden             1.006232        1.888463          3.989615   
130            Mbeumo             4.960392        3.604503          4.387400   
532            Gordon             3.621334        3.368163          3.455999   
492       B.Fernandes             1.179522        3.465782          3.465701   
65            Watkins             2.345290        5.310268          0.778221   
94            Semenyo             0.706913        2.782448          2.437279   
419  Alexander-Arnold             4.818609        0.013150          5.444929   
324             Iwobi             0.754174        1.056133          4.154550   
141             Wissa             1.321304        4.827310          0.574477   

         form  total_points  differential_score  
464  3.439075  