In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load 2024-25 GW1 data
df_new = pd.read_csv("players_raw_24-25_postgw3.csv")

# Select relevant features (same as before)
numeric_columns = ['now_cost', 'expected_goals', 'expected_assists', 'total_points', 'form', 'minutes', 'selected_by_percent']

# Ensure all selected columns are numeric and check for coercion
df_new[numeric_columns] = df_new[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Check how many NaN values exist in each column after coercion
print("Missing values per column:")
print(df_new[numeric_columns].isnull().sum())

# Inspect rows that have NaN values
print("\nRows with missing values:")
print(df_new[df_new[numeric_columns].isnull().any(axis=1)])

# Drop rows with missing values after conversion
df_new = df_new.dropna(subset=numeric_columns)

# Check if we still have data left
print("\nData shape after dropping rows with missing values:")
print(df_new[numeric_columns].shape)

# Scale the new data using the same scaler if there are rows left
if df_new.shape[0] > 0:
    scaler = StandardScaler()
    df_new[numeric_columns] = scaler.fit_transform(df_new[numeric_columns])
    print("Scaling complete!")
else:
    print("No data available after dropping missing values.")

Missing values per column:
now_cost               0
expected_goals         0
expected_assists       0
total_points           0
form                   0
minutes                0
selected_by_percent    0
dtype: int64

Rows with missing values:
Empty DataFrame
Columns: [assists, bonus, bps, chance_of_playing_next_round, chance_of_playing_this_round, clean_sheets, clean_sheets_per_90, code, corners_and_indirect_freekicks_order, corners_and_indirect_freekicks_text, cost_change_event, cost_change_event_fall, cost_change_start, cost_change_start_fall, creativity, creativity_rank, creativity_rank_type, direct_freekicks_order, direct_freekicks_text, dreamteam_count, element_type, ep_next, ep_this, event_points, expected_assists, expected_assists_per_90, expected_goal_involvements, expected_goal_involvements_per_90, expected_goals, expected_goals_conceded, expected_goals_conceded_per_90, expected_goals_per_90, first_name, form, form_rank, form_rank_type, goals_conceded, goals_conceded_per_90, go

In [9]:
from sklearn.cluster import KMeans

# Apply K-Means model (use the same number of clusters as before)
kmeans = KMeans(n_clusters=4, random_state=42)  
df_new['cluster'] = kmeans.fit_predict(df_new[numeric_columns])

# Extract differentials (low-ownership, high potential)
differentials_new = df_new[(df_new['cluster'] == 0) & (df_new['selected_by_percent'] < 10)]

print("\nTop Differential Players (K-Means Clustering):")
print(differentials_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (K-Means Clustering):
          web_name  selected_by_percent  expected_goals  expected_assists  \
8       Martinelli            -0.014515       -0.155242          2.341325   
10        Ødegaard             1.386539        0.968006          2.988966   
38           Duran            -0.181307        1.706985          0.213361   
53          Ramsey            -0.314740        0.051672          2.757666   
57       Tielemans            -0.181307        0.140350          2.248805   
59           Onana             1.019596        2.889352         -0.202979   
69            Cook            -0.364778        0.169909          2.017504   
78         O.Dango            -0.381457        1.027124          3.220266   
82         Semenyo             0.435824        3.184943          5.579530   
84      Sinisterra            -0.314740        1.263598          0.999783   
86       Tavernier            -0.331420        5.283644          1.323603   
103         Jensen          

In [12]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest model
model = IsolationForest(contamination=0.1)  
df_new['anomaly'] = model.fit_predict(df_new[numeric_columns])

# Extract anomalies (overperforming differentials)
differentials_anomalies_new = df_new[df_new['anomaly'] == -1]

print("\nTop Differential Players (Isolation Forest):")
print(differentials_anomalies_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (Isolation Forest):
      web_name  selected_by_percent  expected_goals  expected_assists  \
3      Havertz             2.620800        3.835245          0.120841   
10    Ødegaard             1.386539        0.968006          2.988966   
11        Raya             4.355437       -0.391715         -0.388020   
13        Saka             5.022605        3.598772          2.665146   
14      Saliba             6.006679       -0.391715         -0.341760   
..         ...                  ...             ...               ...   
579        Son             1.153030        3.362298          2.017504   
594      Bowen             0.602616        0.140350          2.063765   
599  Fabianski             2.003669       -0.391715         -0.434280   
608     Souček             0.168957        3.687449          0.953523   
626      Cunha             0.052202        3.244062          2.156285   

         form  total_points  
3    3.270255      3.296115  
10   0.574809    

In [14]:
# Combine differentials from both models
final_differentials = pd.concat([differentials_new, differentials_anomalies_new])

# Drop duplicates based on a unique identifier after concatenation
final_differentials = final_differentials.drop_duplicates(subset='web_name')

# Rank by xG + xA + form
final_differentials['differential_score'] = final_differentials['expected_goals'] + final_differentials['expected_assists'] + final_differentials['form']

# Apply additional filters:
filtered_differentials = final_differentials[
    (final_differentials['selected_by_percent'] < 5) &  # Ownership < 5%
    (final_differentials['expected_goals'] + final_differentials['expected_assists'] > 5.0) # xG + xA > 5.0
]

# Sort and pick top 10
top_differentials = filtered_differentials.sort_values(by="differential_score", ascending=False).head(10)

print("\n🔥 Best Differentials for 2024-2025 FPL Gameweek 4 🔥")
print(top_differentials[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points', 'differential_score']])



🔥 Best Differentials for 2024-2025 FPL Gameweek 4 🔥
             web_name  selected_by_percent  expected_goals  expected_assists  \
381         Luis Díaz             2.203820        5.313203          2.156285   
399         De Bruyne             1.453255        3.007588          7.013593   
82            Semenyo             0.435824        3.184943          5.579530   
107            Mbeumo             0.919521        2.357287          3.775387   
366  Alexander-Arnold             4.672342        0.081231          6.689772   
118             Wissa             0.902842        5.786150          0.814742   
571          Maddison             0.385786        1.086243          6.180911   
579               Son             1.153030        3.362298          2.017504   
86          Tavernier            -0.331420        5.283644          1.323603   
196         N.Jackson             0.602616        5.460999         -0.110459   

         form  total_points  differential_score  
381  5.103158   