In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load 2024-25 GW1 data
df_new = pd.read_csv("players_raw_24-25_postgw1.csv")

# Select relevant features (same as before)
numeric_columns = ['now_cost', 'expected_goals', 'expected_assists', 'total_points', 'form', 'minutes', 'selected_by_percent']

# Ensure all selected columns are numeric and check for coercion
df_new[numeric_columns] = df_new[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Check how many NaN values exist in each column after coercion
print("Missing values per column:")
print(df_new[numeric_columns].isnull().sum())

# Inspect rows that have NaN values
print("\nRows with missing values:")
print(df_new[df_new[numeric_columns].isnull().any(axis=1)])

# Drop rows with missing values after conversion
df_new = df_new.dropna(subset=numeric_columns)

# Check if we still have data left
print("\nData shape after dropping rows with missing values:")
print(df_new[numeric_columns].shape)

# Scale the new data using the same scaler if there are rows left
if df_new.shape[0] > 0:
    scaler = StandardScaler()
    df_new[numeric_columns] = scaler.fit_transform(df_new[numeric_columns])
    print("Scaling complete!")
else:
    print("No data available after dropping missing values.")


Missing values per column:
now_cost               0
expected_goals         0
expected_assists       0
total_points           0
form                   0
minutes                0
selected_by_percent    0
dtype: int64

Rows with missing values:
Empty DataFrame
Columns: [assists, bonus, bps, chance_of_playing_next_round, chance_of_playing_this_round, clean_sheets, clean_sheets_per_90, code, corners_and_indirect_freekicks_order, corners_and_indirect_freekicks_text, cost_change_event, cost_change_event_fall, cost_change_start, cost_change_start_fall, creativity, creativity_rank, creativity_rank_type, direct_freekicks_order, direct_freekicks_text, dreamteam_count, element_type, ep_next, ep_this, event_points, expected_assists, expected_assists_per_90, expected_goal_involvements, expected_goal_involvements_per_90, expected_goals, expected_goals_conceded, expected_goals_conceded_per_90, expected_goals_per_90, first_name, form, form_rank, form_rank_type, goals_conceded, goals_conceded_per_90, go

In [2]:
from sklearn.cluster import KMeans

# Apply K-Means model (use the same number of clusters as before)
kmeans = KMeans(n_clusters=4, random_state=42)  
df_new['cluster'] = kmeans.fit_predict(df_new[numeric_columns])

# Extract differentials (low-ownership, high potential)
differentials_new = df_new[(df_new['cluster'] == 0) & (df_new['selected_by_percent'] < 10)]

print("\nTop Differential Players (K-Means Clustering):")
print(differentials_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (K-Means Clustering):
         web_name  selected_by_percent  expected_goals  expected_assists  \
0    Fábio Vieira            -0.387445       -0.305782         -0.319426   
1         G.Jesus            -0.037355       -0.305782         -0.319426   
4            Hein            -0.371531       -0.305782         -0.319426   
5        J.Timber            -0.244226       -0.159822         -0.210716   
6        Jorginho            -0.339705       -0.305782         -0.319426   
..            ...                  ...             ...               ...   
608      N.Semedo            -0.387445       -0.305782         -0.319426   
609    Pedro Lima            -0.148747       -0.305782         -0.319426   
610       Podence            -0.371531       -0.013861         -0.319426   
612       S.Bueno            -0.371531       -0.305782         -0.319426   
613       Sarabia            -0.355618        0.278059         -0.210716   

         form  total_points  
0   -0.58

In [3]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest model
model = IsolationForest(contamination=0.1)  
df_new['anomaly'] = model.fit_predict(df_new[numeric_columns])

# Extract anomalies (overperforming differentials)
differentials_anomalies_new = df_new[df_new['anomaly'] == -1]

print("\nTop Differential Players (Isolation Forest):")
print(differentials_anomalies_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (Isolation Forest):
        web_name  selected_by_percent  expected_goals  expected_assists  \
3        Havertz             1.967706        2.978325          0.115417   
12      Ødegaard             2.079098       -0.159822          0.332839   
14          Raya             3.606764       -0.305782         -0.319426   
16          Saka             4.863905        2.248523          3.702875   
17        Saliba             6.009654       -0.305782         -0.319426   
..           ...                  ...             ...               ...   
536  Pedro Porro             5.150342        0.351039          3.159321   
544          Son             1.172047       -0.086842          2.941899   
569        Kudus             0.981089        0.278059          4.789984   
571    L.Paquetá            -0.069181        5.459651          0.115417   
574       Souček            -0.005528        7.722035          0.985104   

         form  total_points  
3    4.717458      4.71

In [5]:
# Combine differentials from both models
final_differentials = pd.concat([differentials_new, differentials_anomalies_new]).drop_duplicates()

# Rank by xG + xA + form
final_differentials['differential_score'] = final_differentials['expected_goals'] + final_differentials['expected_assists'] + final_differentials['form']

# Apply additional filters:
filtered_differentials = final_differentials[
    (final_differentials['selected_by_percent'] < 5) &  # Ownership < 5%
    (final_differentials['expected_goals'] + final_differentials['expected_assists'] > 5.0) #&  # xG + xA > 5.0
    # (final_differentials['form'] > 2.5)  # Form > 2.5
]

# Sort and pick top 10
top_differentials = filtered_differentials.sort_values(by="differential_score", ascending=False).head(10)

print("\n🔥 Best Differentials for 2024-2025 FPL Gameweek 2 🔥")
print(top_differentials[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points', 'differential_score']])


🔥 Best Differentials for 2024-2025 FPL Gameweek 2 🔥
             web_name  selected_by_percent  expected_goals  expected_assists  \
339  Alexander-Arnold             4.434249        0.132099         10.551659   
535          Maddison             0.344562       -0.159822         10.334238   
114             Wissa             0.137690        6.992234         -0.210716   
16               Saka             4.863905        2.248523          3.702875   
574            Souček            -0.005528        7.722035          0.985104   
141            Mitoma             0.121777        6.773293          0.115417   
344          Diogo J.             2.461015        6.627333          0.115417   
394       B.Fernandes             2.683799        8.378857          0.006706   
335          A.Fatawu            -0.323792       -0.305782          6.203225   
331             Vardy            -0.116921        5.532631         -0.319426   

         form  total_points  differential_score  
339  2.951189   