In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load 2024-25 GW1 data
df_new = pd.read_csv("players_raw_24-25_postgw24.csv")

# Select relevant features (same as before)
numeric_columns = ['now_cost', 'expected_goals', 'expected_assists', 'total_points', 'form', 'minutes', 'selected_by_percent']

# Ensure all selected columns are numeric and check for coercion
df_new[numeric_columns] = df_new[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Check how many NaN values exist in each column after coercion
print("Missing values per column:")
print(df_new[numeric_columns].isnull().sum())

# Inspect rows that have NaN values
print("\nRows with missing values:")
print(df_new[df_new[numeric_columns].isnull().any(axis=1)])

# Drop rows with missing values after conversion
df_new = df_new.dropna(subset=numeric_columns)

# Check if we still have data left
print("\nData shape after dropping rows with missing values:")
print(df_new[numeric_columns].shape)

# Scale the new data using the same scaler if there are rows left
if df_new.shape[0] > 0:
    scaler = StandardScaler()
    df_new[numeric_columns] = scaler.fit_transform(df_new[numeric_columns])
    print("Scaling complete!")
else:
    print("No data available after dropping missing values.")


Missing values per column:
now_cost               0
expected_goals         0
expected_assists       0
total_points           0
form                   0
minutes                0
selected_by_percent    0
dtype: int64

Rows with missing values:
Empty DataFrame
Columns: [assists, birth_date, bonus, bps, can_select, can_transact, chance_of_playing_next_round, chance_of_playing_this_round, clean_sheets, clean_sheets_per_90, code, corners_and_indirect_freekicks_order, corners_and_indirect_freekicks_text, cost_change_event, cost_change_event_fall, cost_change_start, cost_change_start_fall, creativity, creativity_rank, creativity_rank_type, direct_freekicks_order, direct_freekicks_text, dreamteam_count, element_type, ep_next, ep_this, event_points, expected_assists, expected_assists_per_90, expected_goal_involvements, expected_goal_involvements_per_90, expected_goals, expected_goals_conceded, expected_goals_conceded_per_90, expected_goals_per_90, first_name, form, form_rank, form_rank_type, goa

In [2]:
from sklearn.cluster import KMeans

# Apply K-Means model (use the same number of clusters as before)
kmeans = KMeans(n_clusters=4, random_state=42)  
df_new['cluster'] = kmeans.fit_predict(df_new[numeric_columns])

# Extract differentials (low-ownership, high potential)
differentials_new = df_new[(df_new['cluster'] == 0) & (df_new['selected_by_percent'] < 10)]

print("\nTop Differential Players (K-Means Clustering):")
print(differentials_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (K-Means Clustering):
          web_name  selected_by_percent  expected_goals  expected_assists  \
1          G.Jesus            -0.138157        1.055446         -0.075662   
5         J.Timber             0.371759       -0.087048          0.383317   
11            Raya             3.781827       -0.459600         -0.553376   
15          Thomas            -0.201897        0.300407          0.926599   
23       Calafiori            -0.122223       -0.126787         -0.038195   
..             ...                  ...             ...               ...   
760        Sarabia            -0.313441        0.682893          0.205345   
761  Strand Larsen            -0.122223        2.590360         -0.047562   
762           Toti            -0.297506       -0.444698         -0.216166   
764          André            -0.329376       -0.424828         -0.356670   
770        Pereira            -0.297506       -0.459600         -0.562743   

         form  total_points

In [3]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest model
model = IsolationForest(contamination=0.1)  
df_new['anomaly'] = model.fit_predict(df_new[numeric_columns])

# Extract anomalies (overperforming differentials)
differentials_anomalies_new = df_new[df_new['anomaly'] == -1]

print("\nTop Differential Players (Isolation Forest):")
print(differentials_anomalies_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (Isolation Forest):
          web_name  selected_by_percent  expected_goals  expected_assists  \
2          Gabriel             4.148330        0.812045          0.402051   
3          Havertz             0.881676        4.204753          0.964067   
10        Ødegaard             0.451434        1.090217          2.556445   
11            Raya             3.781827       -0.459600         -0.553376   
12            Rice             0.100866        0.270602          3.212130   
..             ...                  ...             ...               ...   
672            Son             0.419564        2.123428          3.090359   
700          Bowen             0.116801        2.004212          3.886548   
737          Cunha             0.865741        2.168135          3.137194   
761  Strand Larsen            -0.122223        2.590360         -0.047562   
770        Pereira            -0.297506       -0.459600         -0.562743   

         form  total_points  

In [4]:
# Combine differentials from both models
final_differentials = pd.concat([differentials_new, differentials_anomalies_new]).drop_duplicates()

# Rank by xG + xA + form
final_differentials['differential_score'] = final_differentials['expected_goals'] + final_differentials['expected_assists'] + final_differentials['form']

# Apply additional filters:
filtered_differentials = final_differentials[
    (final_differentials['selected_by_percent'] < 5) &  # Ownership < 5%
    (final_differentials['expected_goals'] + final_differentials['expected_assists'] > 5.0) #&  # xG + xA > 5.0
    # (final_differentials['form'] > 2.5)  # Form > 2.5
]

# Sort and pick top 10
top_differentials = filtered_differentials.sort_values(by="differential_score", ascending=False).head(10)

print("\n🔥 Best Differentials for 2024-2025 FPL Gameweek 25 🔥")
print(top_differentials[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points', 'differential_score']])



🔥 Best Differentials for 2024-2025 FPL Gameweek 25 🔥
        web_name  selected_by_percent  expected_goals  expected_assists  \
473      Haaland             4.100525        8.546228          0.654958   
470        Foden             0.977285        1.815452          3.895915   
90      Kluivert             3.001017        3.881875          1.160772   
265       Mateta             0.706392        4.458089          0.898498   
502  B.Fernandes             1.072895        3.434812          3.577440   
96       Semenyo             0.897611        3.027488          2.406574   
544       Gordon             2.905408        3.275856          3.502504   
64       Watkins             2.379556        5.123715          0.739260   
672          Son             0.419564        2.123428          3.090359   
3        Havertz             0.881676        4.204753          0.964067   

         form  total_points  differential_score  
473  2.775482      3.765573           11.976668  
470  5.036782      1