In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load 2024-25 GW1 data
df_new = pd.read_csv("players_raw_24-25_postgw5.csv")

# Select relevant features (same as before)
numeric_columns = ['now_cost', 'expected_goals', 'expected_assists', 'total_points', 'form', 'minutes', 'selected_by_percent']

# Ensure all selected columns are numeric and check for coercion
df_new[numeric_columns] = df_new[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Check how many NaN values exist in each column after coercion
print("Missing values per column:")
print(df_new[numeric_columns].isnull().sum())

# Inspect rows that have NaN values
print("\nRows with missing values:")
print(df_new[df_new[numeric_columns].isnull().any(axis=1)])

# Drop rows with missing values after conversion
df_new = df_new.dropna(subset=numeric_columns)

# Check if we still have data left
print("\nData shape after dropping rows with missing values:")
print(df_new[numeric_columns].shape)

# Scale the new data using the same scaler if there are rows left
if df_new.shape[0] > 0:
    scaler = StandardScaler()
    df_new[numeric_columns] = scaler.fit_transform(df_new[numeric_columns])
    print("Scaling complete!")
else:
    print("No data available after dropping missing values.")


Missing values per column:
now_cost               0
expected_goals         0
expected_assists       0
total_points           0
form                   0
minutes                0
selected_by_percent    0
dtype: int64

Rows with missing values:
Empty DataFrame
Columns: [assists, bonus, bps, chance_of_playing_next_round, chance_of_playing_this_round, clean_sheets, clean_sheets_per_90, code, corners_and_indirect_freekicks_order, corners_and_indirect_freekicks_text, cost_change_event, cost_change_event_fall, cost_change_start, cost_change_start_fall, creativity, creativity_rank, creativity_rank_type, direct_freekicks_order, direct_freekicks_text, dreamteam_count, element_type, ep_next, ep_this, event_points, expected_assists, expected_assists_per_90, expected_goal_involvements, expected_goal_involvements_per_90, expected_goals, expected_goals_conceded, expected_goals_conceded_per_90, expected_goals_per_90, first_name, form, form_rank, form_rank_type, goals_conceded, goals_conceded_per_90, go

In [2]:
from sklearn.cluster import KMeans

# Apply K-Means model (use the same number of clusters as before)
kmeans = KMeans(n_clusters=4, random_state=42)  
df_new['cluster'] = kmeans.fit_predict(df_new[numeric_columns])

# Extract differentials (low-ownership, high potential)
differentials_new = df_new[(df_new['cluster'] == 0) & (df_new['selected_by_percent'] < 10)]

print("\nTop Differential Players (K-Means Clustering):")
print(differentials_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (K-Means Clustering):
      web_name  selected_by_percent  expected_goals  expected_assists  \
5     J.Timber            -0.145128       -0.289604         -0.323728   
12        Rice             0.105909       -0.053176          0.143486   
15      Thomas            -0.128392       -0.112283          0.236929   
18    Trossard            -0.094921        0.242360         -0.074547   
23   Calafiori            -0.145128       -0.388116         -0.479466   
..         ...                  ...             ...               ...   
641   Hee Chan            -0.295751       -0.427521         -0.012252   
644    J.Gomes            -0.362694       -0.171390         -0.199137   
649   Mosquera            -0.195336        0.222658         -0.386023   
650   N.Semedo            -0.362694       -0.407818         -0.043399   
657       Toti            -0.345958       -0.427521          0.268077   

         form  total_points  
5    1.278070      1.058136  
12  -0.578643  

In [3]:
from sklearn.ensemble import IsolationForest

# Apply Isolation Forest model
model = IsolationForest(contamination=0.1)  
df_new['anomaly'] = model.fit_predict(df_new[numeric_columns])

# Extract anomalies (overperforming differentials)
differentials_anomalies_new = df_new[df_new['anomaly'] == -1]

print("\nTop Differential Players (Isolation Forest):")
print(differentials_anomalies_new[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points']])



Top Differential Players (Isolation Forest):
      web_name  selected_by_percent  expected_goals  expected_assists  \
2      Gabriel             2.515869        1.542717         -0.292580   
3      Havertz             2.482397        2.783968         -0.105695   
10    Ødegaard             0.407154        0.478789          1.825457   
11        Raya             4.775207       -0.427521         -0.448318   
13        Saka             4.641320        2.330813          3.974641   
..         ...                  ...             ...               ...   
581     Romero             1.344361        1.010753          0.797586   
585        Son             1.093323        2.291408          2.977918   
600      Bowen             0.641456        0.715217          1.825457   
605  Fabianski             2.248096       -0.427521         -0.479466   
632      Cunha             0.005494        2.409622          2.012342   

         form  total_points  
2    3.528631      3.550584  
3    1.278070    

In [7]:
# Combine differentials from both models
final_differentials = pd.concat([differentials_new, differentials_anomalies_new]).drop_duplicates()

# Rank by xG + xA + form
final_differentials['differential_score'] = final_differentials['expected_goals'] + final_differentials['expected_assists'] + final_differentials['form']

# Apply additional filters:
filtered_differentials = final_differentials[
    (final_differentials['selected_by_percent'] < 5) &  # Ownership < 5%
    (final_differentials['expected_goals'] + final_differentials['expected_assists'] > 5.0) #&  # xG + xA > 5.0
    # (final_differentials['form'] > 2.5)  # Form > 2.5
]

# Sort and pick top 10
top_differentials = filtered_differentials.sort_values(by="differential_score", ascending=False).head(10)

print("\n🔥 Best Differentials for 2024-2025 FPL Gameweek 6 🔥")
print(top_differentials[['web_name', 'selected_by_percent', 'expected_goals', 'expected_assists', 'form', 'total_points', 'differential_score']])



🔥 Best Differentials for 2024-2025 FPL Gameweek 6 🔥
             web_name  selected_by_percent  expected_goals  expected_assists  \
577          Maddison             0.356947        2.370218          8.428748   
386         Luis Díaz             4.490697        4.754206          1.638571   
260            McNeil             0.072438        0.045336          8.117272   
405         De Bruyne             0.976173        1.956468          6.279564   
201         N.Jackson             1.411304        5.581706         -0.043399   
111            Mbeumo             1.879907        3.040099          3.507427   
585               Son             1.093323        2.291408          2.977918   
13               Saka             4.641320        2.330813          3.974641   
61            Watkins             4.440490        5.680218         -0.043399   
371  Alexander-Arnold             4.992772       -0.072878          5.750055   

         form  total_points  differential_score  
577  2.234558   