In [126]:
import pandas as pd

# Load the combined match data into the working frame.
df = pd.read_csv("combined_data.csv")

# Hold on to the raw result column now: later cells reduce `df` to numeric
# features only, and the final cell re-attaches this Series before saving.
ftr = df["FTR"]

print(f"Original shape: {df.shape}")
df.head()


Original shape: (3900, 179)


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,BMGMCA,BVCH,BVCD,BVCA,CLCH,CLCD,CLCA,LBCH,LBCD,LBCA
0,SP1,15/08/2024,18:00,Ath Bilbao,Getafe,1,1,D,1,0,...,,,,,,,,,,
1,SP1,15/08/2024,20:30,Betis,Girona,1,1,D,1,0,...,,,,,,,,,,
2,SP1,16/08/2024,18:00,Celta,Alaves,2,1,H,0,1,...,,,,,,,,,,
3,SP1,16/08/2024,20:30,Las Palmas,Sevilla,2,2,D,1,1,...,,,,,,,,,,
4,SP1,17/08/2024,18:00,Osasuna,Leganes,1,1,D,0,1,...,,,,,,,,,,


In [127]:
# Summarise the frame (index range, dtype counts, memory usage), then print
# the column labels.
df.info()
print(df.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Columns: 179 entries, Div to LBCA
dtypes: float64(156), int64(16), object(7)
memory usage: 5.3+ MB
Index(['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
       'HTHG', 'HTAG',
       ...
       'BMGMCA', 'BVCH', 'BVCD', 'BVCA', 'CLCH', 'CLCD', 'CLCA', 'LBCH',
       'LBCD', 'LBCA'],
      dtype='object', length=179)


In [128]:
# Remove fully duplicated rows and report how many were dropped.
before = len(df)
df = df.drop_duplicates()
after = len(df)

print(f"Removed {before - after} duplicate rows")
print(f"After duplicates: {df.shape}")


Removed 0 duplicate rows
After duplicates: (3900, 179)


In [129]:
# Keep only the columns that hold a value in at least half of the rows.
# `thresh` is documented as an integer count of required non-null values, so
# the half-way mark is rounded up to an int instead of passing a float such
# as 1950.5. Non-null counts are whole numbers, so the same columns survive.
min_non_null = (len(df) + 1) // 2
df = df.dropna(axis=1, thresh=min_non_null)
print("After dropping very empty columns:", df.shape)

After dropping very empty columns: (3900, 99)


In [130]:
# Partition the remaining columns by dtype: 64-bit numeric vs. object.
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include="object").columns

print(f"Numeric cols: {len(numeric_cols)}")
print(f"Categorical cols: {len(cat_cols)}")

Numeric cols: 92
Categorical cols: 7


In [131]:
# Impute missing values: the column median for numeric columns and the most
# frequent value for object columns. All fill values are gathered first and
# then applied in a single fillna pass.
fill_values = df[numeric_cols].median().to_dict()

for c in cat_cols:
    # mode() is sorted, so [0] picks the first of any tied modes.
    fill_values[c] = df[c].mode()[0]

df = df.fillna(value=fill_values)

print("‚úÖ Missing values handled")


‚úÖ Missing values handled


In [132]:
from sklearn.preprocessing import LabelEncoder


# Encode the FTR result column as integers, then rank every numeric column by
# its correlation with that encoded target.
if "FTR" not in df.columns:
    print("‚ùå FTR column not found in the dataset.")
else:
    # Only encode while FTR is still stored as strings (object dtype).
    if df["FTR"].dtype == "object":
        le = LabelEncoder()
        df["FTREncoded"] = le.fit_transform(df["FTR"])
        print("‚úÖ FTR column encoded as numeric.")

    target = "FTREncoded"
    corr = df.corr(numeric_only=True)

    if target not in corr.columns:
        # Reached when FTR exists but was not object-typed, so nothing was encoded.
        print("‚ö†Ô∏è Target column still not found.")
    else:
        # Sorted descending: head() is the most positive, tail() the most negative.
        corr_values = corr[target].sort_values(ascending=False)
        print("üîπ Highest correlated features:\n", corr_values.head(20))
        print("\nüîπ Lowest correlated features:\n", corr_values.tail(20))



‚úÖ FTR column encoded as numeric.
üîπ Highest correlated features:
 FTREncoded    1.000000
FTHG          0.613832
HTHG          0.413652
HST           0.345653
B365A         0.322149
PSCA          0.319370
PSA           0.318652
BWA           0.311001
WHA           0.310443
IWA           0.287933
VCA           0.278155
AvgA          0.267008
AvgCA         0.262039
B365CA        0.261037
MaxA          0.256867
MaxCA         0.250543
BWCA          0.248894
WHCA          0.242107
B365D         0.160485
WHD           0.158521
Name: FTREncoded, dtype: float64

üîπ Lowest correlated features:
 AS       -0.159995
WHCH     -0.265218
BWCH     -0.271736
MaxH     -0.279543
MaxCH    -0.282421
AvgH     -0.283797
AvgCH    -0.287353
B365CH   -0.287437
IWH      -0.306803
VCH      -0.308749
AHh      -0.322888
AHCh     -0.324831
WHH      -0.334558
PSH      -0.335878
BWH      -0.337023
PSCH     -0.337588
B365H    -0.339821
AST      -0.375232
HTAG     -0.387004
FTAG     -0.615761
Name: FTREncoded, dtyp

In [133]:
# Prune features whose absolute correlation with the encoded target is
# below 0.02.
target = "FTREncoded"
corr = df.corr(numeric_only=True)

if target not in corr.columns:
    print("‚ö†Ô∏è Target column not found in correlation matrix.")
else:
    # Vectorised selection; the target itself is excluded from the candidates.
    target_corr = corr[target].drop(labels=target)
    low_corr_cols = target_corr[target_corr.abs() < 0.02].index.tolist()
    print("Will drop:", low_corr_cols)

    df = df.drop(columns=low_corr_cols)
    print(f"‚úÖ Dropped {len(low_corr_cols)} low-correlation columns.")


Will drop: ['AF', 'AC', 'AY', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA', 'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA']
‚úÖ Dropped 15 low-correlation columns.


In [134]:

# Reduce the frame to features whose absolute correlation with the encoded
# target is at least 0.25, plus the target column itself.
target = "FTREncoded"
corr = df.corr(numeric_only=True)

if target not in corr.columns:
    print("‚ö†Ô∏è Target column not found in correlation matrix.")
else:
    is_strong = corr[target].abs() >= 0.25
    is_target = corr.columns == target
    strong_corr_cols = corr.columns[is_strong.to_numpy() | is_target].tolist()
    print("Will keep:", strong_corr_cols)

    # Take an independent copy so later column assignments do not trigger
    # SettingWithCopy warnings. The string FTR column is not part of this
    # numeric selection; the raw labels remain available in `ftr`, captured
    # when the data was loaded, and are re-attached before saving.
    # NOTE(review): the kept list includes goal columns (FTHG, FTAG, HTHG,
    # HTAG) that presumably determine FTR directly - confirm this is intended
    # before using the frame to predict the result.
    df = df[strong_corr_cols].copy()

    print(f"‚úÖ Kept {len(strong_corr_cols)} highly correlated features.")

Will keep: ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HST', 'AST', 'B365H', 'B365A', 'BWH', 'BWA', 'PSH', 'PSA', 'WHH', 'WHA', 'MaxH', 'MaxA', 'AvgH', 'AvgA', 'AHh', 'B365CH', 'B365CA', 'BWCH', 'PSCH', 'PSCA', 'WHCH', 'MaxCH', 'MaxCA', 'AvgCH', 'AvgCA', 'AHCh', 'IWH', 'IWA', 'VCH', 'VCA', 'FTREncoded']
‚úÖ Kept 35 highly correlated features.


In [135]:
from sklearn.preprocessing import LabelEncoder


# Integer-encode any object columns still present in the frame.
cat_cols = df.select_dtypes(include=["object"]).columns

# One encoder per column, stored by column name, so every mapping can be
# inverted later. A dedicated dict is used instead of rebinding `le`: reusing
# a single encoder would keep only the last column's classes and would throw
# away the fitted FTR encoder created earlier in the notebook.
label_encoders = {}
for c in cat_cols:
    label_encoders[c] = LabelEncoder()
    df[c] = label_encoders[c].fit_transform(df[c])

print("‚úÖ Categorical columns encoded")


‚úÖ Categorical columns encoded


In [136]:
from sklearn.preprocessing import StandardScaler

# Standardise the surviving numeric features with StandardScaler.
scaler = StandardScaler()

# `numeric_cols` was built before feature selection, so keep only the names
# that still exist. FTREncoded was added after that list was computed and is
# therefore left unscaled.
numeric_cols = [name for name in numeric_cols if name in df.columns]

if not numeric_cols:
    print("‚ö†Ô∏è No numeric columns found to normalize")
else:
    scaled_values = scaler.fit_transform(df[numeric_cols])
    df[numeric_cols] = scaled_values
    print("‚úÖ Numeric columns normalized")


‚úÖ Numeric columns normalized


In [137]:
# Drop rows where any of the selected match-statistic columns lies more than
# 3 * IQR outside the inter-quartile range.
possible_cols = ["FTHG", "FTAG", "HTHG", "HTAG", "HS", "AS", "HST", "AST"]
# Earlier feature selection may have removed some of these; use what is left.
outlier_cols = [c for c in possible_cols if c in df.columns]

print("Outlier check on:", outlier_cols)

if outlier_cols:
    Q1 = df[outlier_cols].quantile(0.25)
    Q3 = df[outlier_cols].quantile(0.75)
    IQR = Q3 - Q1

    # Per-column fences; comparisons below align on column labels.
    lower_fence = Q1 - 3 * IQR
    upper_fence = Q3 + 3 * IQR
    candidate_values = df[outlier_cols]
    is_outlier_row = (
        (candidate_values < lower_fence) | (candidate_values > upper_fence)
    ).any(axis=1)

    before = df.shape[0]
    # .copy() detaches the filtered result from its parent frame so that later
    # column assignments (such as re-attaching FTR) write to a real frame
    # instead of a slice, which would raise SettingWithCopyWarning.
    df = df.loc[~is_outlier_row].copy()
    after = df.shape[0]
    print("Removed", before - after, "outliers")
else:
    print("No outliers step applied")


Outlier check on: ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HST', 'AST']
Removed 37 outliers


In [138]:
# Preview the first rows of the feature-selected, scaled frame.
df.head()

Unnamed: 0,FTHG,FTAG,HTHG,HTAG,HST,AST,B365H,B365A,BWH,BWA,...,MaxCH,MaxCA,AvgCH,AvgCA,AHCh,IWH,IWA,VCH,VCA,FTREncoded
0,-0.381231,-0.121387,0.410169,-0.691476,-0.260731,-0.764263,-0.632409,0.79015,-0.682781,0.79776,...,-0.742045,0.958909,-0.755015,1.115374,-1.21914,-0.238407,-0.27108,-0.258239,-0.267293,1
1,-0.381231,-0.121387,0.410169,-0.691476,-0.260731,-0.764263,-0.181811,-0.420159,-0.143598,-0.420752,...,-0.119511,-0.416052,-0.126615,-0.39986,0.062436,-0.238407,-0.27108,-0.258239,-0.267293,1
2,0.399242,-0.121387,-0.787181,0.715283,-0.260731,-0.764263,-0.363086,-0.153891,-0.340175,-0.204974,...,-0.369956,-0.117983,-0.333325,-0.101524,0.062436,-0.238407,-0.27108,-0.258239,-0.267293,2
3,0.399242,0.780346,0.410169,0.715283,0.139446,0.629421,0.051256,-0.509722,0.109144,-0.522295,...,0.080845,-0.499382,0.129706,-0.505848,0.489628,-0.238407,-0.27108,-0.258239,-0.267293,1
4,-0.381231,-0.121387,-0.787181,0.715283,0.139446,0.16486,-0.440775,-0.057066,-0.469354,-0.001889,...,-0.541689,0.260211,-0.523499,0.314577,-0.364756,-0.238407,-0.27108,-0.258239,-0.267293,1


In [139]:
# Re-attach the raw result labels and persist the cleaned frame.
output_path = "combined_data_clean.csv"

# `ftr` still carries the original row index; reindexing onto the current
# index keeps only the labels of rows that survived cleaning.
df["FTR"] = ftr.reindex(df.index)
df.to_csv(output_path, index=False)

print(f"Saved cleaned data as {output_path}")
print(f"Final shape: {df.shape}")


Saved cleaned data as combined_data_clean.csv
Final shape: (3863, 36)


In [140]:
# Final schema check: dtypes and non-null counts of the frame that was saved.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3863 entries, 0 to 3899
Data columns (total 36 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   FTHG        3863 non-null   float64
 1   FTAG        3863 non-null   float64
 2   HTHG        3863 non-null   float64
 3   HTAG        3863 non-null   float64
 4   HST         3863 non-null   float64
 5   AST         3863 non-null   float64
 6   B365H       3863 non-null   float64
 7   B365A       3863 non-null   float64
 8   BWH         3863 non-null   float64
 9   BWA         3863 non-null   float64
 10  PSH         3863 non-null   float64
 11  PSA         3863 non-null   float64
 12  WHH         3863 non-null   float64
 13  WHA         3863 non-null   float64
 14  MaxH        3863 non-null   float64
 15  MaxA        3863 non-null   float64
 16  AvgH        3863 non-null   float64
 17  AvgA        3863 non-null   float64
 18  AHh         3863 non-null   float64
 19  B365CH      3863 non-null   floa