In [169]:
import pandas as pd

df = pd.read_csv("../data/raw/ufc_fights_raw.csv")
df.shape, df["event_url"].nunique()


((8482, 16), 756)

In [170]:
import pandas as pd
import numpy as np

# Load fights (with post-fight stats)
fights = pd.read_csv("../data/raw/ufc_fights_raw.csv")

# Load fighter index (name ↔ profile URL)
fighters_index = pd.read_csv("../data/raw/fighters_index.csv")

# Load fighter profiles (profile URL ↦ height/reach/SLpM/etc.)
fighter_profiles = pd.read_csv("../data/raw/fighter_profiles_raw.csv")

fights.shape, fighters_index.shape, fighter_profiles.shape


((8482, 16), (500, 2), (500, 15))

In [171]:
# Build a per-fighter fight history (long format: one row per fighter per fight)
fights = fights.copy().reset_index().rename(columns={"index": "fight_index"})

rows = []

for _, row in fights.iterrows():
    # Figure out the winner label for this fight
    winner = row["winner"]  # "Red", "Blue", or maybe None

    # Red fighter row
    if winner == "Red":
        red_result = 1
        blue_result = 0
    elif winner == "Blue":
        red_result = 0
        blue_result = 1
    else:
        # No clear winner (draw, NC, etc.) – we won't use these rows for history stats
        red_result = np.nan
        blue_result = np.nan

    rows.append({
        "fight_url": row["fight_url"],
        "fighter_name": row["red_fighter"],
        "is_red": 1,
        "opponent_name": row["blue_fighter"],
        "fight_index": row["fight_index"],
        "result": red_result,
    })

    # Blue fighter row
    rows.append({
        "fight_url": row["fight_url"],
        "fighter_name": row["blue_fighter"],
        "is_red": 0,
        "opponent_name": row["red_fighter"],
        "fight_index": row["fight_index"],
        "result": blue_result,
    })

fighter_history = pd.DataFrame(rows)
fighter_history.head()


Unnamed: 0,fight_url,fighter_name,is_red,opponent_name,fight_index,result
0,http://ufcstats.com/fight-details/4a0db214d972...,Merab Dvalishvili,1,Petr Yan,0,
1,http://ufcstats.com/fight-details/4a0db214d972...,Petr Yan,0,Merab Dvalishvili,0,
2,http://ufcstats.com/fight-details/dfa692db6d39...,Alexandre Pantoja,1,Joshua Van,1,
3,http://ufcstats.com/fight-details/dfa692db6d39...,Joshua Van,0,Alexandre Pantoja,1,
4,http://ufcstats.com/fight-details/fbbb9e72900b...,Brandon Moreno,1,Tatsuro Taira,2,


In [172]:
def add_recent_form(group: pd.DataFrame) -> pd.DataFrame:
    # Sort by fight_index to approximate chronology
    group = group.sort_values("fight_index").copy()

    r = group["result"].astype(float)

    # Pre-fight cumulative stats (shift(1) so we don't include current fight)
    group["prev_fights"] = r.shift(1).expanding().count()
    group["prev_wins"] = r.shift(1).expanding().sum()

    # Overall pre-fight win % (avoid 0/0)
    group["prev_win_pct"] = group["prev_wins"] / group["prev_fights"]
    group.loc[group["prev_fights"] == 0, "prev_win_pct"] = 0.5  # neutral

    # Recent form (last 3/5 fights); if no history, call it neutral too
    group["prev_win_pct_3"] = r.shift(1).rolling(window=3, min_periods=1).mean()
    group["prev_win_pct_5"] = r.shift(1).rolling(window=5, min_periods=1).mean()

    group["prev_win_pct_3"] = group["prev_win_pct_3"].fillna(0.5)
    group["prev_win_pct_5"] = group["prev_win_pct_5"].fillna(0.5)

    return group


# Apply per-fighter
fighter_history_with_form = (
    fighter_history
    .dropna(subset=["result"])  # drop fights with no clear winner
    .groupby("fighter_name", group_keys=False)
    .apply(add_recent_form)
)

fighter_history_with_form.head()


Unnamed: 0,fight_url,fighter_name,is_red,opponent_name,fight_index,result,prev_fights,prev_wins,prev_win_pct,prev_win_pct_3,prev_win_pct_5
28,http://ufcstats.com/fight-details/5f5b626e6752...,Arman Tsarukyan,1,Dan Hooker,14,1.0,0.0,,0.5,0.5,0.5
29,http://ufcstats.com/fight-details/5f5b626e6752...,Dan Hooker,0,Arman Tsarukyan,14,0.0,0.0,,0.5,0.5,0.5
30,http://ufcstats.com/fight-details/b2218930b982...,Ian Machado Garry,1,Belal Muhammad,15,0.0,0.0,,0.5,0.5,0.5
31,http://ufcstats.com/fight-details/b2218930b982...,Belal Muhammad,0,Ian Machado Garry,15,1.0,0.0,,0.5,0.5,0.5
32,http://ufcstats.com/fight-details/870d374f8d9a...,Volkan Oezdemir,1,Alonzo Menifield,16,1.0,0.0,,0.5,0.5,0.5


In [173]:
# Split back into red and blue rows, then merge to main fights table

# Red side (is_red = 1)
red_form = (
    fighter_history_with_form[fighter_history_with_form["is_red"] == 1][
        ["fight_url", "prev_fights", "prev_win_pct", "prev_win_pct_3", "prev_win_pct_5"]
    ]
    .rename(columns={
        "prev_fights": "red_prev_fights",
        "prev_win_pct": "red_prev_win_pct",
        "prev_win_pct_3": "red_prev_win_pct_3",
        "prev_win_pct_5": "red_prev_win_pct_5",
    })
)

# Blue side (is_red = 0)
blue_form = (
    fighter_history_with_form[fighter_history_with_form["is_red"] == 0][
        ["fight_url", "prev_fights", "prev_win_pct", "prev_win_pct_3", "prev_win_pct_5"]
    ]
    .rename(columns={
        "prev_fights": "blue_prev_fights",
        "prev_win_pct": "blue_prev_win_pct",
        "prev_win_pct_3": "blue_prev_win_pct_3",
        "prev_win_pct_5": "blue_prev_win_pct_5",
    })
)

# Merge onto fights by fight_url
fights = (
    fights
    .merge(red_form, on="fight_url", how="left")
    .merge(blue_form, on="fight_url", how="left")
)

fights[[
    "red_fighter", "blue_fighter",
    "red_prev_fights", "blue_prev_fights",
    "red_prev_win_pct_5", "blue_prev_win_pct_5"
]].head()


Unnamed: 0,red_fighter,blue_fighter,red_prev_fights,blue_prev_fights,red_prev_win_pct_5,blue_prev_win_pct_5
0,Merab Dvalishvili,Petr Yan,,,,
1,Alexandre Pantoja,Joshua Van,,,,
2,Brandon Moreno,Tatsuro Taira,,,,
3,Henry Cejudo,Payton Talbott,,,,
4,Jan Blachowicz,Bogdan Guskov,,,,


In [174]:
# Map fighter_name -> fighter_url using the index
name_to_url = (
    fighters_index
    .drop_duplicates("fighter_name")
    .set_index("fighter_name")["fighter_url"]
)

# Attach URLs to fights
fights["red_fighter_url"] = fights["red_fighter"].map(name_to_url)
fights["blue_fighter_url"] = fights["blue_fighter"].map(name_to_url)

fights[["red_fighter", "red_fighter_url", "blue_fighter", "blue_fighter_url"]].head()


Unnamed: 0,red_fighter,red_fighter_url,blue_fighter,blue_fighter_url
0,Merab Dvalishvili,http://ufcstats.com/fighter-details/c03520b5c8...,Petr Yan,http://ufcstats.com/fighter-details/d661ce4da7...
1,Alexandre Pantoja,http://ufcstats.com/fighter-details/a0f0004aad...,Joshua Van,http://ufcstats.com/fighter-details/17e9764940...
2,Brandon Moreno,http://ufcstats.com/fighter-details/792be9a24d...,Tatsuro Taira,http://ufcstats.com/fighter-details/4461d7e473...
3,Henry Cejudo,http://ufcstats.com/fighter-details/056c493bbd...,Payton Talbott,http://ufcstats.com/fighter-details/6e743a33d5...
4,Jan Blachowicz,http://ufcstats.com/fighter-details/99df7d0a2a...,Bogdan Guskov,http://ufcstats.com/fighter-details/ef5dcb10d2...


In [175]:
fights_with_urls = fights.dropna(subset=["red_fighter_url", "blue_fighter_url"]).copy()

fights.shape, fights_with_urls.shape


((8482, 27), (1147, 27))

In [176]:
# Drop str_def for now since it's all NaN
fighter_profiles = fighter_profiles.drop(columns=["str_def"])

# Red profiles: prefix all columns with 'red_' except URL
red_profiles = fighter_profiles.rename(
    columns=lambda c: f"red_{c}" if c != "fighter_url" else "red_fighter_url"
)

# Blue profiles: prefix all columns with 'blue_' except URL
blue_profiles = fighter_profiles.rename(
    columns=lambda c: f"blue_{c}" if c != "fighter_url" else "blue_fighter_url"
)

red_profiles.head()


Unnamed: 0,red_fighter_url,red_fighter_name,red_height_in,red_reach_in,red_stance,red_dob,red_slpm,red_sapm,red_str_acc,red_td_avg,red_td_acc,red_td_def,red_sub_avg,red_fighter_name_index
0,http://ufcstats.com/fighter-details/c03520b5c8...,Merab Dvalishvili,66.0,68.0,Orthodox,"Jan 10, 1991",4.33,2.55,42.0,6.4,37.0,82.0,0.3,Merab Dvalishvili
1,http://ufcstats.com/fighter-details/d661ce4da7...,Petr Yan,67.0,67.0,Switch,"Feb 11, 1993",5.12,4.14,54.0,1.58,48.0,85.0,0.1,Petr Yan
2,http://ufcstats.com/fighter-details/a0f0004aad...,Alexandre Pantoja,65.0,67.0,Orthodox,"Apr 16, 1990",4.36,3.88,50.0,2.8,47.0,69.0,1.0,Alexandre Pantoja
3,http://ufcstats.com/fighter-details/17e9764940...,Joshua Van,65.0,65.0,Orthodox,"Oct 10, 2001",8.86,6.36,56.0,0.85,63.0,81.0,0.0,Joshua Van
4,http://ufcstats.com/fighter-details/792be9a24d...,Brandon Moreno,67.0,70.0,Orthodox,"Dec 07, 1993",3.96,3.62,44.0,1.51,44.0,64.0,0.4,Brandon Moreno


In [177]:
merged = (
    fights_with_urls
    .merge(red_profiles, on="red_fighter_url", how="inner")
    .merge(blue_profiles, on="blue_fighter_url", how="inner")
)

merged.shape


(1147, 53)

In [178]:
required_cols = [
    "red_height_in", "blue_height_in",
    "red_reach_in", "blue_reach_in",
    "red_slpm", "blue_slpm",
    "red_sapm", "blue_sapm",
    "red_str_acc", "blue_str_acc",
    "red_td_avg", "blue_td_avg",
    "red_td_acc", "blue_td_acc",
    "red_td_def", "blue_td_def",
    "red_sub_avg", "blue_sub_avg",
    "winner"
]

print("Missing values before filtering:")
print(merged[required_cols].isna().sum())

filtered = merged.dropna(subset=required_cols).copy()

print("\nRemaining rows after filtering:", len(filtered))


Missing values before filtering:
red_height_in      1
blue_height_in     0
red_reach_in       1
blue_reach_in      0
red_slpm           0
blue_slpm          0
red_sapm           0
blue_sapm          0
red_str_acc        0
blue_str_acc       0
red_td_avg         0
blue_td_avg        0
red_td_acc         0
blue_td_acc        0
red_td_def         0
blue_td_def        0
red_sub_avg        0
blue_sub_avg       0
winner            32
dtype: int64

Remaining rows after filtering: 1114


In [179]:
# Start from merged with profiles
model_df = filtered.copy()

# Keep only fights where we know who won (Red/Blue only)
model_df = model_df[model_df["winner"].isin(["Red", "Blue"])].copy()

# Target: 1 if Red wins, 0 if Blue wins
model_df["target"] = (model_df["winner"] == "Red").astype(int)

model_df[["red_fighter", "blue_fighter", "winner", "target"]].head()


Unnamed: 0,red_fighter,blue_fighter,winner,target
1,Merab Dvalishvili,Petr Yan,Blue,0
2,Aljamain Sterling,Petr Yan,Red,1
3,Aljamain Sterling,Petr Yan,Blue,0
4,Sean O'Malley,Petr Yan,Blue,0
5,Merab Dvalishvili,Cory Sandhagen,Red,1


In [180]:
filtered.shape, model_df.shape, model_df["target"].mean()


((1114, 53), (1114, 54), 0.5700179533213644)

In [None]:
# Convert DOB and event date
model_df["dob_red"] = pd.to_datetime(model_df["red_dob"], errors="coerce")
model_df["dob_blue"] = pd.to_datetime(model_df["blue_dob"], errors="coerce")
model_df["event_date"] = pd.to_datetime(model_df["event_date"], errors="coerce")

# Age in years at fight date
model_df["red_age"] = (model_df["event_date"] - model_df["dob_red"]).dt.days / 365.25
model_df["blue_age"] = (model_df["event_date"] - model_df["dob_blue"]).dt.days / 365.25

# Age difference
model_df["age_diff"] = model_df["red_age"] - model_df["blue_age"]

model_df[["red_age", "blue_age", "age_diff"]].head()


Unnamed: 0,red_age,blue_age,age_diff
1,34.893908,32.804928,2.08898
2,36.339493,32.804928,3.534565
3,36.339493,32.804928,3.534565
4,31.107461,32.804928,-1.697467
5,34.893908,33.61807,1.275838


In [182]:
stance_map = {
    "Orthodox": 0,
    "Southpaw": 1,
    "Switch": 2
}

model_df["red_stance_code"] = model_df["red_stance"].map(stance_map).fillna(-1)
model_df["blue_stance_code"] = model_df["blue_stance"].map(stance_map).fillna(-1)

# stance_diff = red stance - blue stance
model_df["stance_diff"] = model_df["red_stance_code"] - model_df["blue_stance_code"]

model_df[["red_stance", "blue_stance", "stance_diff"]].head()


Unnamed: 0,red_stance,blue_stance,stance_diff
1,Orthodox,Switch,-2.0
2,Orthodox,Switch,-2.0
3,Orthodox,Switch,-2.0
4,Switch,Switch,0.0
5,Orthodox,Switch,-2.0


In [183]:
# Count how many total fights each fighter has appeared in (red or blue)
red_counts = model_df["red_fighter"].value_counts()
blue_counts = model_df["blue_fighter"].value_counts()

# Combine counts
fight_counts = red_counts.add(blue_counts, fill_value=0).astype(int)

fight_counts.head()


Aaron Pico                1
Abdul Rakhman Yakhyaev    1
Abus Magomedov            5
Adam Fugitt               3
Aiemann Zahabi            3
Name: count, dtype: int32

In [184]:
model_df["red_num_fights"] = model_df["red_fighter"].map(fight_counts)
model_df["blue_num_fights"] = model_df["blue_fighter"].map(fight_counts)

model_df["experience_diff"] = model_df["red_num_fights"] - model_df["blue_num_fights"]

model_df[[
    "red_fighter", "red_num_fights",
    "blue_fighter", "blue_num_fights",
    "experience_diff"
]].head()


Unnamed: 0,red_fighter,red_num_fights,blue_fighter,blue_num_fights,experience_diff
1,Merab Dvalishvili,7,Petr Yan,7,0
2,Aljamain Sterling,8,Petr Yan,7,1
3,Aljamain Sterling,8,Petr Yan,7,1
4,Sean O'Malley,7,Petr Yan,7,0
5,Merab Dvalishvili,7,Cory Sandhagen,8,-1


In [185]:
# Offensive striking index: SLpM * (Striking Accuracy %)
model_df["red_off_striking"] = model_df["red_slpm"] * (model_df["red_str_acc"] / 100.0)
model_df["blue_off_striking"] = model_df["blue_slpm"] * (model_df["blue_str_acc"] / 100.0)

# Grappling efficiency index: TD AVG * (TD ACC %) * (TD DEF %)
model_df["red_grappling_index"] = (
    model_df["red_td_avg"] *
    (model_df["red_td_acc"] / 100.0) *
    (model_df["red_td_def"] / 100.0)
)

model_df["blue_grappling_index"] = (
    model_df["blue_td_avg"] *
    (model_df["blue_td_acc"] / 100.0) *
    (model_df["blue_td_def"] / 100.0)
)

# Damage margin: SLpM - SApM (more you hit than get hit)
model_df["red_damage_margin"] = model_df["red_slpm"] - model_df["red_sapm"]
model_df["blue_damage_margin"] = model_df["blue_slpm"] - model_df["blue_sapm"]

model_df[[
    "red_fighter", "blue_fighter",
    "red_off_striking", "blue_off_striking",
    "red_grappling_index", "blue_grappling_index",
    "red_damage_margin", "blue_damage_margin"
]].head()


Unnamed: 0,red_fighter,blue_fighter,red_off_striking,blue_off_striking,red_grappling_index,blue_grappling_index,red_damage_margin,blue_damage_margin
1,Merab Dvalishvili,Petr Yan,1.8186,2.7648,1.94176,0.64464,1.78,0.98
2,Aljamain Sterling,Petr Yan,2.314,2.7648,0.3234,0.64464,2.24,0.98
3,Aljamain Sterling,Petr Yan,2.314,2.7648,0.3234,0.64464,2.24,0.98
4,Sean O'Malley,Petr Yan,3.843,2.7648,0.069174,0.64464,2.82,0.98
5,Merab Dvalishvili,Cory Sandhagen,1.8186,2.187,1.94176,0.21896,1.78,1.39


In [None]:
# Physical differences
model_df["height_diff"] = model_df["red_height_in"] - model_df["blue_height_in"]
model_df["reach_diff"] = model_df["red_reach_in"] - model_df["blue_reach_in"]

# Striking volume/damage (per minute)
model_df["slpm_diff"] = model_df["red_slpm"] - model_df["blue_slpm"]
model_df["sapm_diff"] = model_df["red_sapm"] - model_df["blue_sapm"]

# Wrestling / grappling raw stats
model_df["td_avg_diff"] = model_df["red_td_avg"] - model_df["blue_td_avg"]
model_df["td_acc_diff"] = model_df["red_td_acc"] - model_df["blue_td_acc"]
model_df["td_def_diff"] = model_df["red_td_def"] - model_df["blue_td_def"]
model_df["sub_avg_diff"] = model_df["red_sub_avg"] - model_df["blue_sub_avg"]

# Advanced diffs
model_df["off_striking_diff"] = model_df["red_off_striking"] - model_df["blue_off_striking"]
model_df["grappling_index_diff"] = model_df["red_grappling_index"] - model_df["blue_grappling_index"]
model_df["damage_margin_diff"] = model_df["red_damage_margin"] - model_df["blue_damage_margin"]

# Recent form differences
model_df["prev_fights_diff"] = model_df["red_prev_fights"] - model_df["blue_prev_fights"]
model_df["prev_win_pct_diff"] = model_df["red_prev_win_pct"] - model_df["blue_prev_win_pct"]
model_df["prev_win_pct_3_diff"] = model_df["red_prev_win_pct_3"] - model_df["blue_prev_win_pct_3"]
model_df["prev_win_pct_5_diff"] = model_df["red_prev_win_pct_5"] - model_df["blue_prev_win_pct_5"]

# Encode weight class
model_df["weight_class_encoded"] = model_df["weight_class"].astype("category").cat.codes

model_df[[
    "red_fighter", "blue_fighter",
    "height_diff", "reach_diff",
    "slpm_diff", "sapm_diff",
    "td_avg_diff", "td_acc_diff", "td_def_diff", "sub_avg_diff",
    "off_striking_diff", "grappling_index_diff", "damage_margin_diff"
]].head()


Unnamed: 0,red_fighter,blue_fighter,height_diff,reach_diff,slpm_diff,sapm_diff,td_avg_diff,td_acc_diff,td_def_diff,sub_avg_diff,off_striking_diff,grappling_index_diff,damage_margin_diff
1,Merab Dvalishvili,Petr Yan,-1.0,1.0,-0.79,-1.59,4.82,-11.0,-3.0,0.2,-0.9462,1.29712,0.8
2,Aljamain Sterling,Petr Yan,0.0,4.0,-0.67,-1.93,0.87,-18.0,-41.0,0.5,-0.4508,-0.32124,1.26
3,Aljamain Sterling,Petr Yan,0.0,4.0,-0.67,-1.93,0.87,-18.0,-41.0,0.5,-0.4508,-0.32124,1.26
4,Sean O'Malley,Petr Yan,4.0,5.0,1.18,-0.66,-1.31,-6.0,-24.0,0.2,1.0782,-0.575466,1.84
5,Merab Dvalishvili,Cory Sandhagen,-5.0,-2.0,-0.53,-0.92,5.25,3.0,26.0,0.1,-0.3684,1.7228,0.39


In [None]:
feature_cols = [
    "height_diff",
    "reach_diff",
    "slpm_diff",
    "sapm_diff",
    "td_avg_diff",
    "td_acc_diff",
    "td_def_diff",
    "sub_avg_diff",
    "age_diff",
    "stance_diff",
    "experience_diff",
    "off_striking_diff",
    "grappling_index_diff",
    "damage_margin_diff",
    "prev_fights_diff",
    "prev_win_pct_diff",
    "prev_win_pct_3_diff",
    "prev_win_pct_5_diff",
    "weight_class_encoded",
]

print("Feature NaNs before filling:\n", model_df[feature_cols].isna().sum())

# Simple, safe imputation:
# - For fights with no prior history, treat win% as neutral (0.5)
# - For anything else weird, fall back to 0.0 so the model can learn around it

# Fill recent-form NaNs with neutral values
for col in [
    "prev_fights_diff",
    "prev_win_pct_diff",
    "prev_win_pct_3_diff",
    "prev_win_pct_5_diff",
]:
    model_df[col] = model_df[col].fillna(0.0)  # diffs: 0.0 = "no difference in form"

# If any of the other feature cols still have NaNs (from parsing issues etc.), set them to 0.0 too
for col in feature_cols:
    model_df[col] = model_df[col].fillna(0.0)

print("\nFeature NaNs after filling:\n", model_df[feature_cols].isna().sum())
model_df.shape


Feature NaNs before filling:
 height_diff             0
reach_diff              0
slpm_diff               0
sapm_diff               0
td_avg_diff             0
td_acc_diff             0
td_def_diff             0
sub_avg_diff            0
age_diff                0
stance_diff             0
experience_diff         0
off_striking_diff       0
grappling_index_diff    0
damage_margin_diff      0
prev_fights_diff        0
prev_win_pct_diff       0
prev_win_pct_3_diff     0
prev_win_pct_5_diff     0
dtype: int64

Feature NaNs after filling:
 height_diff             0
reach_diff              0
slpm_diff               0
sapm_diff               0
td_avg_diff             0
td_acc_diff             0
td_def_diff             0
sub_avg_diff            0
age_diff                0
stance_diff             0
experience_diff         0
off_striking_diff       0
grappling_index_diff    0
damage_margin_diff      0
prev_fights_diff        0
prev_win_pct_diff       0
prev_win_pct_3_diff     0
prev_win_pct_5_di

(1114, 84)

In [188]:
from sklearn.model_selection import train_test_split

X = model_df[feature_cols]
y = model_df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

X_train.shape, X_test.shape


((891, 18), (223, 18))

In [189]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

pre_log_reg = LogisticRegression(max_iter=1000)
pre_log_reg.fit(X_train, y_train)

y_pred = pre_log_reg.predict(X_test)

print("Pre-fight Logistic Regression accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n")
print(classification_report(y_test, y_pred))


Pre-fight Logistic Regression accuracy: 0.5964125560538116

Classification report:

              precision    recall  f1-score   support

           0       0.54      0.43      0.48        96
           1       0.63      0.72      0.67       127

    accuracy                           0.60       223
   macro avg       0.58      0.58      0.57       223
weighted avg       0.59      0.60      0.59       223



In [190]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create and train the Random Forest model
rf = RandomForestClassifier(
    n_estimators=300,      # number of trees
    max_depth=6,          # restrict depth to reduce overfitting
    random_state=42
)

rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf.predict(X_test)

# Evaluate the model
print("RandomForest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification report:\n")
print(classification_report(y_test, y_pred_rf))


RandomForest Accuracy: 0.6053811659192825

Classification report:

              precision    recall  f1-score   support

           0       0.56      0.38      0.45        96
           1       0.62      0.78      0.69       127

    accuracy                           0.61       223
   macro avg       0.59      0.58      0.57       223
weighted avg       0.60      0.61      0.59       223



In [191]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

gb = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=3,
    random_state=42
)

gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)

print("GradientBoosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print("\nClassification report:\n")
print(classification_report(y_test, y_pred_gb))


GradientBoosting Accuracy: 0.57847533632287

Classification report:

              precision    recall  f1-score   support

           0       0.51      0.42      0.46        96
           1       0.61      0.70      0.65       127

    accuracy                           0.58       223
   macro avg       0.56      0.56      0.56       223
weighted avg       0.57      0.58      0.57       223



In [192]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# XGBoost is a boosted tree model that builds trees sequentially,
# each one correcting the mistakes of the previous ones.
xgb_model = XGBClassifier(
    n_estimators=400,      # number of trees
    learning_rate=0.05,   # smaller = slower but more precise learning
    max_depth=3,          # shallow trees to reduce overfitting
    subsample=0.9,        # use 90% of rows per tree
    colsample_bytree=0.9, # use 90% of features per tree
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)

# Train on the same train set
xgb_model.fit(X_train, y_train)

# Predict on test set
y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nClassification report:\n")
print(classification_report(y_test, y_pred_xgb))


XGBoost Accuracy: 0.5695067264573991

Classification report:

              precision    recall  f1-score   support

           0       0.50      0.41      0.45        96
           1       0.61      0.69      0.65       127

    accuracy                           0.57       223
   macro avg       0.55      0.55      0.55       223
weighted avg       0.56      0.57      0.56       223

