# Classification with RIPPER
In this notebook we aim to build a classifier using RIPPER. RIPPER is one of the most popular rule based classifier.

In [38]:
import pandas as pd
from os import path
import numpy as np

races_final_path = path.join('..','dataset', 'races_cleaned.csv')
cyclists_final_path = path.join('..','dataset', 'cyclists_cleaned.csv')


cyclists_data = pd.read_csv(cyclists_final_path)
races_data = pd.read_csv(races_final_path)


To begin, we create a target variable named `top_20` based on the column `position`.

In [None]:

cyclists_data.rename(columns={'name': 'cyclist'}, inplace=True)


merged_data = races_data.merge(cyclists_data, left_on='cyclist', right_on='_url', how='inner')

merged_data['top_20'] = merged_data['position'].apply(lambda x: 1 if x <= 20 else 0)


merged_data['date'] = pd.to_datetime(merged_data['date'])

columns_to_keep = [
    'points', 'length', 'climb_total', 'profile', 'startlist_quality', 'cyclist_age',
    'is_tarmac', 'delta', 'top_20', 'weight', 'height'
]


train_set = merged_data[merged_data['date'] < '2022-01-01']
test_set = merged_data[merged_data['date'] >= '2022-01-01']

train_set = train_set[columns_to_keep]
test_set = test_set[columns_to_keep]


In [None]:
#function to discretize the variables
#input: the dataset and the list of variables' names to discretize
def discretize_data(dataset, variables):
    for variable in variables:
        #get the unique variable's values
        var = sorted(dataset[variable].unique())
        
        #generate a mapping from the variable's values to the number representation  
        mapping = dict(zip(var, range(0, len(var) + 1)))

        #add a new colum with the number representation of the variable
        dataset[variable+'_num'] = dataset[variable].map(mapping).astype(int)
    return dataset



In [41]:


train_set = discretize_data(train_set, columns_to_keep)
train_set.drop(columns=columns_to_keep, inplace=True)

test_set = discretize_data(test_set, columns_to_keep)
test_set.drop(columns=columns_to_keep, inplace=True)
train_set.head()

Unnamed: 0,points_num,length_num,climb_total_num,profile_num,startlist_quality_num,cyclist_age_num,is_tarmac_num,delta_num,top_20_num,weight_num,height_num
0,8,618,442,0,563,4,1,69,1,35,18
1,8,618,442,0,563,9,1,69,1,16,13
2,8,618,442,0,563,6,1,69,1,24,12
3,8,618,442,0,563,13,1,69,1,17,12
4,8,618,442,0,563,9,1,69,1,14,10


In [None]:
import wittgenstein as lw
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score


X_train = train_set.drop(columns=['top_20_num'])
y_train = train_set['top_20_num']


X_test = test_set.drop(columns=['top_20_num'])
y_test = test_set['top_20_num']


model = lw.RIPPER()

param_grid = {
    'k': [1, 2],  
    'max_rules': [10, 20], 
    'prune_size': [0.33]  
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,  
    verbose=1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)


report = classification_report(y_test, y_pred)
best_params = grid_search.best_params_
accuracy = accuracy_score(y_test, y_pred)


best_params, accuracy, report

Fitting 3 folds for each of 4 candidates, totalling 12 fits


({'k': 2, 'max_rules': 10, 'prune_size': 0.33},
 0.8470880641699147,
 '              precision    recall  f1-score   support\n\n           0       0.86      0.98      0.92     30219\n           1       0.41      0.10      0.16      5187\n\n    accuracy                           0.85     35406\n   macro avg       0.64      0.54      0.54     35406\nweighted avg       0.80      0.85      0.81     35406\n')

In [44]:
print(report)

              precision    recall  f1-score   support

           0       0.86      0.98      0.92     30219
           1       0.41      0.10      0.16      5187

    accuracy                           0.85     35406
   macro avg       0.64      0.54      0.54     35406
weighted avg       0.80      0.85      0.81     35406



In [45]:
print(best_params)

{'k': 2, 'max_rules': 10, 'prune_size': 0.33}
