In [1]:
!git clone https://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@github.com/DadeOrsu/dm_project24_group_6.git

Cloning into 'dm_project24_group_6'...
remote: Enumerating objects: 1258, done.[K
remote: Counting objects: 100% (287/287), done.[K
remote: Compressing objects: 100% (188/188), done.[K
remote: Total 1258 (delta 188), reused 170 (delta 88), pack-reused 971 (from 1)[K
Receiving objects: 100% (1258/1258), 51.42 MiB | 16.41 MiB/s, done.
Resolving deltas: 100% (837/837), done.


In [2]:
cd dm_project24_group_6/src/task4_machine_learning/

/content/dm_project24_group_6/src/task4_machine_learning


# Classification with RIPPER
In this notebook we aim to build a classifier using `RIPPER`. `RIPPER` is one of the most popular rule based classifier that allows us to build a classifier using rules. The rules are built using the training data and then the rules are used to classify the test data.

In [1]:
import pandas as pd
from os import path
import numpy as np
from preprocessing import get_train_test_data

X_train, y_train, X_test, y_test, columns_to_keep = get_train_test_data()

Now we need to discretize the data to make it suitable for RIPPER. We will use the `discretize_data` function to discretize the data.

Finally, we will train the RIPPER classifier using the discretized data.

In [8]:
#function to discretize the variables
#input: the dataset and the list of variables' names to discretize
def discretize_data(dataset, variables):
    for variable in variables:
        #get the unique variable's values
        var = sorted(dataset[variable].unique())

        #generate a mapping from the variable's values to the number representation
        mapping = dict(zip(var, range(0, len(var) + 1)))

        #add a new colum with the number representation of the variable
        dataset[variable+'_num'] = dataset[variable].map(mapping).astype(int)
    return dataset

Here we discretize both the training and the test data. We then train the RIPPER classifier using the discretized training data and then use the classifier to classify the test data.

In [9]:
X_train = discretize_data(X_train, [col for col in columns_to_keep if col != "top_20"])

X_test = discretize_data(X_test, [col for col in columns_to_keep if col != "top_20"])

X_train.head()

Unnamed: 0,bmi,career_points,career_duration(days),debut_year,difficulty_score,competitive_age,is_tarmac,climbing_efficiency,startlist_quality,avg_pos,bmi_num,career_points_num,career_duration(days)_num,debut_year_num,difficulty_score_num,competitive_age_num,is_tarmac_num,climbing_efficiency_num,startlist_quality_num,avg_pos_num
0,23.765432,0.0,0.0,1977.0,0.635375,22,True,0.006796,1241,0.0,510,0,0,7,483,4,1,394,563,0
1,20.897959,0.0,0.0,1974.0,0.635375,27,True,0.006796,1241,0.0,254,0,0,4,483,9,1,394,563,0
2,22.790329,0.0,0.0,1977.0,0.635375,24,True,0.006796,1241,0.0,437,0,0,7,483,6,1,394,563,0
3,21.46915,0.0,0.0,1970.0,0.635375,30,True,0.006796,1241,0.0,309,0,0,0,483,12,1,394,563,0
4,21.295295,0.0,0.0,1977.0,0.635375,27,True,0.006796,1241,0.0,293,0,0,7,483,9,1,394,563,0


Since the new columns terminate with "_num", we update the variable `columns_to_keep`

In [10]:
columns_to_keep = [col + "_num" for col in columns_to_keep if col != "top_20"]
columns_to_keep

['bmi_num',
 'career_points_num',
 'career_duration(days)_num',
 'debut_year_num',
 'difficulty_score_num',
 'competitive_age_num',
 'is_tarmac_num',
 'climbing_efficiency_num',
 'startlist_quality_num',
 'avg_pos_num']

In [11]:
X_train = X_train[columns_to_keep]
X_test = X_test[columns_to_keep]

In [12]:
X_train.head()

Unnamed: 0,bmi_num,career_points_num,career_duration(days)_num,debut_year_num,difficulty_score_num,competitive_age_num,is_tarmac_num,climbing_efficiency_num,startlist_quality_num,avg_pos_num
0,510,0,0,7,483,4,1,394,563,0
1,254,0,0,4,483,9,1,394,563,0
2,437,0,0,7,483,6,1,394,563,0
3,309,0,0,0,483,12,1,394,563,0
4,293,0,0,7,483,9,1,394,563,0


# Hyperparameter search

Since the dataset is higly imbalanced, we balance it by using by using different weights for the positive and negative classes.

In [None]:
from sklearn.utils import resample
import pandas as pd


train_set = pd.concat([X_train, y_train], axis=1)
train_set.columns = list(X_train.columns) + ['label']  

majority_class = train_set[train_set['label'] == 0]
minority_class = train_set[train_set['label'] == 1]

# Total number of samples
total_samples = len(train_set)

# we want to have the same number of samples for both classes
samples_per_class = total_samples // 2

# Undersaampling of the majority class
majority_downsampled = resample(
    majority_class,
    replace=False,               
    n_samples=samples_per_class, 
    random_state=42
)

# Oversampling of the minority class
minority_upsampled = resample(
    minority_class,
    replace=True,                
    n_samples=samples_per_class, 
    random_state=42
)


balanced_train_set = pd.concat([majority_downsampled, minority_upsampled])

# Divide the dataset between features and labels
X_train = balanced_train_set.drop(columns=['label'])
y_train = balanced_train_set['label']

print("Balanced distribution:")
print(y_train.value_counts())


Balanced distribution:
label
0    277229
1    277229
Name: count, dtype: int64


Now we search for the best hyperparemeters for the RIPPER classifier using a grid search.


We procede in the following steps:
1. We define the hyperparameters of the model so that we can tune them later by using a grid search.
2. We split the training data into a training and a validation set. The data is divided into 80% training and 20% validation.
3. The code iterates through a Parameter Grid to find the best hyperparameters for the model. The result of each combination of parameters is stored inside the `params_tested` list, so that they can be analyzed later.

In [14]:
from sklearn.metrics import classification_report
def report_scores(test_label, test_pred):
    print(classification_report(test_label,
                            test_pred,
                            target_names=['0', '1']))

In [31]:
import wittgenstein as lw
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.metrics import f1_score

NUM_FOLDS = 5
RANDOM_SEED = 42

model = lw.RIPPER()

hyper_params = {
    'k': [1, 3],
    'max_rules': [10, 30],
    'prune_size': [0.5],
    'max_rule_conds': [3, 4]
}

grid_params = ParameterGrid(hyper_params)

X_train_set, X_val_set, Y_train_set, Y_val_set = train_test_split(
    X_train,y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=RANDOM_SEED,
    shuffle=True
)

params_tested = list()

for comb in grid_params:
    ripper = lw.RIPPER(**comb)

    ripper.fit(X_train_set, Y_train_set)

    # Predictions
    Y_pred_train_set = ripper.predict(X_train_set)
    Y_pred_val_set = ripper.predict(X_val_set)

    # F1 scores
    train_f_score = f1_score(Y_train_set, Y_pred_train_set, average='macro')
    val_f_score = f1_score(Y_val_set, Y_pred_val_set, average='macro')

    # Storing results
    new_comb = comb
    new_comb |= {
        'train_f_score': train_f_score,
        'val_f_score': val_f_score
    }

    print(comb)
    report_scores(Y_val_set, Y_pred_val_set)
    params_tested.append(new_comb)



{'k': 1, 'max_rule_conds': 3, 'max_rules': 10, 'prune_size': 0.5, 'train_f_score': 0.5023848286742174, 'val_f_score': 0.5030903425313032}
              precision    recall  f1-score   support

           0       0.54      0.95      0.69     55446
           1       0.79      0.20      0.32     55446

    accuracy                           0.57    110892
   macro avg       0.67      0.57      0.50    110892
weighted avg       0.67      0.57      0.50    110892

{'k': 1, 'max_rule_conds': 3, 'max_rules': 30, 'prune_size': 0.5, 'train_f_score': 0.5444091195659709, 'val_f_score': 0.544531526683047}
              precision    recall  f1-score   support

           0       0.56      0.93      0.70     55446
           1       0.79      0.26      0.39     55446

    accuracy                           0.60    110892
   macro avg       0.68      0.60      0.54    110892
weighted avg       0.68      0.60      0.54    110892

{'k': 1, 'max_rule_conds': 4, 'max_rules': 10, 'prune_size': 0.5, 'trai

Since finding the best hyperparameters is computationally expensive, we store the results of each iteration in a file. We then load the results from the file and display the results.

In [32]:
import json

params_df=pd.DataFrame(params_tested)

params_df.sort_values(by='val_f_score',ascending=False)

params_df.to_csv('params_ripper/test_f1_averaged.csv')

In [34]:
pd.read_csv('params_ripper/test_f1_averaged.csv').sort_values(by='val_f_score',ascending=False)

Unnamed: 0.1,Unnamed: 0,k,max_rule_conds,max_rules,prune_size,train_f_score,val_f_score
5,5,3,3,30,0.5,0.550832,0.551298
1,1,1,3,30,0.5,0.544409,0.544532
7,7,3,4,30,0.5,0.540239,0.540052
4,4,3,3,10,0.5,0.528609,0.528792
3,3,1,4,30,0.5,0.519537,0.519353
0,0,1,3,10,0.5,0.502385,0.50309
2,2,1,4,10,0.5,0.489407,0.490445
6,6,3,4,10,0.5,0.489407,0.490445


In [36]:
best_model = lw.RIPPER(
    k = 3,
    max_rule_conds = 3,
    max_rules = 30,
    prune_size = 0.5
)

best_model.fit(X_train, y_train)

In [37]:
test_pred_ripper = best_model.predict(X_test)

In [38]:
report_scores(y_test, test_pred_ripper)

              precision    recall  f1-score   support

           0       0.82      0.48      0.61     30219
           1       0.11      0.36      0.16      5187

    accuracy                           0.47     35406
   macro avg       0.46      0.42      0.39     35406
weighted avg       0.71      0.47      0.54     35406



Ripper is built using rules as said before, so we can extract the rules from the model and display them.

In [39]:
for rule in ripper.ruleset_:
    print(rule)

[avg_pos_num=<19850.0^debut_year_num=<11.0]
[avg_pos_num=<19850.0]
[avg_pos_num=19850.0-47390.0^startlist_quality_num=<211.0]
[avg_pos_num=19850.0-47390.0^startlist_quality_num=211.0-291.0]
[avg_pos_num=19850.0-47390.0^climbing_efficiency_num=2669.0-3090.0]
[avg_pos_num=47390.0-78187.5^startlist_quality_num=<211.0]
[avg_pos_num=19850.0-47390.0^difficulty_score_num=2219.0-2484.0]
[avg_pos_num=19850.0-47390.0^difficulty_score_num=>3190.0^startlist_quality_num=339.0-380.0^competitive_age_num=8.0-9.0]
[avg_pos_num=19850.0-47390.0^climbing_efficiency_num=>3090.0]
[avg_pos_num=19850.0-47390.0^difficulty_score_num=>3190.0]
