In [1]:
!git clone https://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@github.com/DadeOrsu/dm_project24_group_6

Cloning into 'dm_project24_group_6'...
remote: Enumerating objects: 1101, done.[K
remote: Counting objects: 100% (130/130), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 1101 (delta 77), reused 76 (delta 35), pack-reused 971 (from 1)[K
Receiving objects: 100% (1101/1101), 47.38 MiB | 10.04 MiB/s, done.
Resolving deltas: 100% (726/726), done.
Updating files: 100% (39/39), done.


In [2]:
cd dm_project24_group_6/src/task4_machine_learning/

/content/dm_project24_group_6/src/task4_machine_learning


# Classification with RIPPER
In this notebook we aim to build a classifier using RIPPER. RIPPER is one of the most popular rule based classifier.

In [7]:
import pandas as pd
from os import path
import numpy as np
from preprocessing import get_train_test_data

X_train, y_train, X_test, y_test, columns_to_keep = get_train_test_data()

To begin, we create a target variable named `top_20` based on the column `position`.

Now we need to discretize the data to make it suitable for RIPPER. We will use the `discretize_data` function to discretize the data.

Finally, we will train the RIPPER classifier using the discretized data.

In [8]:
#function to discretize the variables
#input: the dataset and the list of variables' names to discretize
def discretize_data(dataset, variables):
    for variable in variables:
        #get the unique variable's values
        var = sorted(dataset[variable].unique())

        #generate a mapping from the variable's values to the number representation
        mapping = dict(zip(var, range(0, len(var) + 1)))

        #add a new colum with the number representation of the variable
        dataset[variable+'_num'] = dataset[variable].map(mapping).astype(int)
    return dataset

In [9]:
X_train = discretize_data(X_train, [col for col in columns_to_keep if col != "top_20"])

X_test = discretize_data(X_test, [col for col in columns_to_keep if col != "top_20"])

X_train.head()

Unnamed: 0,bmi,career_points,career_duration(days),debut_year,difficulty_score,competitive_age,is_tarmac,points,climbing_efficiency,startlist_quality,bmi_num,career_points_num,career_duration(days)_num,debut_year_num,difficulty_score_num,competitive_age_num,is_tarmac_num,points_num,climbing_efficiency_num,startlist_quality_num
0,23.765432,68034.221635,6233.0,1977.0,0.635375,22,True,100.0,0.006796,1241,494,2740,2356,7,35,3,1,5,29,183
1,20.897959,29429.221635,5212.0,1974.0,0.635375,27,True,100.0,0.006796,1241,244,2261,2209,4,35,8,1,5,29,183
2,22.790329,15880.0,2972.0,1977.0,0.635375,24,True,100.0,0.006796,1241,424,1633,1454,7,35,5,1,5,29,183
3,21.46915,6600.0,3606.0,1970.0,0.635375,30,True,100.0,0.006796,1241,299,839,1707,0,35,11,1,5,29,183
4,21.295295,17245.0,2192.0,1977.0,0.635375,27,True,100.0,0.006796,1241,283,1709,1059,7,35,8,1,5,29,183


Now we search for the best hyperparemeters for the RIPPER classifier using the `GridSearchCV` function.

Finally, we evaluate the model using the `classification_report` function.

As we can see from the classification report, the RIPPER classifier has an accuracy of 0.85. Unfortunately, while the precision and recall are high for the negative class, they are low for the positive class. This is likely due to the class imbalance in the dataset. We try to address this issue by using the `class_weight` parameter in the RIPPER classifier.

In [14]:
from sklearn.metrics import classification_report
def report_scores(test_label, test_pred):
    print(classification_report(test_label,
                            test_pred,
                            target_names=['0', '1']))

In [16]:
import wittgenstein as lw
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.metrics import f1_score

NUM_FOLDS = 5
RANDOM_SEED = 42

model = lw.RIPPER()

hyper_params = {
    'k': [1, 2, 3, 5],
    'max_rules': [10, 20, 30, 50],
    'prune_size': [0.33, 0.5, 0.7],
    'max_rule_conds': [2, 3, 4, 5]
}

grid_params = ParameterGrid(hyper_params)

X_train_set, X_val_set, Y_train_set, Y_val_set = train_test_split(
    X_train,y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=RANDOM_SEED,
    shuffle=True
)

params_tested = list()

for comb in grid_params:
    ripper = lw.RIPPER(**comb)

    ripper.fit(X_train_set, Y_train_set)

    # Predictions
    Y_pred_train_set = ripper.predict(X_train_set)
    Y_pred_val_set = ripper.predict(X_val_set)

    # F1 scores
    train_f_score = f1_score(Y_train_set, Y_pred_train_set, average='macro')
    val_f_score = f1_score(Y_val_set, Y_pred_val_set, average='macro')

    # Storing results
    new_comb = comb
    new_comb |= {
        'train_f_score': train_f_score,
        'val_f_score': val_f_score
    }

    print(comb)
    report_scores(Y_val_set, Y_pred_val_set)
    params_tested.append(new_comb)



{'k': 1, 'max_rule_conds': 2, 'max_rules': 10, 'prune_size': 0.33, 'train_f_score': 0.5975801619897771, 'val_f_score': 0.5969089101422387}
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      5927
           1       0.46      0.22      0.30      1266

    accuracy                           0.82      7193
   macro avg       0.66      0.58      0.60      7193
weighted avg       0.78      0.82      0.79      7193

{'k': 1, 'max_rule_conds': 2, 'max_rules': 10, 'prune_size': 0.5, 'train_f_score': 0.5961557246987313, 'val_f_score': 0.6061682287221813}
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      5927
           1       0.50      0.23      0.31      1266

    accuracy                           0.82      7193
   macro avg       0.68      0.59      0.61      7193
weighted avg       0.79      0.82      0.80      7193

{'k': 1, 'max_rule_conds': 2, 'max_rules': 10, 'prune_size': 0.7, 'tr

In [17]:
import json

params_df=pd.DataFrame(params_tested)

params_df.sort_values(by='val_f_score',ascending=False)

params_df.to_csv('params_ripper/test_f1_averaged.csv')

In [20]:
pd.read_csv('params_ripper/test_f1_averaged.csv')

Unnamed: 0.1,Unnamed: 0,k,max_rule_conds,max_rules,prune_size,train_f_score,val_f_score
0,0,1,2,10,0.33,0.597580,0.596909
1,1,1,2,10,0.50,0.596156,0.606168
2,2,1,2,10,0.70,0.523836,0.518278
3,3,1,2,20,0.33,0.618578,0.612736
4,4,1,2,20,0.50,0.538478,0.541554
...,...,...,...,...,...,...,...
187,187,5,5,30,0.50,0.544668,0.552517
188,188,5,5,30,0.70,0.544329,0.544005
189,189,5,5,50,0.33,0.544844,0.544636
190,190,5,5,50,0.50,0.545910,0.543640
