In [31]:
import pandas as pd
import numpy as np 

from tabpfn import TabPFNClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

In [3]:
dataset = pd.read_csv('conversion_predictors_of_clinically_isolated_syndrome_to_multiple_sclerosis.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               273 non-null    int64  
 1   Gender                   273 non-null    int64  
 2   Age                      273 non-null    int64  
 3   Schooling                272 non-null    float64
 4   Breastfeeding            273 non-null    int64  
 5   Varicella                273 non-null    int64  
 6   Initial_Symptom          272 non-null    float64
 7   Mono_or_Polysymptomatic  273 non-null    int64  
 8   Oligoclonal_Bands        273 non-null    int64  
 9   LLSSEP                   273 non-null    int64  
 10  ULSSEP                   273 non-null    int64  
 11  VEP                      273 non-null    int64  
 12  BAEP                     273 non-null    int64  
 13  Periventricular_MRI      273 non-null    int64  
 14  Cortical_MRI             2

In [4]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Gender,Age,Schooling,Breastfeeding,Varicella,Initial_Symptom,Mono_or_Polysymptomatic,Oligoclonal_Bands,LLSSEP,ULSSEP,VEP,BAEP,Periventricular_MRI,Cortical_MRI,Infratentorial_MRI,Spinal_Cord_MRI,Initial_EDSS,Final_EDSS,group
0,0,1,34,20.0,1,1,2.0,1,0,1,1,0,0,0,1,0,1,1.0,1.0,1
1,1,1,61,25.0,3,2,10.0,2,1,1,0,1,0,0,0,0,1,2.0,2.0,1
2,2,1,22,20.0,3,1,3.0,1,1,0,0,0,0,0,1,0,0,1.0,1.0,1
3,3,2,41,15.0,1,1,7.0,2,1,0,1,1,0,1,1,0,0,1.0,1.0,1
4,4,2,34,20.0,2,1,6.0,2,0,1,0,0,0,1,0,0,0,1.0,1.0,1


In [6]:
dataset.shape

(273, 20)

In [7]:
dataset['group'].unique()

array([1, 2])

To know if we can use the TabPFNClassifier, we should look on:
- Delete the id's row we have as 'Unnamed: 0' column
- Check the number of samples we have, we have 273
- Check the nunber of target, we have 2 {1, 2}

Once we checked all we have finished our preprocessing task so we can use the TabPFNClassifier. This algorithm can make a predictor from raw tabular data hence we checked the requirements:
- Less than 1000 train samples ( our dataset is already less than that )
- Less than 100 different features ( our has 20 without 'Unnamed: 0' column )
- Less than 10 different classes ( our has only 2 )

In [8]:
dataset.drop('Unnamed: 0', axis=1, inplace=True)

In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273 entries, 0 to 272
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender                   273 non-null    int64  
 1   Age                      273 non-null    int64  
 2   Schooling                272 non-null    float64
 3   Breastfeeding            273 non-null    int64  
 4   Varicella                273 non-null    int64  
 5   Initial_Symptom          272 non-null    float64
 6   Mono_or_Polysymptomatic  273 non-null    int64  
 7   Oligoclonal_Bands        273 non-null    int64  
 8   LLSSEP                   273 non-null    int64  
 9   ULSSEP                   273 non-null    int64  
 10  VEP                      273 non-null    int64  
 11  BAEP                     273 non-null    int64  
 12  Periventricular_MRI      273 non-null    int64  
 13  Cortical_MRI             273 non-null    int64  
 14  Infratentorial_MRI       2

In [13]:
# Primero separamos los datos en X e y
X = dataset.drop('group', axis=1)
y = dataset['group']
X.shape, y.shape

((273, 18), (273,))

In [19]:
# Ahora separamos en conjunto de entrenamiento y testeo, como tenemos 
# pocas muestras voy a separalo en 0.85 - 0.15
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.85, shuffle=True)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((232, 18), (41, 18), (232,), (41,))

In [22]:
modelo = TabPFNClassifier(device='cpu')

history = modelo.fit(X_train, y_train)

y_pred, w_prob = modelo.predict(X_test, return_winning_probability=True)

roc_score = roc_auc_score(y_pred, y_test)

acc_score = accuracy_score( y_pred, y_test)

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [26]:
print(f"Tenemos :\n\t · Una AUC: {roc_score:.4f}\n\t · Una Accuracy: {acc_score:.4f}")

Tenemos :
	 · Una AUC: 0.9722
	 · Una Accuracy: 0.9756


I put shuffle flag on true and no random_seed cause i want to repeat the trainig and predicction task cause i want to prove that the algorithm works correctly with tabular data

Now i'm going to make a for loop to test it over the loop and make a mean of the results to see if it was just luck or it's a very good classifier algorithm

In [30]:
modelo = TabPFNClassifier(device='cpu')
roc_score_list = []
acc_score_list = []

for _ in range(1,20):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.85, shuffle=True)
    #
    history = modelo.fit(X_train, y_train)

    y_pred, w_prob = modelo.predict(X_test, return_winning_probability=True)

    roc_score_list.append( roc_auc_score(y_pred, y_test) )

    acc_score_list.append( accuracy_score( y_pred, y_test) )
    #
print(f"We got :\n\t · An avg AUC: {roc_score_list.mean():.4f}\n\t · An avg Accuracy: {acc_score_list.mean():.4f}")

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the 

AttributeError: 'list' object has no attribute 'mean'