In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import numpy as np
import random
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib as mpl

In [2]:
#Karpathy 1. fix random seed
tf.random.set_seed(42)
random.seed(42)
np.random.seed(42)

In [3]:
ATT_FILE_NAME = "FootballPlayerPreparedCleanAttributesAlternativo.csv"
ONE_HOT_ENCODED_CLASSES_FILE_NAME = "FootballPlayerOneHotEncodedClassesAlternativo.csv"

In [4]:
mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [4]:
att = pd.read_csv(ATT_FILE_NAME)
target = pd.read_csv(ONE_HOT_ENCODED_CLASSES_FILE_NAME)

In [6]:
from sklearn.model_selection import train_test_split

In [5]:
import numpy as np

In [6]:
cadenas = []
cadenas_target = []
for x,y in zip(att.to_numpy(),target.to_numpy()):
    temp = ['A football player with']
    for i in range(0,len(x)):
        temp.append(f' {att.columns[i]} of '+str(x[i])+',')
    cadenas.append((''.join(temp)[:-1]))
    cadenas_target.append(np.argmax(y,axis=0))
cadenas

['A football player with Age of 29.0, Potential of 75.0, Value of 65000000.0, Wage of 31000.0, Special of 1896.0, International Reputation of 2.0, Weak Foot of 3.0, Skill Moves of 3.0, Crossing of 62.0, Finishing of 70.0, HeadingAccuracy of 72.0, ShortPassing of 76.0, Volleys of 80.0, Dribbling of 75.0, Curve of 70.0, FKAccuracy of 49.0, LongPassing of 63.0, BallControl of 78.0, Acceleration of 68.0, SprintSpeed of 69.0, Agility of 66.0, Reactions of 76.0, Balance of 59.0, ShotPower of 77.0, Jumping of 60.0, Stamina of 69.0, Strength of 77.0, LongShots of 74.0, Aggression of 76.0, Interceptions of 43.0, Positioning of 74.0, Vision of 73.0, Penalties of 76.0, Composure of 73.0, Marking of 48.0, StandingTackle of 38.0, SlidingTackle of 32.0',
 'A football player with Age of 26.0, Potential of 69.0, Value of 11000000.0, Wage of 7000.0, Special of 1811.0, International Reputation of 1.0, Weak Foot of 3.0, Skill Moves of 3.0, Crossing of 70.0, Finishing of 62.0, HeadingAccuracy of 56.0, Sho

In [7]:
df = pd.DataFrame(data=np.reshape(cadenas,(-1,1)),columns=['text'])

In [8]:
df['label'] = np.reshape(cadenas_target,(-1,1))

In [9]:
from datasets import Dataset
ds = Dataset.from_pandas(df)
ds = ds.with_format('torch')
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)

In [11]:
ds = ds.map(preprocess_function, batched=True)

100%|██████████| 17/17 [00:02<00:00,  7.78ba/s]


In [12]:
ds = ds.train_test_split(0.3)

In [13]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [14]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 11285
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4837
    })
})

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [16]:
from datasets import load_metric
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric('accuracy')


In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=15,
    num_train_epochs=7,
    weight_decay=0.01,
    evaluation_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 11285
  Num Epochs = 7
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4942
  Number of trainable parameters = 66956548
  0%|          | 0/4942 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 10%|█         | 500/4942 [01:04<09:14,  8.01it/s]Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json


{'loss': 0.5778, 'learning_rate': 1.7976527721570217e-05, 'epoch': 0.71}


Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-500\special_tokens_map.json
 14%|█▍        | 705/4942 [01:31<08:45,  8.06it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 15

 14%|█▍        | 707/4942 [01:43<3:21:51,  2.86s/it]

{'eval_loss': 0.22736907005310059, 'eval_accuracy': 0.91027496382055, 'eval_runtime': 11.9075, 'eval_samples_per_second': 406.215, 'eval_steps_per_second': 27.126, 'epoch': 1.0}


 20%|██        | 1000/4942 [02:19<08:12,  8.01it/s] Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json


{'loss': 0.2483, 'learning_rate': 1.595305544314043e-05, 'epoch': 1.42}


Model weights saved in ./results\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1000\special_tokens_map.json
 29%|██▊       | 1411/4942 [03:12<07:18,  8.05it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 15
                                                   
 29%|██▊       | 1413/4942 [03:24<2:49:53,  2.89s/it]

{'eval_loss': 0.16398029029369354, 'eval_accuracy': 0.9404589621666322, 'eval_runtime': 12.0297, 'eval_samples_per_second': 402.088, 'eval_steps_per_second': 26.85, 'epoch': 2.0}


 30%|███       | 1500/4942 [03:35<07:11,  7.98it/s]  Saving model checkpoint to ./results\checkpoint-1500
Configuration saved in ./results\checkpoint-1500\config.json


{'loss': 0.1874, 'learning_rate': 1.3929583164710645e-05, 'epoch': 2.12}


Model weights saved in ./results\checkpoint-1500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1500\special_tokens_map.json
 40%|████      | 2000/4942 [04:38<06:05,  8.06it/s]Saving model checkpoint to ./results\checkpoint-2000
Configuration saved in ./results\checkpoint-2000\config.json


{'loss': 0.1542, 'learning_rate': 1.190611088628086e-05, 'epoch': 2.83}


Model weights saved in ./results\checkpoint-2000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-2000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-2000\special_tokens_map.json
 43%|████▎     | 2117/4942 [04:54<05:50,  8.05it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 15
                                                   
 43%|████▎     | 2119/4942 [05:06<2:15:05,  2.87s/it]

{'eval_loss': 0.19202496111392975, 'eval_accuracy': 0.9319826338639653, 'eval_runtime': 11.9535, 'eval_samples_per_second': 404.651, 'eval_steps_per_second': 27.021, 'epoch': 3.0}


 51%|█████     | 2500/4942 [05:54<05:04,  8.02it/s]  Saving model checkpoint to ./results\checkpoint-2500
Configuration saved in ./results\checkpoint-2500\config.json


{'loss': 0.133, 'learning_rate': 9.882638607851073e-06, 'epoch': 3.54}


Model weights saved in ./results\checkpoint-2500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-2500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-2500\special_tokens_map.json
 57%|█████▋    | 2823/4942 [06:35<04:23,  8.05it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 15
                                                   
 57%|█████▋    | 2825/4942 [06:47<1:41:00,  2.86s/it]

{'eval_loss': 0.15752634406089783, 'eval_accuracy': 0.9491420301839983, 'eval_runtime': 11.9185, 'eval_samples_per_second': 405.84, 'eval_steps_per_second': 27.101, 'epoch': 4.0}


 61%|██████    | 3000/4942 [07:09<04:00,  8.08it/s]  Saving model checkpoint to ./results\checkpoint-3000
Configuration saved in ./results\checkpoint-3000\config.json


{'loss': 0.1167, 'learning_rate': 7.859166329421287e-06, 'epoch': 4.25}


Model weights saved in ./results\checkpoint-3000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-3000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-3000\special_tokens_map.json
 71%|███████   | 3500/4942 [08:13<02:59,  8.01it/s]Saving model checkpoint to ./results\checkpoint-3500
Configuration saved in ./results\checkpoint-3500\config.json


{'loss': 0.0962, 'learning_rate': 5.835694050991501e-06, 'epoch': 4.96}


Model weights saved in ./results\checkpoint-3500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-3500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-3500\special_tokens_map.json
 71%|███████▏  | 3529/4942 [08:17<02:59,  7.89it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 15
                                                   
 71%|███████▏  | 3531/4942 [08:29<1:07:13,  2.86s/it]

{'eval_loss': 0.14715011417865753, 'eval_accuracy': 0.9596857556336572, 'eval_runtime': 11.894, 'eval_samples_per_second': 406.676, 'eval_steps_per_second': 27.157, 'epoch': 5.0}


 81%|████████  | 4000/4942 [09:28<01:56,  8.06it/s]  Saving model checkpoint to ./results\checkpoint-4000
Configuration saved in ./results\checkpoint-4000\config.json


{'loss': 0.0764, 'learning_rate': 3.812221772561716e-06, 'epoch': 5.67}


Model weights saved in ./results\checkpoint-4000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-4000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-4000\special_tokens_map.json
 86%|████████▌ | 4235/4942 [09:58<01:27,  8.07it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 15
                                                   
 86%|████████▌ | 4237/4942 [10:10<33:34,  2.86s/it]

{'eval_loss': 0.1386837512254715, 'eval_accuracy': 0.9642340293570395, 'eval_runtime': 11.894, 'eval_samples_per_second': 406.676, 'eval_steps_per_second': 27.157, 'epoch': 6.0}


 91%|█████████ | 4500/4942 [10:43<00:54,  8.07it/s]Saving model checkpoint to ./results\checkpoint-4500
Configuration saved in ./results\checkpoint-4500\config.json


{'loss': 0.0655, 'learning_rate': 1.7887494941319306e-06, 'epoch': 6.37}


Model weights saved in ./results\checkpoint-4500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-4500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-4500\special_tokens_map.json
100%|█████████▉| 4941/4942 [11:39<00:00,  8.07it/s]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 15
                                                   
100%|██████████| 4942/4942 [11:51<00:00,  8.07it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 4942/4942 [11:51<00:00,  6.95it/s]

{'eval_loss': 0.14504550397396088, 'eval_accuracy': 0.9650609882158363, 'eval_runtime': 11.9008, 'eval_samples_per_second': 406.443, 'eval_steps_per_second': 27.141, 'epoch': 7.0}
{'train_runtime': 711.2483, 'train_samples_per_second': 111.065, 'train_steps_per_second': 6.948, 'train_loss': 0.17259166251305674, 'epoch': 7.0}





TrainOutput(global_step=4942, training_loss=0.17259166251305674, metrics={'train_runtime': 711.2483, 'train_samples_per_second': 111.065, 'train_steps_per_second': 6.948, 'train_loss': 0.17259166251305674, 'epoch': 7.0})

In [None]:
predictions = trainer.predict(ds['test'])

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 4837
  Batch size = 16
100%|██████████| 303/303 [00:11<00:00, 25.50it/s]


tensor([2, 1, 2,  ..., 2, 1, 0])

In [None]:
print(classification_report(np.asarray(predictions.predictions).argmax(axis=1),ds['test'][:]['label']))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1245
           1       0.93      0.95      0.94      1132
           2       0.97      0.96      0.97      1368
           3       0.99      0.98      0.99      1092

    accuracy                           0.97      4837
   macro avg       0.97      0.97      0.97      4837
weighted avg       0.97      0.97      0.97      4837

