# Single BERT Sample

> https://medium.com/towards-artificial-intelligence/text-classification-with-simple-transformers-a29d13358135



In [1]:
!pip install --upgrade transformers
!pip install simpletransformers

Requirement already up-to-date: transformers in /usr/local/lib/python3.6/dist-packages (4.1.1)
Collecting watchdog<0.10.5,>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/6f/10/500580a0987363a0d9e1f3dd5cb1bba94a47e19266c6ce9dfb6cdd455758/watchdog-0.10.4.tar.gz (98kB)
[K     |████████████████████████████████| 102kB 8.2MB/s 
Collecting pathtools>=0.1.1
  Downloading https://files.pythonhosted.org/packages/e7/7f/470d6fcdf23f9f3518f6b0b76be9df16dcc8630ad409947f8be2eb0ed13a/pathtools-0.1.2.tar.gz
Building wheels for collected packages: watchdog, pathtools
  Building wheel for watchdog (setup.py) ... [?25l[?25hdone
  Created wheel for watchdog: filename=watchdog-0.10.4-cp36-none-any.whl size=74842 sha256=719a2b69caef53050ac0492335acad8945b6ea368cc2e8a6dd66e81d1002108f
  Stored in directory: /root/.cache/pip/wheels/9e/11/04/5160b8815b0cc7cf574bdc6d053e510169ec264c8791b4ec3a
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
  Created wheel for pathtools: file

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report, confusion_matrix
import itertools
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
def show_confusion_matrix(confusion_matrix):
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('Classificação Real')
    plt.xlabel('Classificação Predita');

In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

file_path = "/content/drive/My Drive/data_full.csv"

data = pd.read_csv(file_path, sep=';').replace({np.NaN: None})
data.columns = ['sentenca','entidade1','entidade1_tipo','relacao','entidade2','entidade2_tipo']

cols = ['entidade1', 'entidade2']
data['entidades'] = data[cols].apply(lambda row: ' , '.join(row.values.astype(str)), axis=1)

data['class'] = data['relacao'].apply(lambda x: 0 if x is None else 1)

data.head()

Mounted at /content/drive


Unnamed: 0,sentenca,entidade1,entidade1_tipo,relacao,entidade2,entidade2_tipo,entidades,class
0,World Alliance of Reformed Churches condena a ...,Iraque,LOC,condena a guerra no,World Alliance of,ORG,"Iraque , World Alliance of",1
1,Em 19 de Dezembro de 1974 foi assinado um acor...,PAIGC,ORG,acordo entre,Portugal,LOC,"PAIGC , Portugal",1
2,Em 19 de Dezembro de 1974 foi assinado um acor...,PAIGC,ORG,,Cabo Verde,LOC,"PAIGC , Cabo Verde",0
3,Em 19 de Dezembro de 1974 foi assinado um acor...,Portugal,LOC,,Cabo Verde,LOC,"Portugal , Cabo Verde",0
4,"A 88i , plataforma de serviços digitais para s...",88i,ORG,foi aprovada no,Global Startup Program,ORG,"88i , Global Startup Program",1


In [5]:
data['sentence'] = data[['sentenca', 'entidades']].apply(lambda row: ' , '.join(row.values.astype(str)), axis=1)
#data = data[['sentence','class']]
data.head()

Unnamed: 0,sentenca,entidade1,entidade1_tipo,relacao,entidade2,entidade2_tipo,entidades,class,sentence
0,World Alliance of Reformed Churches condena a ...,Iraque,LOC,condena a guerra no,World Alliance of,ORG,"Iraque , World Alliance of",1,World Alliance of Reformed Churches condena a ...
1,Em 19 de Dezembro de 1974 foi assinado um acor...,PAIGC,ORG,acordo entre,Portugal,LOC,"PAIGC , Portugal",1,Em 19 de Dezembro de 1974 foi assinado um acor...
2,Em 19 de Dezembro de 1974 foi assinado um acor...,PAIGC,ORG,,Cabo Verde,LOC,"PAIGC , Cabo Verde",0,Em 19 de Dezembro de 1974 foi assinado um acor...
3,Em 19 de Dezembro de 1974 foi assinado um acor...,Portugal,LOC,,Cabo Verde,LOC,"Portugal , Cabo Verde",0,Em 19 de Dezembro de 1974 foi assinado um acor...
4,"A 88i , plataforma de serviços digitais para s...",88i,ORG,foi aprovada no,Global Startup Program,ORG,"88i , Global Startup Program",1,"A 88i , plataforma de serviços digitais para s..."


In [6]:
train_df, test_df = train_test_split(data, test_size=0.2, stratify=data['class'], random_state=42)
test_df, valid_df = train_test_split(test_df, test_size=0.5, stratify=test_df['class'], random_state=42)

print("train_df:", train_df.shape)
print("test_df:", test_df.shape)
print("valid_df:", valid_df.shape)

data.info()
train_df.info()
test_df.info()
valid_df.info()

train_df: (2630, 9)
test_df: (329, 9)
valid_df: (329, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3288 entries, 0 to 3287
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   sentenca        3288 non-null   object
 1   entidade1       3288 non-null   object
 2   entidade1_tipo  3286 non-null   object
 3   relacao         1485 non-null   object
 4   entidade2       3288 non-null   object
 5   entidade2_tipo  3286 non-null   object
 6   entidades       3288 non-null   object
 7   class           3288 non-null   int64 
 8   sentence        3288 non-null   object
dtypes: int64(1), object(8)
memory usage: 231.3+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2630 entries, 1834 to 1586
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   sentenca        2630 non-null   object
 1   entidade1       2630 non-null   object
 2   entidade1_t

In [7]:
model_args = ClassificationArgs()
model_args.num_train_epochs = 2
model_args.manual_seed = 42
model_args.overwrite_output_dir = True

model = ClassificationModel(
    model_type='bert', 
    model_name='neuralmind/bert-large-portuguese-cased', 
    use_cuda=True, 
    num_labels=2, 
    args=model_args
)


Downloading:   0%|          | 0.00/648 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-large-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from th

Downloading:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/155 [00:00<?, ?B/s]

In [8]:
model.train_model(train_df)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/2630 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
result, model_outputs, wrong_preds = model.eval_model(valid_df)

In [None]:
def sigmoid(x):
    z = (1/(1 + np.exp(-x)))
    return z    

In [None]:
sigmoid_preds = []
predictions = []

for x in model_outputs:
    sigmoid_pred = sigmoid(x)
    sigmoid_preds.append(np.argmax(sigmoid_pred))
    predictions.append(np.argmax(x))


sigmoid_preds = np.array(sigmoid_preds)

print('Arg-Max F1-score Validation:', round(f1_score(valid_df['class'], predictions), 4))
print('Sigmoide F1-score Validation:', round(f1_score(valid_df['class'], sigmoid_preds), 4))

In [None]:
test_df.reset_index(drop=True, inplace=True)
test_predictions, raw_outputs = model.predict(test_df['sentence'])

In [None]:
test_predictions

In [None]:
class_names = ['negativa', 'positiva']
print(classification_report(test_df['class'], test_predictions, target_names=class_names))

In [None]:
print('F1-score Test:', round(f1_score(test_df['class'], test_predictions), 4))
#print('f1 score:', f1_score(test_df['class'], test_predictions))

In [None]:
cnf_matrix = confusion_matrix(test_df['class'], test_predictions)
df_cm = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)

show_confusion_matrix(df_cm)

In [None]:
model.args

In [None]:
df_final = pd.DataFrame(columns=['sentence','class','predicted_class'])
df_final['sentence'] = test_df['sentence']
df_final['class'] = test_df['class']
df_final['predicted_class'] = test_predictions

In [None]:
df_final.head()

In [None]:
from google.colab import files
df_final.to_csv('Predicoes.csv', sep=';', encoding='utf-8-sig') 
files.download('Predicoes.csv')