In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.6.1-py3-none-any.whl (2.2 MB)
Collecting huggingface-hub==0.0.8
  Downloading huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp38-cp38-win_amd64.whl (2.0 MB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.8 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.6.1


In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
# Blanket filter: every warning category is silenced for the rest of the session.
warnings.filterwarnings(action='ignore')

In [18]:
# Read both competition splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Row-wise stack of the two splits (train rows first, original indices kept).
df = pd.concat([train, test])

In [145]:
# Work on the complete training frame. For a quicker trial run, swap in a
# slice instead, e.g. train[:4000].
batch_1 = train

In [146]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [147]:
# Turn each training text into a list of token ids, special tokens included.
tokenized = batch_1["text"].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

In [148]:
# Length of the longest tokenized training text (0 when there are no rows).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every id list with 0 so all rows share that length, then stack
# the rows into a single 2-D array.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized.values
])

In [149]:
# Sanity check: (number of texts, padded sequence length).
padded.shape

(7613, 84)

In [150]:
# Mask that is 0 at padding positions (id 0) and 1 at every other position.
attention_mask = np.where(padded == 0, 0, 1)
attention_mask.shape

(7613, 84)

In [151]:
# Convert the padded ids and their mask from NumPy arrays to torch tensors.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Inference only: run the whole training set through the model in one forward
# pass with gradient tracking switched off.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [152]:
# From the first model output, keep only the vector at sequence position 0 of
# every text and hand it to scikit-learn as a NumPy array.
first_position_states = last_hidden_states[0][:, 0, :]
features = first_position_states.numpy()

In [153]:
# Classification targets, row-aligned with the texts that produced `features`.
labels = batch_1["target"]

In [154]:
# Hold out part of the labelled data for evaluation. random_state pins the
# shuffle so a Restart & Run All reproduces the same split, and therefore the
# same grid-search result and hold-out score.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [155]:
# Candidate values for LogisticRegression's C, tried in the order listed.
parameters = {'C': (0.5, 1, 2, 0.8, 0.3, 0.1, 0.05)}

# Cross-validated search over those candidates on the training split.
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=parameters)
grid_search.fit(train_features, train_labels)

# Report the winning setting and its mean cross-validated score.
for caption, value in (
    ('best parameters: ', grid_search.best_params_),
    ('best scrores: ', grid_search.best_score_),
):
    print(caption, value)

best parameters:  {'C': 0.1}
best scrores:  0.8143251610487006


In [156]:
# Fit the final classifier with the hyper-parameters the grid search selected,
# instead of a constant copied by hand from an earlier run's printout (which
# silently goes stale whenever the split or the data changes).
lr_clf = LogisticRegression(**grid_search.best_params_)
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=0.1)

In [157]:
# Mean accuracy of the fitted classifier on the held-out split.
lr_clf.score(X=test_features, y=test_labels)

0.8235294117647058

In [158]:
from sklearn.dummy import DummyClassifier

# Baseline for comparison: a classifier that ignores the input features.
clf = DummyClassifier()

# Cross-validate the baseline on the same training split as the real model.
scores = cross_val_score(clf, train_features, train_labels)
baseline_mean = scores.mean()
baseline_spread = scores.std() * 2
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (baseline_mean, baseline_spread))

Dummy classifier score: 0.575 (+/- 0.00)


In [159]:
# Turn each test text into a list of token ids, special tokens included.
tokenized_test = test["text"].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

In [160]:
# Pad the test id lists with 0 to one common length. The target length is the
# training max_len unless some test text is longer: padding to max_len alone
# would give such a row a negative pad count (i.e. no padding), leaving the
# rows ragged so they could no longer form a rectangular array for the model.
pad_len_test = max([max_len] + [len(token_ids) for token_ids in tokenized_test.values])
padded_test = np.array([
    token_ids + [0] * (pad_len_test - len(token_ids))
    for token_ids in tokenized_test.values
])

In [161]:
# Mask that is 0 at padding positions (id 0) and 1 at every other position.
attention_mask_test = np.where(padded_test == 0, 0, 1)
attention_mask_test.shape

(3263, 84)

In [162]:
# Convert the padded test ids and their mask from NumPy arrays to torch tensors.
input_ids_test = torch.tensor(padded_test)
attention_mask_test = torch.tensor(attention_mask_test)

# Inference only: run the whole test set through the model in one forward
# pass with gradient tracking switched off.
with torch.no_grad():
    last_hidden_states_test = model(
        input_ids=input_ids_test,
        attention_mask=attention_mask_test,
    )

In [163]:
# Keep the vector at sequence position 0 of every test text, as a NumPy array.
first_position_states_test = last_hidden_states_test[0][:, 0, :]
features_test = first_position_states_test.numpy()

# Predict a label for each test row with the fitted logistic regression.
y_predict = lr_clf.predict(features_test)

In [164]:
# Start from the sample submission file, overwrite its target column with the
# predictions, and write the result out without the index column.
submission = pd.read_csv('sample_submission.csv').assign(target=y_predict)
submission.to_csv('submission_10.csv', index=False)