In [116]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
from transformers import AutoModel, AutoTokenizer
import warnings
warnings.filterwarnings('ignore')

In [117]:
import seaborn as sns
import matplotlib.pyplot as plt

import plotly
import plotly.express as px
%matplotlib inline

In [118]:
# Read the training CSV and shuffle its rows reproducibly
# (frac=1 keeps every row; random_state pins the permutation).
train_raw_df = (
    pd.read_csv("../../data/input/hugging_face/train_raw.csv")
    .sample(frac=1, random_state=4)
)

# Read the test CSV; its rows stay in file order.
test_df = pd.read_csv("../../data/input/hugging_face/test.csv")


In [119]:
# Sanity check: (rows, columns) of the shuffled training frame.
print(train_raw_df.shape)

(105, 2)


In [120]:
# Stack the training rows on top of the test rows so both are encoded in one
# batch; ignore_index renumbers the rows 0..N-1 after the shuffle.
batch_raw = pd.concat([train_raw_df, test_df], axis=0, ignore_index=True)

# Persist the combined frame (the row index is written as the first column).
batch_raw.to_csv("../../data/input/hugging_face/data_val/raw_train_test.csv")


In [121]:
# Sanity check: (rows, columns) of the combined train + test frame.
print(batch_raw.shape)

(172, 2)


In [122]:
# Candidate checkpoints. AutoTokenizer / AutoModel are given the selected name
# and load the corresponding tokenizer and encoder.
model_names = ['distilbert-base-uncased', "roberta-base", 'bert-base-uncased','google/electra-small-discriminator']
# Index of the checkpoint to use (3 -> 'google/electra-small-discriminator').
model_id = 3
# Short name without any "org/" prefix; it is embedded in output file names.
model_na = model_names[model_id].rsplit("/", 1)[-1]
# Load pretrained tokenizer and model for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_names[model_id])
model = AutoModel.from_pretrained(model_names[model_id])
print(model_na)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


electra-small-discriminator


In [123]:
# Encode every question into a list of token ids, including the tokenizer's
# special tokens.
tokenized_raw = batch_raw["Question"].apply(
    lambda question: tokenizer.encode(question, add_special_tokens=True)
)

In [124]:
# Length of the longest encoded question (0 when there are no rows).
max_len = max((len(ids) for ids in tokenized_raw.values), default=0)

# Right-pad every id list with 0 so the batch forms a rectangular array.
# NOTE(review): 0 is assumed to be the pad id; confirm it matches
# tokenizer.pad_token_id for the selected checkpoint.
padded_raw = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized_raw.values])
max_len


27

In [125]:
# (number of questions, padded sequence length); padded_raw is already an
# ndarray, so its shape can be read directly.
padded_raw.shape

(172, 27)

In [126]:
# Build the attention mask from each question's true token count instead of
# testing `padded_raw != 0`. Token id 0 is not the pad token for every
# checkpoint in model_names (for roberta-base it is the `<s>` start token), so
# comparing against 0 could mask out a real token. Positions before a row's
# length are real tokens (1); everything after is padding (0).
token_counts = tokenized_raw.map(len).to_numpy()
attention_mask_raw = np.where(
    np.arange(padded_raw.shape[1])[None, :] < token_counts[:, None], 1, 0
)
attention_mask_raw.shape

(172, 27)

In [127]:
# Convert the padded ids and their mask to tensors for the encoder.
input_ids_raw = torch.tensor(padded_raw)
attention_mask_raw = torch.tensor(attention_mask_raw)

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    last_hidden_states_raw = model(
        input_ids=input_ids_raw,
        attention_mask=attention_mask_raw,
    )

In [128]:
# Element 0 of the model output holds the per-token hidden states; keep the
# vector at sequence position 0 of every question as its feature vector.
features_raw = last_hidden_states_raw[0][:, 0].numpy()

In [129]:
# Target column for every row of the combined frame (train rows first).
labels_raw = batch_raw["op_id"]

In [130]:
# Sanity check: one label per row of batch_raw.
labels_raw.shape

(172,)

In [131]:
# batch_raw was built as [train rows, test rows], so the first n_train rows of
# the features/labels belong to the training split and the rest to the test split.
train_raw_shape = train_raw_df.shape
n_train = train_raw_shape[0]
train_raw_features, test_raw_features = features_raw[:n_train], features_raw[n_train:]
train_raw_labels, test_raw_labels = labels_raw[:n_train], labels_raw[n_train:]

# Export the four arrays as comma-separated text, tagged with the checkpoint name.
np.savetxt(f"../../../../text-classification-small-datasets/datasets/train_feature_{model_na}.csv", train_raw_features, delimiter=",")
np.savetxt(f"../../../../text-classification-small-datasets/datasets/train_label_{model_na}.csv", train_raw_labels, delimiter=",")
np.savetxt(f"../../../../text-classification-small-datasets/datasets/test_feature_{model_na}.csv", test_raw_features, delimiter=",")
np.savetxt(f"../../../../text-classification-small-datasets/datasets/test_label_{model_na}.csv", test_raw_labels, delimiter=",")

In [132]:
# Shapes of the raw train/test features and labels, one per line.
for split_array in (train_raw_features, test_raw_features, train_raw_labels, test_raw_labels):
    print(split_array.shape)

(105, 256)
(67, 256)
(105,)
(67,)


In [133]:
# Split the `*_eda` features/labels into train and test parts by row position,
# using the row count of train_eda_df as the boundary.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'train_eda_df' is not defined". `train_eda_df`,
# `features_eda` and `labels_eda` are not assigned in any visible cell, so the
# cells that load and encode the EDA data must be restored above this one
# before it (and every later `*_eda` cell) can run.
train_eda_shape = train_eda_df.shape
train_eda_features = features_eda[:train_eda_shape[0]]
test_eda_features = features_eda[train_eda_shape[0]:]
train_eda_labels = labels_eda[:train_eda_shape[0]]
test_eda_labels =labels_eda[train_eda_shape[0]:] 

NameError: name 'train_eda_df' is not defined

In [None]:
# Shapes of the EDA train/test features and labels, one per line.
for split_array in (train_eda_features, test_eda_features, train_eda_labels, test_eda_labels):
    print(split_array.shape)

In [None]:
# (training rows, embedding width) of the raw feature matrix.
train_raw_features.shape

In [None]:
# Shape of the EDA training feature matrix, for comparison with the raw one.
train_eda_features.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Classifiers compared with k-fold cross-validation on the raw embeddings.
models = [
    LinearSVC(),
    # MultinomialNB refuses negative feature values, which transformer
    # embeddings generally contain, so every fit would fail on the raw vectors.
    # Rescale each feature to [0, 1] first; the scaler sits inside the pipeline
    # and is therefore re-fitted on the training part of every fold.
    make_pipeline(MinMaxScaler(), MultinomialNB()),
    DecisionTreeClassifier(),
    MLPClassifier( solver="adam", random_state=2343, hidden_layer_sizes=(),activation="identity"),
    SVC(kernel='poly',
        random_state=494,
        degree=7
    )
]

CV = 3  # number of cross-validation folds

entries = []
# The loop variable is `clf` rather than `model` so that the transformer
# encoder bound to `model` earlier in the notebook is not overwritten here.
for clf in models:
    # Report a pipeline under the class name of its final estimator.
    estimator = clf.steps[-1][1] if isinstance(clf, Pipeline) else clf
    model_name = estimator.__class__.__name__
    accuracies = cross_val_score(clf, train_raw_features, train_raw_labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies)
    )
# One row per (classifier, fold) with that fold's accuracy.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the per-fold accuracies with the individual folds overlaid.
plt.figure(figsize=[12,7])
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=10, jitter=True, edgecolor="gray", linewidth=2)
plt.savefig("../../figures/huggingface/test.png",bbox_inches='tight')
plt.show()

In [None]:
# Mean cross-validated accuracy per classifier.
cv_df.groupby('model_name')['accuracy'].mean()

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Classifiers compared with k-fold cross-validation on the EDA embeddings.
models = [
    LinearSVC(),
    # MultinomialNB refuses negative feature values, which transformer
    # embeddings generally contain, so every fit would fail on the raw vectors.
    # Rescale each feature to [0, 1] first; the scaler sits inside the pipeline
    # and is therefore re-fitted on the training part of every fold.
    make_pipeline(MinMaxScaler(), MultinomialNB()),
    DecisionTreeClassifier(),
    MLPClassifier( solver="sgd", random_state=2343),
    SVC(kernel='poly',
        random_state=494,
        degree=7
    )
]

CV = 3  # number of cross-validation folds

entries = []
# The loop variable is `clf` rather than `model` so that the transformer
# encoder bound to `model` earlier in the notebook is not overwritten here.
for clf in models:
    # Report a pipeline under the class name of its final estimator.
    estimator = clf.steps[-1][1] if isinstance(clf, Pipeline) else clf
    model_name = estimator.__class__.__name__
    accuracies = cross_val_score(clf, train_eda_features, train_eda_labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies)
    )
# One row per (classifier, fold) with that fold's accuracy.
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the per-fold accuracies with the individual folds overlaid.
plt.figure(figsize=[12,7])
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=10, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
# Mean cross-validated accuracy per classifier.
cv_df.groupby('model_name')['accuracy'].mean()

In [None]:
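# Disabled experiment: grid search over the regularisation strength C of
# LogisticRegression.
# NOTE(review): `train_features` / `train_labels` are not defined in the visible
# cells; use e.g. `train_raw_features` / `train_raw_labels` if this is re-enabled.
# parameters = {'C': np.linspace(0.0001, 100, 20)}
# grid_search = GridSearchCV(LogisticRegression(), parameters)
# grid_search.fit(train_features, train_labels)

# print('best parameters: ', grid_search.best_params_)
# print('best scores: ', grid_search.best_score_)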

In [None]:
from sklearn.metrics import confusion_matrix

# Fit a linear SVM on the raw training embeddings and predict the test split.
# NOTE: this rebinds `model`, replacing the transformer encoder loaded earlier;
# re-run the checkpoint-loading cell before encoding text again.
model = LinearSVC(max_iter=3000, tol=1e-2)

model.fit(train_raw_features, train_raw_labels)
y_pred = model.predict(test_raw_features)
print(y_pred)

# Use ONE explicit label list for both the matrix and the tick labels. Without
# `labels=`, confusion_matrix covers the union of true and predicted labels, so
# ticks taken from the test labels alone would be misaligned whenever the
# classifier predicts a class that does not occur in the test split.
class_labels = sorted(set(test_raw_labels.values) | set(y_pred))
conf_mat = confusion_matrix(test_raw_labels, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=class_labels,
            yticklabels=class_labels,
            cmap='OrRd'
            )
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
from sklearn import metrics

# Sorted distinct labels that occur in the raw test split.
unique_keys = sorted(set(test_raw_labels.values))
# Fraction of test questions whose predicted label equals the true label.
print('accuracy %s' % metrics.accuracy_score(y_true=test_raw_labels, y_pred=y_pred))

In [None]:
# Score of the fitted classifier on the raw test split, via the estimator's own
# score() method.
model.score(test_raw_features,test_raw_labels)

In [None]:
from sklearn.metrics import confusion_matrix

# Fit a linear SVM on the EDA training embeddings and predict the EDA test split.
# NOTE: this rebinds `model`, replacing whatever it was bound to before.
model = LinearSVC(max_iter=3000,C=1,tol=1e-3)

model.fit(train_eda_features, train_eda_labels)
y_pred = model.predict(test_eda_features)
print(y_pred)

# Derive the axis labels from the EDA labels and predictions that the matrix is
# actually built from (previously they were taken from test_raw_labels), and
# pass the same list as `labels=` so rows/columns and ticks always line up,
# even when a predicted class does not occur in the test split.
class_labels = sorted(set(test_eda_labels.values) | set(y_pred))
conf_mat = confusion_matrix(test_eda_labels, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=class_labels,
            yticklabels=class_labels,
            cmap='OrRd'
            )
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
# Fraction of EDA test questions whose predicted label equals the true label.
print('accuracy %s' % metrics.accuracy_score(y_true=test_eda_labels, y_pred=y_pred))