In [None]:
! pip install transformers
! pip install sentencepiece

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.3 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 46.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 57.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.5 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled P

In [None]:
! wget 'https://dissent.s3-us-west-2.amazonaws.com/data/discourse_EN_FIVE_and_but_because_if_when_2017dec12_test.tsv'

--2021-08-25 18:45:42--  https://dissent.s3-us-west-2.amazonaws.com/data/discourse_EN_FIVE_and_but_because_if_when_2017dec12_test.tsv
Resolving dissent.s3-us-west-2.amazonaws.com (dissent.s3-us-west-2.amazonaws.com)... 52.218.244.193
Connecting to dissent.s3-us-west-2.amazonaws.com (dissent.s3-us-west-2.amazonaws.com)|52.218.244.193|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16668907 (16M) [text/tab-separated-values]
Saving to: ‘discourse_EN_FIVE_and_but_because_if_when_2017dec12_test.tsv’


2021-08-25 18:45:43 (18.8 MB/s) - ‘discourse_EN_FIVE_and_but_because_if_when_2017dec12_test.tsv’ saved [16668907/16668907]



In [None]:
from transformers import T5ForConditionalGeneration, T5EncoderModel, T5Config, T5Tokenizer
import torch
import pandas as pd
from google.colab import drive
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
# Mount Google Drive; load_model() reads its checkpoints from the mounted drive.
drive.mount('/content/gdrive')

# Shared label encoder for the discourse markers; it is fitted inside probe().
enc = LabelEncoder()
# Use the first GPU when one is present, otherwise fall back to CPU so the
# notebook still runs (slowly) on a runtime without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# The TSV has no header row: columns are first sentence, second sentence, marker.
discourse = pd.read_csv('discourse_EN_FIVE_and_but_because_if_when_2017dec12_test.tsv', sep='\t', header=None)
nrows = len(discourse)
discourse.columns = ['sent_1', 'sent_2', 'marker']
total_sample_size = 80000
# Fixed seed so the subsample (and everything computed from it) is reproducible.
sample_seed = 42


def _sample_marker_group(group):
    """Sample a marker group proportionally to its share of the full data set."""
    n_wanted = int((len(group) / nrows) * total_sample_size)
    # Never ask for more rows than the group has (happens when the file is
    # smaller than total_sample_size); sampling without replacement would raise.
    return group.sample(min(len(group), n_wanted), random_state=sample_seed)


# Stratified subsample: each marker keeps the proportion it has in the full file.
discourse_small = discourse.groupby('marker', as_index=False).apply(_sample_marker_group)

In [None]:
# Seed the split so the train/test partition is the same on every run.
TRAIN, TEST = train_test_split(discourse_small, random_state=42)
# assign() returns new frames, avoiding SettingWithCopyWarning on the split results.
TRAIN = TRAIN.assign(part='tr')
TEST = TEST.assign(part='te')
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
df = pd.concat([TRAIN, TEST])

In [None]:
def load_model(checkpoint):
    """Load a T5 checkpoint from Google Drive together with the t5-small tokenizer.

    Parameters
    ----------
    checkpoint : int or str
        Identifier interpolated into the file name ``model_{checkpoint}.pth``.

    Returns
    -------
    (model, tokenizer)
        A ``T5ForConditionalGeneration`` on ``device``, in eval mode, configured to
        return all hidden states, and the pretrained ``t5-small`` ``T5Tokenizer``.

    Raises
    ------
    KeyError
        If the checkpoint file has no ``'model_state_dict'`` entry.
    """
    # map_location remaps tensors saved on another device (e.g. a GPU) onto `device`.
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    m = torch.load(f'gdrive/MyDrive/model_{checkpoint}.pth', map_location=device)
    # Architecture comes from the default T5Config; weights come from the checkpoint.
    model = T5ForConditionalGeneration(T5Config(output_hidden_states=True))
    model.load_state_dict(m['model_state_dict'])
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model.to(device)
    # A model built through its constructor starts in training mode; switch to
    # eval so dropout is disabled and the extracted embeddings are deterministic.
    model.eval()
    return model, tokenizer

In [None]:
def get_emb(df, model, tokenizer):
  """Embed every row's sentence pair with the model's encoder.

  The first two columns of each row are tokenized together as a batch of two
  sentences. For every hidden state returned by the encoder, the token vectors of
  both sentences are averaged into a single vector; padding positions are excluded
  from the average.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose first two columns hold the two sentences of each example.
  model
      Model exposing an ``encoder`` that accepts the tokenizer output.
  tokenizer
      Tokenizer called with ``padding``, ``truncation``, ``return_attention_mask``
      and ``return_tensors='pt'``.

  Returns
  -------
  list of list of numpy.ndarray
      One entry per row; each entry holds one pooled vector per hidden state.
  """
  answers = []
  with torch.no_grad():
    # total= gives tqdm a real progress bar; iterrows() alone has no length.
    for _, row in tqdm(df.iterrows(), total=len(df)):
      sent = row.iloc[:2].tolist()
      batch = tokenizer(sent, padding=True,
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt').to(device)
      # Request hidden states explicitly rather than relying on the model config.
      output = model.encoder(**batch, output_hidden_states=True)
      # Zero out padding so the shorter sentence's pad tokens do not dilute the mean.
      mask = batch['attention_mask'].unsqueeze(-1)
      n_tokens = mask.sum()
      answers.append([((e * mask).sum(dim=(0, 1)) / n_tokens).cpu().numpy()
                      for e in output.hidden_states])
  return answers

In [None]:
def classify(X_train, X_test, y_train, y_test, average='micro', max_iter=1000):
    """Fit a logistic-regression probe and score it on the test split.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like of shape (n_samples,)
        Encoded class labels.
    average : str, default 'micro'
        Averaging mode passed to precision, recall and F1. With single-label
        multiclass targets the 'micro' average of all three equals accuracy;
        pass 'macro' for per-class-balanced scores.
    max_iter : int, default 1000
        Iteration cap for the solver. scikit-learn's own default of 100 is
        prone to stopping before convergence on high-dimensional embeddings.

    Returns
    -------
    tuple
        ``(y_pred, accuracy, precision, recall, f1)``.
    """
    logreg = LogisticRegression(max_iter=max_iter)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    return (
        y_pred,
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, average=average),
        recall_score(y_test, y_pred, average=average),
        f1_score(y_test, y_pred, average=average),
    )

In [None]:
def probe(epoche, df, scores, pred):
    """Probe every encoder hidden state of one checkpoint for the discourse marker.

    Loads the checkpoint, embeds the train and test rows, then fits one
    logistic-regression classifier per hidden state and records its results.

    Parameters
    ----------
    epoche : int or str
        Checkpoint identifier forwarded to ``load_model``.
    df : pandas.DataFrame
        Frame with the sentences in its first two columns, a ``'marker'`` label
        column and a ``'part'`` column holding ``'tr'`` or ``'te'``.
    scores : list
        Extended in place with one ``(accuracy, precision, recall, f1)`` tuple
        per hidden state.
    pred : list
        Extended in place with the test predictions of each hidden state.

    Returns
    -------
    (scores, pred)
        The same two lists that were passed in.

    Raises
    ------
    ValueError
        If either the train or the test partition of ``df`` is empty.
    """
    model, tokenizer = load_model(epoche)
    print('Model is loaded')
    train_df = df[df['part'] == 'tr']
    test_df = df[df['part'] == 'te']
    X_train = get_emb(train_df, model, tokenizer)
    X_test = get_emb(test_df, model, tokenizer)
    if not X_train or not X_test:
        raise ValueError("df must contain rows for both part == 'tr' and part == 'te'")
    print('Embeddings are calculated')
    # Fit the shared encoder on all markers so both partitions use one label mapping.
    enc.fit(df['marker'])
    y_train = enc.transform(train_df['marker'])
    y_test = enc.transform(test_df['marker'])
    # One probe per hidden state; take the count from the embeddings themselves
    # so a model with a different depth needs no code change.
    n_layers = len(X_train[0])
    for a in tqdm(range(n_layers)):
        # float64 keeps the classifier input dtype the same as building from lists.
        train = np.array([x[a] for x in X_train], dtype=np.float64)
        test = np.array([x[a] for x in X_test], dtype=np.float64)
        sc = classify(train, test, y_train, y_test)
        pred.append(sc[0])
        scores.append(sc[1:])
    print('Score is calculated')
    return scores, pred

In [None]:
# Start from empty accumulators; probe() fills them and hands the same lists back.
scores, pred = [], []
# Probe checkpoint 800000, i.e. the file model_800000.pth read by load_model().
scores, pred = probe(800000, df, scores, pred)

In [None]:
# Append one comma-separated line of predicted labels per probed hidden state.
with open('pred.txt', 'a') as out:
  out.writelines(','.join(str(label) for label in layer_pred) + '\n'
                 for layer_pred in pred)

In [None]:
# Prefix each metrics tuple with its position in the list (the hidden-state index).
# NOTE: `scores` is rebound here, so run this cell only once per probe() call;
# a second run would prepend another index and no longer match the five columns.
scores = [[layer, *metrics] for layer, metrics in enumerate(scores)]
score_columns = ['layer', 'accuracy', 'precision', 'recall', 'f1-score']
sc = pd.DataFrame(scores, columns=score_columns)
# Append without a header so the results of several runs accumulate in one file.
sc.to_csv('scores.csv', header=False, mode='a')

In [None]:
# Display the per-layer score table built in the previous cell.
sc