In [1]:
# Import the necessary libraries

import numpy as np
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.preprocessing import LabelEncoder
import pickle
import os
import fastai
from fastai.text import *
from fastai import *
import regex as re
import spacy
from fastai.text.core import tokenize_texts 
import collections 
from collections import Counter
import html

## Load the teacher (sarcasm model) and get predictions on S15-T11 to use them as ground truth for distillation loss

In [2]:
# Load input data

df_train = pd.read_csv("train_8k.csv")
df_trial = pd.read_csv("trial_1k.csv")


In [3]:
df_train = df_train.drop(columns = ['old id', 'new id', 'label', 'int_label'])

In [4]:
df_train.head()

Unnamed: 0,text
0,I just love working for 6.5 hours without a break or anything. Especially when I'm on my period and have awful cramps. #NOT
1,The happy song does not invoke good feelings. It's actually quite extremely annoying. #irony
2,Having to run to the train first thing in the morning is a great way to start the day #not
3,@OmniJerBear haha should have had #sarcasm at the end
4,Really excited for these last few days of school and everything that is going to be due! #sarcasm


In [5]:
df_trial.head()

Unnamed: 0,id,label,text,int_label
0,465424601124974592,-3.0,RT @BeckyMyers3: General studies exam tomorrow and I have about as much common sense and knowledge as a peanut,-3
1,465422141643845632,-3.4,RT @TheTweetOfGod: A racist NBA owner makes about as much sense as a homophobic theater producer.,-3
2,465420676590231552,-2.8,Bit ironic Mo Farrah stars in the Weetabix advert when he shares about as much personality as a semi chipped bowl filled with half of one,-3
3,465420343344394240,-2.8,@JoshFreedman_ It is about as much an election than Katie Price was a singer.,-3
4,465414678978756609,-2.4,Just looked out the window. About as inviting as a tour of Karbul. Today is that day i 'finally' polyfilled that hole in the bathroom! Brb,-2


In [6]:
df_trial = df_trial.drop(columns = ['id', 'label', 'int_label'])

In [7]:
df_train.shape, df_trial.shape

((7985, 1), (592, 1))

In [8]:
df_train = pd.concat([df_train, df_trial], ignore_index=True)

In [9]:
df_train.shape

(8577, 1)

In [10]:
df_train.head()

Unnamed: 0,text
0,I just love working for 6.5 hours without a break or anything. Especially when I'm on my period and have awful cramps. #NOT
1,The happy song does not invoke good feelings. It's actually quite extremely annoying. #irony
2,Having to run to the train first thing in the morning is a great way to start the day #not
3,@OmniJerBear haha should have had #sarcasm at the end
4,Really excited for these last few days of school and everything that is going to be due! #sarcasm


In [11]:
#preprocess used from #1 kernel in kaggle for sentiment140 dataset

import nltk
from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords
import re


HASHTAG_CLEANING_RE = "#\S+"
MENTION_CLEANING_RE = "@\S+"
TEXT_CLEANING_RE = "https?:\S+|http?:\S|[^A-Za-z0-9]+"


stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()



def preprocess(text, lemma=True):
    # Remove link,user and special characters
    text = re.sub(HASHTAG_CLEANING_RE, ' ', str(text).lower())
    text = re.sub(MENTION_CLEANING_RE, ' ', str(text).lower())
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words or token in ['not', 'can']:
            if lemma:
                tokens.append(lemmatizer.lemmatize(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

df_train.text = df_train.text.apply(lambda x: preprocess(x))



df_train.to_csv('preprocessed_tweets.csv',index=False)



In [12]:
re1 = re.compile(r'  +')
BOS = "xxbos"
FLD = "xxfld"


def fixup(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

def get_texts(df, n_lbls=0):

    texts = f'\n {FLD} 1 ' + df.iloc[:,n_lbls].astype(str)

    texts = texts.apply(fixup).values.astype(str)


    tokop = tokenize_texts(texts)
    return tokop


def get_all(df, n_lbls):
    tok = []
    #import pdb
    #pdb.set_trace()
    for i, txt in enumerate(df):
        tok_ = get_texts(txt, n_lbls)
        tok += tok_
    return tok


chunksize = 24000
chunk_tweets = pd.read_csv('preprocessed_tweets.csv',chunksize=chunksize)


# the splitted words of each sentence (not numbers)
tokens = get_all(chunk_tweets, 0)


In [13]:
# Load the original vocab used for training
[df_train_2,df_valid_2,itos, train_tokens_2, valid_tokens_2, trn_lm_2, val_lm_2] = pickle.load(open('teacher - Tweets with sarcasm and irony - Binary/dfs_tokens_fastai_NEW.pkl','rb'))


# Recreate the dictionary used for training to ensure that we use same word indices as in training
stoi = collections.defaultdict(lambda: 0, { v: k for k, v in enumerate(itos) })


# recreate the sequences by replacing the words with their indices from the dictionary (stoi_1)
lm = np.array([ [stoi[o] for o in p] for p in tokens ])

# add a 'tokens' field in our dataframe with the tokenized sequences
df_train['tokens'] = lm



df_train['n_tok'] = df_train['tokens'].apply(len)



In [14]:
df_train.head()

Unnamed: 0,text,tokens,n_tok
0,love working 6 5 hour without break anything especially period awful cramp,"[3, 4, 2, 10, 188, 147, 57, 103, 197, 335, 257, 928, 2251, 1308, 7378]",15
1,happy song not invoke good feeling actually quite extremely annoying,"[3, 4, 2, 117, 224, 1004, 0, 14, 307, 153, 719, 2472, 1705]",13
2,run train first thing morning great way start day,"[3, 4, 2, 164, 342, 52, 34, 99, 19, 36, 91, 9]",12
3,haha end,"[3, 4, 2, 291, 149]",5
4,really excited last day school everything going due,"[3, 4, 2, 26, 388, 61, 9, 68, 191, 43, 635]",11


In [15]:
# Padding the sequences to have same length input tweets
padlen=33 #use the same padlen as in training 
padding_idx=1

def pad (x, padlen, padding_idx):
    out=np.ones(padlen)*padding_idx
    out=out.astype(np.int64)
    if len(x)>=padlen:
        out[:]=x[:padlen]
    else:
        out[:len(x)]=x
    return out

df_train.tokens = df_train.tokens.apply(lambda x: pad(x, padlen, padding_idx))


df_train.loc[df_train['n_tok'] > padlen, ['n_tok']] = padlen



In [16]:
# We must define model's class first , to be able to load it.

n_inp=len(itos)
n_emb=200 #650
n_hidden=200#400
n_layers= 2 # 2
dropout=0.5 # 0.5
wd=1e-7
bidirectional=True
dropout_e=0.2 # 0.5 - changing to 0.4, 0.3 or any dropout value did not make much difference
dropout_o=0.5 #0.5
n_out=1


class sentiment_classifier (nn.Module):
    def __init__(self,n_inp,n_emb,n_hidden,n_layers,bidirectional,bs,device,dropout_e=0.05,dropout=0.5,\
                 dropout_o=0.5,pretrain_mtx=None,n_out=1,padding_idx=1,n_filters=100,filter_sizes=[3,4,5]):
        super().__init__()
        self.n_inp,self.n_emb,self.n_hidden,self.n_layers,self.bidirectional,self.bs,self.device,self.pretrain_mtx,self.padding_idx=\
                            n_inp,n_emb,n_hidden,n_layers,bidirectional,bs,device,pretrain_mtx,padding_idx
        self.n_out,self.n_filters,self.filter_sizes=n_out,n_filters,filter_sizes
        self.dropout_e,self.dropout,self.dropout_o=dropout_e,dropout,dropout_o

        self.create_architecture()
        if pretrain_mtx is not None:
            print (f'initializing glove with {pretrain_mtx.shape}')
            self.initialize_glove()
        self.init_hidden()
        self.criterion=nn.BCEWithLogitsLoss()

    def set_dropouts(self, dropout, dropout_o, dropout_e):
        self.dropout, self.dropout_o, self.dropout_e = dropout, dropout_o, dropout_e


    def freeze_embedding(self):
        self.encoder.weight.requires_grad=False

    def unfreeze_embedding(self):
        self.encoder.weight.requires_grad=True

    def initialize_glove(self):
        self.encoder.weight.data.copy_(torch.Tensor(self.pretrain_mtx))

    def init_hidden(self):
        # Initialize hidden
        self.hidden=(Variable(torch.zeros(self.n_layers,self.bs,self.n_hidden,requires_grad=False).to(self.device)),
                     Variable(torch.zeros(self.n_layers,self.bs,self.n_hidden,requires_grad=False).to(self.device)))


    def create_architecture(self):
        ###################################
        # Embedding layer - common to both
        ###################################
        self.dropout_enc=nn.Dropout(self.dropout_e)
        self.encoder=nn.Embedding(self.n_inp,self.n_emb,padding_idx=self.padding_idx)

        #######################################
        # For RNN #############################
        #######################################
        # Embedding Layer: Embedding layer just maps each word to an index. n_inp to n_emb mapping is all it does
            # input to this is of shape n_batch * n_seq
         # LSTM Layer
        self.lstm=nn.LSTM(self.n_emb,self.n_hidden,self.n_layers,batch_first=True,dropout=self.dropout,\
                          bidirectional=self.bidirectional)
          # embs are going to be of shape n_batch * n_seq * n_emb
        self.dropout_op=nn.Dropout(self.dropout_o)

        self.avg_pool1d=torch.nn.AdaptiveAvgPool1d(1)
        self.max_pool1d=torch.nn.AdaptiveMaxPool1d(1)


        #######################################
        # For CNN #############################
        #######################################    
        #embedding dimension is the "depth" of the filter and the number of tokens in the sentence is the width.
        self.conv_0=torch.nn.Conv1d (self.n_emb,self.n_filters,kernel_size=self.filter_sizes[0])
        self.conv_1=torch.nn.Conv1d (self.n_emb,self.n_filters,kernel_size=self.filter_sizes[1])
        self.conv_2=torch.nn.Conv1d(self.n_emb,self.n_filters,kernel_size=self.filter_sizes[2])

        self.fc=nn.Linear(len(self.filter_sizes)*self.n_filters+self.n_hidden*4,self.n_out)



    def forward (self,Xb,Xb_lengths):

        ####RNN PORTION
        embs=self.dropout_enc(self.encoder(Xb))
        if Xb.size(0) < self.bs:
            self.hidden=(self.hidden[0][:,:Xb.size(0),:].contiguous(),
            self.hidden[1][:,:Xb.size(0),:].contiguous())
        packed_embs = pack_padded_sequence(embs,Xb_lengths.cpu(),batch_first=True, enforce_sorted=False)
        lstm_out,(hidden,cell)=self.lstm(packed_embs)
        lstm_out,lengths=pad_packed_sequence(lstm_out,batch_first=True)
        hidden = self.dropout_op(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        avg_pool=self.avg_pool1d(lstm_out.permute(0,2,1)).view(Xb.size(0),-1)
        max_pool=self.max_pool1d(lstm_out.permute(0,2,1)).view(Xb.size(0),-1)

        #CNN Portion
        new_embs=embs.permute(0,2,1)        
        conved_0=torch.relu(self.conv_0(new_embs))
        conved_1=torch.relu(self.conv_1(new_embs))
        conved_2=torch.relu(self.conv_2(new_embs)) 
        max_pool1d=torch.nn.MaxPool1d(conved_0.shape[2])
        pooled_0=max_pool1d(conved_0).squeeze(2)
        max_pool1d=torch.nn.MaxPool1d(conved_1.shape[2])
        pooled_1=max_pool1d(conved_1).squeeze(2)
        max_pool1d=torch.nn.MaxPool1d(conved_2.shape[2])
        pooled_2=max_pool1d(conved_2).squeeze(2)
        cat_cnn = self.dropout_op(torch.cat([pooled_0,pooled_1,pooled_2],dim=1))

        ## Concatenate
        big_out=torch.cat([cat_cnn,hidden,max_pool],dim=1)
        preds=self.fc(big_out)

        preds = torch.sigmoid(preds.view(-1))


        return preds


In [17]:
def split_dataframe(df, chunk_size = 100): 
        chunks = list()
        num_chunks = len(df) // chunk_size + 1
        for i in range(num_chunks):
            chunks.append(df[i*chunk_size:(i+1)*chunk_size])
        return chunks


In [18]:
# Load the model

COMBO_PATH = "teacher - Tweets with sarcasm and irony - Binary/tuned saves"

model_sentiment = torch.load (f'{COMBO_PATH}/model_sentiment')

device = "cuda:0"
model_sentiment = model_sentiment.to(device)


iterable = split_dataframe(df_train)


# Get predictions on our new data

y_pred_teacher = np.zeros(df_train.shape[0])
k=0

for data in iterable:
    data = data.reset_index(drop=True)
    x = data['tokens']
    x_len = data['n_tok']

    x = torch.tensor(x)
    x_len = torch.tensor(x_len)

    x = x.to(device)
    x_len = x_len.to(device)

    y_pred = model_sentiment(x, x_len)

    y_pred = y_pred.to("cpu")
    y_pred = y_pred.detach().numpy()

    y_pred_teacher[k:k+100] = y_pred
    k+=100

    del x
    del x_len

    torch.cuda.empty_cache()



In [19]:
y_pred_teacher

array([0.07070994, 0.40288487, 0.32499784, ..., 0.11287053, 0.16063896,
       0.1547837 ])

In [20]:
np.savetxt('teacher_labels.csv', y_pred_teacher, delimiter=',')

# Start distillation from here

## Load the teacher ground truth (used for distillation loss)

In [2]:
teacher_labels = np.loadtxt('teacher_labels.csv', delimiter=',')

In [3]:
teacher_labels

array([0.07070994, 0.40288487, 0.32499784, ..., 0.11287053, 0.16063896,
       0.1547837 ])

## Setup the student for training (potamias model)

In [4]:
# Initialize the roberta tokenizer and model 

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')

In [5]:
device="cuda:0"
roberta_model = roberta_model.to(device)

In [6]:
# Load input data


df_train = pd.read_csv("train_8k.csv")
df_valid = pd.read_csv("test_4k.csv")
df_trial = pd.read_csv("trial_1k.csv")


In [7]:
df_train.head()

Unnamed: 0,old id,new id,label,text,int_label
0,472189928340606976,519632796449378304,-3.99,I just love working for 6.5 hours without a break or anything. Especially when I'm on my period and have awful cramps. #NOT,-4
1,472440774785650688,519632825167773696,-3.92,The happy song does not invoke good feelings. It's actually quite extremely annoying. #irony,-4
2,473085653454827520,519632853982650370,-2.22,Having to run to the train first thing in the morning is a great way to start the day #not,-2
3,463445012374499328,519632882940129280,-0.56,@OmniJerBear haha should have had #sarcasm at the end,-1
4,463501257110724610,519632911473987584,-1.27,Really excited for these last few days of school and everything that is going to be due! #sarcasm,-1


In [8]:
df_train.shape

(7985, 5)

In [9]:
df_train = df_train.drop(columns = ['old id', 'new id', 'label'])

In [10]:
df_train['int_label'].value_counts()

-3    2966
-2    2931
-1     860
-4     363
 0     344
 2     195
 1     163
 3     106
 4      49
-5       6
 5       2
Name: int_label, dtype: int64

In [11]:
df_train.rename(columns={'int_label': 'label'}, inplace=True)
df_train = df_train[["label", "text"]]

In [12]:
df_train.head()

Unnamed: 0,label,text
0,-4,I just love working for 6.5 hours without a break or anything. Especially when I'm on my period and have awful cramps. #NOT
1,-4,The happy song does not invoke good feelings. It's actually quite extremely annoying. #irony
2,-2,Having to run to the train first thing in the morning is a great way to start the day #not
3,-1,@OmniJerBear haha should have had #sarcasm at the end
4,-1,Really excited for these last few days of school and everything that is going to be due! #sarcasm


In [13]:
df_valid.head()

Unnamed: 0,id,label,category,text
0,5.376513e+17,-3,sarcasm,So great to come back to my dorm and find that my roommate rearranged my things for me. How sweet. #sarcasm #PISSED
1,5.383325e+17,-2,sarcasm,If jean howie my neighbour is at my mums wedding it will just make the whole day cause she really likes me #sarcasm
2,5.380508e+17,-3,sarcasm,"@KTHopkins @MissKatiePrice LOL@ katie hopkins u can talk, shagging married men is your forté isn't it? Great person 2 point the finger #NOT"
3,5.380175e+17,-3,sarcasm,"@stuarteagle QPR? They looked terrible yesterday. Ferdinand, what a player #Not"
4,5.379588e+17,0,sarcasm,"Next! Jamie Foxx ft. 2 Chainz ""Party Ain't a Party"" - Tune in and Tweet us #HOT or #NOT!!"


In [14]:
df_valid = df_valid.drop(columns = ['id', 'category'])

In [15]:
df_trial.head()

Unnamed: 0,id,label,text,int_label
0,465424601124974592,-3.0,RT @BeckyMyers3: General studies exam tomorrow and I have about as much common sense and knowledge as a peanut,-3
1,465422141643845632,-3.4,RT @TheTweetOfGod: A racist NBA owner makes about as much sense as a homophobic theater producer.,-3
2,465420676590231552,-2.8,Bit ironic Mo Farrah stars in the Weetabix advert when he shares about as much personality as a semi chipped bowl filled with half of one,-3
3,465420343344394240,-2.8,@JoshFreedman_ It is about as much an election than Katie Price was a singer.,-3
4,465414678978756609,-2.4,Just looked out the window. About as inviting as a tour of Karbul. Today is that day i 'finally' polyfilled that hole in the bathroom! Brb,-2


In [16]:
df_trial = df_trial.drop(columns = ['id', 'label'])

In [17]:
df_trial.rename(columns={'int_label': 'label'}, inplace=True)
df_trial = df_trial[["label", "text"]]

In [18]:
df_train.shape, df_trial.shape, df_valid.shape

((7985, 2), (592, 2), (3957, 2))

In [19]:
df_train = pd.concat([df_train, df_trial], ignore_index=True)

In [20]:
df_train.shape, df_valid.shape

((8577, 2), (3957, 2))

In [21]:
df_train.head()

Unnamed: 0,label,text
0,-4,I just love working for 6.5 hours without a break or anything. Especially when I'm on my period and have awful cramps. #NOT
1,-4,The happy song does not invoke good feelings. It's actually quite extremely annoying. #irony
2,-2,Having to run to the train first thing in the morning is a great way to start the day #not
3,-1,@OmniJerBear haha should have had #sarcasm at the end
4,-1,Really excited for these last few days of school and everything that is going to be due! #sarcasm


In [22]:
df_valid.head()

Unnamed: 0,label,text
0,-3,So great to come back to my dorm and find that my roommate rearranged my things for me. How sweet. #sarcasm #PISSED
1,-2,If jean howie my neighbour is at my mums wedding it will just make the whole day cause she really likes me #sarcasm
2,-3,"@KTHopkins @MissKatiePrice LOL@ katie hopkins u can talk, shagging married men is your forté isn't it? Great person 2 point the finger #NOT"
3,-3,"@stuarteagle QPR? They looked terrible yesterday. Ferdinand, what a player #Not"
4,0,"Next! Jamie Foxx ft. 2 Chainz ""Party Ain't a Party"" - Tune in and Tweet us #HOT or #NOT!!"


In [23]:
myle = LabelEncoder()

In [24]:
df_train['label'] = myle.fit_transform(df_train['label'])

df_valid['label'] = myle.fit_transform(df_valid['label'])

In [25]:
df_train['label'].value_counts()

2     3191
3     3067
4      925
1      410
5      377
7      218
6      196
8      126
9       56
0        8
10       3
Name: label, dtype: int64

In [26]:
df_valid['label'].value_counts() 

3     1530
2      730
4      671
5      293
8      201
6      164
7      150
9      111
1       99
10       4
0        4
Name: label, dtype: int64

In [27]:
tweets_train = df_train['text']
tweets_valid = df_valid['text']

tweets_train = tweets_train.tolist()
tweets_valid = tweets_valid.tolist()

In [28]:
tokens_train = tokenizer(tweets_train, truncation=True)
tokens_valid = tokenizer(tweets_valid, truncation=True)

In [29]:
df_train['tokens'] = tokens_train['input_ids']
df_valid['tokens'] = tokens_valid['input_ids']

In [30]:
df_train['n_tok'] = df_train['tokens'].apply(len)
df_valid['n_tok'] = df_valid['tokens'].apply(len)

df_train['n_tok'].describe()

count    8577.000000
mean       29.213361
std        10.932552
min         8.000000
25%        22.000000
50%        28.000000
75%        35.000000
max       512.000000
Name: n_tok, dtype: float64

In [31]:
df_train.head()

Unnamed: 0,label,text,tokens,n_tok
0,1,I just love working for 6.5 hours without a break or anything. Especially when I'm on my period and have awful cramps. #NOT,"[0, 100, 95, 657, 447, 13, 231, 4, 245, 722, 396, 10, 1108, 50, 932, 4, 17570, 77, 38, 437, 15, 127, 675, 8, 33, 11522, 3977, 9782, 4, 849, 37049, 2]",32
1,1,The happy song does not invoke good feelings. It's actually quite extremely annoying. #irony,"[0, 133, 1372, 2214, 473, 45, 32550, 205, 6453, 4, 85, 18, 888, 1341, 2778, 19887, 4, 849, 853, 6119, 2]",21
2,3,Having to run to the train first thing in the morning is a great way to start the day #not,"[0, 15852, 7, 422, 7, 5, 2341, 78, 631, 11, 5, 662, 16, 10, 372, 169, 7, 386, 5, 183, 849, 3654, 2]",23
3,4,@OmniJerBear haha should have had #sarcasm at the end,"[0, 1039, 673, 119, 5107, 25786, 40237, 46116, 197, 33, 56, 849, 29, 9636, 16836, 23, 5, 253, 2]",19
4,4,Really excited for these last few days of school and everything that is going to be due! #sarcasm,"[0, 30327, 2283, 13, 209, 94, 367, 360, 9, 334, 8, 960, 14, 16, 164, 7, 28, 528, 328, 849, 29, 9636, 16836, 2]",24


In [32]:
type(teacher_labels)

numpy.ndarray

In [33]:
teacher_labels = pd.Series(teacher_labels)

df_train['teacher_labels'] = teacher_labels
df_valid['teacher_labels'] = teacher_labels

In [34]:
df_train.head()

Unnamed: 0,label,text,tokens,n_tok,teacher_labels
0,1,I just love working for 6.5 hours without a break or anything. Especially when I'm on my period and have awful cramps. #NOT,"[0, 100, 95, 657, 447, 13, 231, 4, 245, 722, 396, 10, 1108, 50, 932, 4, 17570, 77, 38, 437, 15, 127, 675, 8, 33, 11522, 3977, 9782, 4, 849, 37049, 2]",32,0.07071
1,1,The happy song does not invoke good feelings. It's actually quite extremely annoying. #irony,"[0, 133, 1372, 2214, 473, 45, 32550, 205, 6453, 4, 85, 18, 888, 1341, 2778, 19887, 4, 849, 853, 6119, 2]",21,0.402885
2,3,Having to run to the train first thing in the morning is a great way to start the day #not,"[0, 15852, 7, 422, 7, 5, 2341, 78, 631, 11, 5, 662, 16, 10, 372, 169, 7, 386, 5, 183, 849, 3654, 2]",23,0.324998
3,4,@OmniJerBear haha should have had #sarcasm at the end,"[0, 1039, 673, 119, 5107, 25786, 40237, 46116, 197, 33, 56, 849, 29, 9636, 16836, 23, 5, 253, 2]",19,0.228498
4,4,Really excited for these last few days of school and everything that is going to be due! #sarcasm,"[0, 30327, 2283, 13, 209, 94, 367, 360, 9, 334, 8, 960, 14, 16, 164, 7, 28, 528, 328, 849, 29, 9636, 16836, 2]",24,0.295919


In [35]:
df_train['text'][0]

"I just love working for 6.5 hours without a break or anything. Especially when I'm on my period and have awful cramps. #NOT"

In [36]:
df_valid.head()

Unnamed: 0,label,text,tokens,n_tok,teacher_labels
0,2,So great to come back to my dorm and find that my roommate rearranged my things for me. How sweet. #sarcasm #PISSED,"[0, 2847, 372, 7, 283, 124, 7, 127, 18344, 8, 465, 14, 127, 25537, 37060, 17770, 127, 383, 13, 162, 4, 1336, 4045, 4, 849, 29, 9636, 16836, 849, 510, 17588, 1691, 2]",33,0.07071
1,3,If jean howie my neighbour is at my mums wedding it will just make the whole day cause she really likes me #sarcasm,"[0, 1106, 1236, 12001, 141, 324, 127, 14915, 16, 23, 127, 475, 8014, 3312, 24, 40, 95, 146, 5, 1086, 183, 1303, 79, 269, 3829, 162, 849, 29, 9636, 16836, 2]",31,0.402885
2,2,"@KTHopkins @MissKatiePrice LOL@ katie hopkins u can talk, shagging married men is your forté isn't it? Great person 2 point the finger #NOT","[0, 1039, 530, 3732, 1517, 7327, 787, 22885, 27029, 324, 36677, 39687, 1039, 449, 415, 324, 13591, 7327, 1717, 64, 1067, 6, 1481, 12771, 2997, 604, 16, 110, 15016, 1140, 965, 75, 24, 116, 2860, 621, 132, 477, 5, 8411, 849, 37049, 2]",43,0.324998
3,2,"@stuarteagle QPR? They looked terrible yesterday. Ferdinand, what a player #Not","[0, 1039, 620, 41962, 242, 21851, 1209, 4454, 116, 252, 1415, 6587, 2350, 4, 28855, 6, 99, 10, 869, 849, 7199, 2]",22,0.228498
4,5,"Next! Jamie Foxx ft. 2 Chainz ""Party Ain't a Party"" - Tune in and Tweet us #HOT or #NOT!!","[0, 19192, 328, 6541, 2063, 1178, 16935, 4, 132, 18610, 329, 22, 38210, 32431, 75, 10, 1643, 113, 111, 27879, 11, 8, 12244, 201, 849, 725, 3293, 50, 849, 37049, 12846, 2]",32,0.295919


In [37]:
class ds_sentiment:
    def __init__ (self,df,bs,padlen=64,xvar='tokens',yvar='label',len_var='n_tok',y_teach='teacher_labels',padding_idx=1):
        self.x=df[xvar]
        self.y=df[yvar]
        self.y_teach=df[y_teach]
        self.padlen=padlen
        self.padding_idx=padding_idx
        self.len_var=df[len_var]
        self.bs=bs
    
        self.len_var=self.len_var.clip(0,padlen)
    
    def pad (self,x):
        out=np.ones(self.padlen)*self.padding_idx
        out=out.astype(np.int64)
        if len(x)>=self.padlen:
            out[:]=x[:self.padlen]
        else:
            out[:len(x)]=x
        return out
        
    def __getitem__(self,idx):
        #import pdb
        #pdb.set_trace()
        return self.pad(self.x.iloc[idx]),self.y.iloc[idx],self.len_var.iloc[idx],self.y_teach.iloc[idx]
    
    def __len__(self):
        return len(self.x)

In [38]:
bs = 10
bptt= 70
padlen = 50

df_train.loc[df_train['n_tok'] > padlen, ['n_tok']] = padlen
df_valid.loc[df_valid['n_tok'] > padlen, ['n_tok']] = padlen

df_train['n_tok'].describe()

count    8577.000000
mean       29.039408
std         9.143004
min         8.000000
25%        22.000000
50%        28.000000
75%        35.000000
max        50.000000
Name: n_tok, dtype: float64

In [39]:
dstrain=ds_sentiment(df_train,bs,padlen)

In [40]:
dsvalid=ds_sentiment(df_valid,bs,padlen)

In [41]:
dltrain = DataLoader(dstrain,bs,True)
dlvalid = DataLoader(dsvalid,bs,False)

In [42]:
for xb,yb,xlen,yb_teach in dltrain:
    break

In [43]:
xb, yb, xlen, yb_teach

(tensor([[    0,  2527,   203,  1531,   849,    29,  9636, 16836,  2054,   640,
             90,     4,   876,    73,   246,  4148,  3998,   530,    29,  1000,
            530,   438,     2,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
         [    0,   534, 16037,   657,    77,    82,   283,    62,     8,  1137,
             47,    59,   643,     4,   849,  3654,     2,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
         [    0,   771,  4057,   216,   402,   116,    38,  4157,  7739,    13,
          15734,     4,   849,  1193, 23145,   849,  3654,     2,     1,     1,
              1,     1,     1,     1, 

In [44]:
n_emb=768 #650
n_hidden=64 #400
n_layers= 2 # 2
dropout=0.1 # 0.5
wd=1e-5
bidirectional=True
dropout_e=0.2 # 0.5 - changing to 0.4, 0.3 or any dropout value did not make much difference
dropout_o=0.1 #0.5
n_out=11

In [45]:
class student_classifier (nn.Module):
    def __init__(self,roberta_model,n_emb,n_hidden,n_layers,bidirectional,bs,device,dropout_e=0.05,dropout=0.5,\
                 dropout_o=0.5,n_out=11,n_filters=100,filter_sizes=[3,4,5]):
        super().__init__()
        self.roberta_model,self.n_emb,self.n_hidden,self.n_layers,self.bidirectional,self.bs,self.device=\
                            roberta_model,n_emb,n_hidden,n_layers,bidirectional,bs,device
        self.n_out,self.n_filters,self.filter_sizes=n_out,n_filters,filter_sizes
        self.dropout_e,self.dropout,self.dropout_o=dropout_e,dropout,dropout_o
        
        self.create_architecture()
        self.init_hidden()
        self.criterion=nn.CrossEntropyLoss()
        self.distil_criterion=nn.MSELoss()
    
    def set_dropouts(self, dropout, dropout_o, dropout_e):
        self.dropout, self.dropout_o, self.dropout_e = dropout, dropout_o, dropout_e
    
    
    def freeze_embedding(self):
        
        for param in self.encoder.parameters():
            param.requires_grad = False
    
         
    def unfreeze_embedding(self):
        
        for param in self.encoder.parameters():
            param.requires_grad = True
    
    def init_hidden(self):
        # Initialize hidden
        self.hidden=(Variable(torch.zeros(self.n_layers,self.bs,self.n_hidden,requires_grad=False).to(self.device)),
                     Variable(torch.zeros(self.n_layers,self.bs,self.n_hidden,requires_grad=False).to(self.device)))
    

    def create_architecture(self):
        
        #self.dropout_enc = nn.Dropout(self.dropout_e)
        self.encoder = self.roberta_model
        
        
        # LSTM Layer
        self.lstm = nn.LSTM(self.n_emb,self.n_hidden,self.n_layers,batch_first=True,dropout=self.dropout,\
                          bidirectional=self.bidirectional)
        
        # embs are going to be of shape n_batch * n_seq * n_emb
        #self.dropout_op = nn.Dropout(self.dropout_o)
        
        self.max_pool1d = torch.nn.MaxPool1d(50, stride=1)
        
        self.flat = nn.Flatten()
        

        self.project = nn.Linear(896,64)
        
        self.fc = nn.Linear(64,self.n_out)
        
        self.paralel = nn.Linear(64,1)

        
    def forward (self,Xb,Yb,Xb_lengths,Yb_teach):
        
        ####RNN PORTION
        roberta_out = self.encoder(Xb)
        last_hidden_states = roberta_out.last_hidden_state
        embs = last_hidden_states
        #print('embs : ', embs.shape)
        
        
        #packed_embs = pack_padded_sequence(embs,Xb_lengths.cpu(),batch_first=True, enforce_sorted=False)
        
        
        lstm_out,(hidden,cell)=self.lstm(embs)
        #print('lstm_out : ', lstm_out.shape)
        
        
        #lstm_out,lengths=pad_packed_sequence(lstm_out,batch_first=True)
        
        
        
        ## Concatenate
        catted = torch.cat([embs.permute(0,2,1),lstm_out.permute(0,2,1)],dim=1)
        #print('catted : ', catted.shape)
        
        
        ## Pooling
        max_pool = self.max_pool1d(catted)
        #print('max_pool : ', max_pool.shape)
        
        
        ## Project to latent vectors
        latent = self.project(self.flat(max_pool))
        #print('latent : ', latent.shape)
        
        
        ## Reshape
        #ok = max_pool.permute(0,2,1)
        #ok = ok.reshape(ok.size(0),ok.size(1)*ok.size(2))
        #print('ok : ', ok.shape)
        
        
        #Final output
        student_preds = self.fc(latent)
        
        distil_preds = torch.sigmoid(self.paralel(latent))
        
        distil_preds=torch.flatten(distil_preds)


        student_loss = self.criterion(student_preds,Yb.contiguous().long().view(-1))
        
        distil_loss = self.distil_criterion(distil_preds,Yb_teach.contiguous().float().view(-1))
        
        
        final_loss = student_loss + distil_loss

        
        return student_preds,final_loss

In [46]:
def accuracy_multinomial(preds, actual, device="cpu", cutoff=0.5):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    ela=F.softmax(preds, dim=1)
    preds=ela.max(1)[1]
    correct=preds==actual 
    acc = correct.float().sum()/len(correct)
    return acc


In [47]:
class Learner():
    def __init__(self,model,optimizer,metric_fn,device,bptt=12,print_every=5,clip_val=None,\
                 cycle_mult=0,lr_decay=1,wd_mult=1):
        self.model,self.optimizer,self.metric_fn,self.device,self.print_every,self.bptt,self.losses,self.clip_val=\
            model,optimizer,metric_fn,device,print_every,bptt,[],clip_val
        self.n_epochs=1
        self.cycle_mult,self.lr_decay=cycle_mult,lr_decay
        self.wd_mult=wd_mult
        for param_group in self.optimizer.param_groups:
            self.start_lr=param_group['lr']
            self.start_wd=param_group['weight_decay']
        self.wd=self.start_wd
        self.lr=self.start_lr
        self.n_epoch=0
        self.lrs=[1e-2,5e-3,1e-4,5e-4]
        self.preds,self.preds_valid,self.trainY,self.actual=[],[],[],[]
        
    def fit (self,Xb,Yb,Xlen,Yb_teach,mode_train=True):
        if mode_train:
            self.model.train()
        else:
            self.model.eval()
            
        preds,loss=self.model(Xb,Yb,Xlen,Yb_teach)
        
        with torch.no_grad():
            acc=self.metric_fn(preds,Yb.view(-1),self.device)
            acc=acc.item()
            
            
            if mode_train:
                self.trainY.append(Yb.view(-1))
                self.preds.append(preds.data)
            else:
                self.actual.append(Yb.view(-1))
                self.preds_valid.append(preds.data)

            
            del preds
        
        if mode_train:
            if 1==0:
                lr =self.lrs[torch.randint(0,4,(1,))]
                for param_group in self.optimizer.param_groups:
                    param_group['lr']=lr
            
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        myloss=loss.item()
        del loss
        
        if self.clip_val is not None:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_val)
        
        return myloss, acc
    
    def lr_find (self,start_lr,end_lr,iterator,n_batch):
        losses,lrs=[],[]
        ratio=end_lr/start_lr
        num_steps=n_batch
        lr=start_lr
        for i in range(num_steps):            
            lr=lr*(end_lr/start_lr)**(1/num_steps)
            lrs.append(lr)
        self.lrs=lrs
        self.run_epoch(iterator,mode_train=True,lrs=lrs)
    
    def run_epoch(self,iterator,mode_train,lrs=None):
        epoch_loss,epoch_acc,i,k=0,0,0,0
        self.model.init_hidden()
        for Xb,Yb,Xlen,Yb_teach in iterator:
            Xb=Xb.to(self.device)
            Yb=Yb.to(self.device)
            Xlen=Xlen.to(self.device)
            Yb_teach=Yb_teach.to(self.device)
            
            if lrs is not None:
                lr=lrs[k]
                for param_group in self.optimizer.param_groups:
                    param_group['lr']=lr 
            

            loss,acc=self.fit(Xb,Yb,Xlen,Yb_teach,mode_train)
            
            if lrs is not None:
                self.losses.append(loss)
            
            
            epoch_loss+=loss
            epoch_acc+=acc
            
            k=k+1
            if k%self.print_every == 0:
                if k:
                    print (f'Batch:{k} {epoch_loss/(k)}  {epoch_acc/(k)}')  
                    torch.cuda.empty_cache()
        epoch_loss=epoch_loss/len(iterator)
        epoch_acc=epoch_acc/len(iterator)
            
        return epoch_loss,epoch_acc
    
    def plot_lrs(self, n_roll=1):
        import seaborn as sns
        ax=sns.lineplot(x=self.lrs,y=pd.Series(self.losses).rolling(n_roll).mean())
        ax.set_xscale('log')
        ax.set_ylabel('Loss')
        ax.set_xlabel('Learning Rate')

     
    def run_epochs(self,dltrain,dlvalid,n_epochs=1):
        
        if self.cycle_mult > 0:
            reset_cycle=self.cycle_mult
        
        for epoch in range(n_epochs):                

            
            loss,acc=self.run_epoch(dltrain,True)
            lossv,accv=self.run_epoch(dlvalid,mode_train=False)
            print (f'Epoch:{epoch} Learning rate {self.lr} Weight Decay {self.wd} Train Loss:{loss} Train Accuracy:{acc} Valid Loss:{lossv} Valid Accuracy:{accv}')
        
            if self.cycle_mult:
                if self.n_epoch==reset_cycle:
                    self.lr=self.start_lr
                    #self.wd=self.start_wd
                    reset_cycle=self.n_epoch+reset_cycle
                else:
                    self.lr*=(self.lr_decay**self.n_epoch)  
                    if self.n_epoch>1:
                        self.wd*=self.wd_mult
            self.n_epoch+=1
                
                
            for param_group in self.optimizer.param_groups:
                param_group['lr']=self.lr
                #param_group['weight_decay']=self.wd

In [48]:
device="cuda:0"

In [49]:
model_sentiment=student_classifier (roberta_model,n_emb,n_hidden,n_layers,bidirectional,bs,device,dropout_e,dropout,\
                 dropout_o,n_out=11)
model_sentiment=model_sentiment.to(device)

In [50]:
def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model_sentiment):,} trainable parameters')

The model has 125,230,156 trainable parameters


In [51]:
optimizer = torch.optim.Adam(model_sentiment.parameters(),lr=2e-5, eps=1e-6, weight_decay=wd)

In [52]:
learner = Learner(model_sentiment,optimizer,accuracy_multinomial,device,bptt,100,0.25,cycle_mult=10)

In [53]:
learner.lr_decay, learner.wd_mult

(1, 1)

In [54]:
#model_sentiment.freeze_embedding()

In [55]:
#model_sentiment.unfreeze_embedding()

In [117]:
learner.run_epochs(dltrain,dlvalid,1)

Batch:100 0.6315232857316733  0.793000015616417
Batch:200 0.6088467305526137  0.7930000133812427
Batch:300 0.6209595644225677  0.7856666786472003
Batch:400 0.6147455154173076  0.7907500125467777
Batch:500 0.6262140686362981  0.7852000125646591
Batch:600 0.6347145994131764  0.7820000124971072
Batch:700 0.6363866459152528  0.7827142980269023
Batch:800 0.6457234652619809  0.7765000122599304
Batch:100 1.5428961971402169  0.49500000581145287
Batch:200 1.9741957773268224  0.39000000566244125
Batch:300 2.0591005207101505  0.36233333870768547
Epoch:0 Learning rate 2e-05 Weight Decay 1e-05 Train Loss:0.6513372115651905 Train Accuracy:0.7737928862511972 Valid Loss:2.0752233863629477 Valid Accuracy:0.34931457987186887


## Get the cosine similarity metric via Semeval's script

In [57]:
df_test = pd.read_csv("test_3957_preprocessed.tsv", sep='\t')

In [58]:
df_test.head()

Unnamed: 0,id,label,category,text
0,537651335752323073,-3,sarcasm,great come back dorm find roommate rearranged thing sweet
1,538332513408937986,-2,sarcasm,jean howie neighbour mum wedding make whole day cause really like
2,538050779824025600,-3,sarcasm,lol katie hopkins u talk shagging married men fort great person 2 point finger
3,538017499724279809,-3,sarcasm,qpr looked terrible yesterday ferdinand player
4,537958766910926848,0,sarcasm,next jamie foxx ft 2 chainz party party tune tweet u


In [59]:
df_valid.head()

Unnamed: 0,label,text,tokens,n_tok,teacher_labels
0,2,So great to come back to my dorm and find that my roommate rearranged my things for me. How sweet. #sarcasm #PISSED,"[0, 2847, 372, 7, 283, 124, 7, 127, 18344, 8, 465, 14, 127, 25537, 37060, 17770, 127, 383, 13, 162, 4, 1336, 4045, 4, 849, 29, 9636, 16836, 849, 510, 17588, 1691, 2]",33,0.07071
1,3,If jean howie my neighbour is at my mums wedding it will just make the whole day cause she really likes me #sarcasm,"[0, 1106, 1236, 12001, 141, 324, 127, 14915, 16, 23, 127, 475, 8014, 3312, 24, 40, 95, 146, 5, 1086, 183, 1303, 79, 269, 3829, 162, 849, 29, 9636, 16836, 2]",31,0.402885
2,2,"@KTHopkins @MissKatiePrice LOL@ katie hopkins u can talk, shagging married men is your forté isn't it? Great person 2 point the finger #NOT","[0, 1039, 530, 3732, 1517, 7327, 787, 22885, 27029, 324, 36677, 39687, 1039, 449, 415, 324, 13591, 7327, 1717, 64, 1067, 6, 1481, 12771, 2997, 604, 16, 110, 15016, 1140, 965, 75, 24, 116, 2860, 621, 132, 477, 5, 8411, 849, 37049, 2]",43,0.324998
3,2,"@stuarteagle QPR? They looked terrible yesterday. Ferdinand, what a player #Not","[0, 1039, 620, 41962, 242, 21851, 1209, 4454, 116, 252, 1415, 6587, 2350, 4, 28855, 6, 99, 10, 869, 849, 7199, 2]",22,0.228498
4,5,"Next! Jamie Foxx ft. 2 Chainz ""Party Ain't a Party"" - Tune in and Tweet us #HOT or #NOT!!","[0, 19192, 328, 6541, 2063, 1178, 16935, 4, 132, 18610, 329, 22, 38210, 32431, 75, 10, 1643, 113, 111, 27879, 11, 8, 12244, 201, 849, 725, 3293, 50, 849, 37049, 12846, 2]",32,0.295919


In [60]:
df_test['label'] = df_test['label']+5

In [61]:
(df_test['label'] == df_valid['label']).value_counts()

True    3957
Name: label, dtype: int64

In [62]:
df_test.shape

(3957, 4)

In [63]:
ids = df_test['id']

## Catch up here to update the outputs while training

In [118]:
proper_preds_valid = [item.max(1)[1] for item in learner.preds_valid]

In [119]:
#find the outputs of our model for the test data

from itertools import chain

preds_valid=list(chain.from_iterable(proper_preds_valid))[-df_valid.shape[0]:]
actual=list(chain.from_iterable(learner.actual))[-df_valid.shape[0]:]

preds_valid=[x.item() for x in preds_valid]
actual=[x.item() for x in actual]

In [120]:
df_valid.head()

Unnamed: 0,label,text,tokens,n_tok,teacher_labels
0,2,So great to come back to my dorm and find that my roommate rearranged my things for me. How sweet. #sarcasm #PISSED,"[0, 2847, 372, 7, 283, 124, 7, 127, 18344, 8, 465, 14, 127, 25537, 37060, 17770, 127, 383, 13, 162, 4, 1336, 4045, 4, 849, 29, 9636, 16836, 849, 510, 17588, 1691, 2]",33,0.07071
1,3,If jean howie my neighbour is at my mums wedding it will just make the whole day cause she really likes me #sarcasm,"[0, 1106, 1236, 12001, 141, 324, 127, 14915, 16, 23, 127, 475, 8014, 3312, 24, 40, 95, 146, 5, 1086, 183, 1303, 79, 269, 3829, 162, 849, 29, 9636, 16836, 2]",31,0.402885
2,2,"@KTHopkins @MissKatiePrice LOL@ katie hopkins u can talk, shagging married men is your forté isn't it? Great person 2 point the finger #NOT","[0, 1039, 530, 3732, 1517, 7327, 787, 22885, 27029, 324, 36677, 39687, 1039, 449, 415, 324, 13591, 7327, 1717, 64, 1067, 6, 1481, 12771, 2997, 604, 16, 110, 15016, 1140, 965, 75, 24, 116, 2860, 621, 132, 477, 5, 8411, 849, 37049, 2]",43,0.324998
3,2,"@stuarteagle QPR? They looked terrible yesterday. Ferdinand, what a player #Not","[0, 1039, 620, 41962, 242, 21851, 1209, 4454, 116, 252, 1415, 6587, 2350, 4, 28855, 6, 99, 10, 869, 849, 7199, 2]",22,0.228498
4,5,"Next! Jamie Foxx ft. 2 Chainz ""Party Ain't a Party"" - Tune in and Tweet us #HOT or #NOT!!","[0, 19192, 328, 6541, 2063, 1178, 16935, 4, 132, 18610, 329, 22, 38210, 32431, 75, 10, 1643, 113, 111, 27879, 11, 8, 12244, 201, 849, 725, 3293, 50, 849, 37049, 12846, 2]",32,0.295919


In [121]:
import numpy as np
preds_valid = np.array(preds_valid)
preds_valid = preds_valid - 5

In [122]:

my_model_test_outputs = pd.DataFrame()
my_model_test_outputs['id'] = ids
my_model_test_outputs['output'] = preds_valid

In [123]:
my_model_test_outputs.head()

Unnamed: 0,id,output
0,537651335752323073,-3
1,538332513408937986,-2
2,538050779824025600,-2
3,538017499724279809,-2
4,537958766910926848,0


In [124]:
my_model_test_outputs.to_csv("exact_try.tsv", sep="\t", index=False)    #got cos = 0.82

## MSE

In [125]:
mse = nn.MSELoss()
input = torch.tensor(preds_valid)
target = torch.tensor((df_test['label']-5).to_numpy())
output = mse(input.float(), target.float())
output

tensor(2.5365)

In [113]:
COMBO_PATH = "C:/Users/Dennis/Desktop/Distillation/novelty saves"

torch.save (roberta_model,f'{COMBO_PATH}/roberta_model')
torch.save(model_sentiment.state_dict(),f'{COMBO_PATH}/model_sentiment_state_dict')
torch.save(optimizer.state_dict(),f'{COMBO_PATH}/optimizer_state_dict')
torch.save (model_sentiment,f'{COMBO_PATH}/model_sentiment')
torch.save (optimizer,f'{COMBO_PATH}/optimizer')
torch.save (learner,f'{COMBO_PATH}/learner')