In [1]:
drive_path = 'drive/My Drive/CSE291A/kaggle'

In [2]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 4.6MB/s 
[?25hCollecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 22.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 52.1MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1

In [3]:
import pickle
import itertools
import re
import os
import time
import nltk
import string
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import tqdm.notebook as tqdm
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
from gensim import utils, models
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AdamW
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
stop_words = stopwords.words('english')

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_punc(text):
    table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    return text.translate(table)

def remove_digits(text): 
    pattern = '[0-9]'
    text = re.sub(pattern, '', text)
    return text

def html_unescape(text):
    return html.unescape(text)

def reduce_length(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

#tokenize sentence and correct the spelling
def token_n_spellcheck(text):
    words = word_tokenize(text)
    reduced_text  = [reduce_length(word) for word in words]
    stemmer = SnowballStemmer("english")
    stem_text = [stemmer.stem(word) for word in reduced_text if word not in stop_words]

    return stem_text

# the pipeline function for text cleaning
def text_clean(text):
    text = text.lower()
    text = remove_URL(text)
    text = remove_html(text)
    text = remove_digits(text)
    text = remove_punc(text)
    words = token_n_spellcheck(text)
    return words

In [6]:
label2ind = {'american (new)': 0,
             'american (traditional)': 1,
             'asian fusion': 2,
             'canadian (new)': 3,
             'chinese': 4,
             'italian': 5,
             'japanese': 6,
             'mediterranean': 7,
             'mexican': 8,
             'thai': 9}
ind2label = {ind: name for name, ind in label2ind.items()}

data_path = os.path.join(drive_path, 'Data')

df_train = pd.read_csv(os.path.join(data_path, 'train.csv'))
df_test = pd.read_csv(os.path.join(data_path, 'test.csv'))

df_train['text'] = df_train['review']
df_test['text'] = df_test['review']

train_docs = [text_clean(doc) for doc in df_train['text']]
train_labels = [label2ind[name] for name in df_train['label']]
test_docs = [text_clean(doc) for doc in df_test['text']]

In [48]:
X_train = [' '.join(doc) for doc in train_docs]
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_train_emb = tokenizer(X_train, max_length=400, truncation=True, padding='max_length')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [59]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(X_train_emb, train_labels)

In [61]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=10)
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in tqdm.tqdm(range(5)):
    for batch in tqdm.tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        print (loss.data.cpu().numpy())
        loss.backward()
        optim.step()

model.eval()

model.save_pretrained(os.path.join(drive_path, 'finetuned_models'))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=822.0), HTML(value='')))

2.1989748
2.5748928
2.2180269
2.2882204
2.2170558
2.1779637
2.4199426
2.2792664
2.2120595
2.199869
2.185794
2.1968222
2.06965
2.0937636
2.2996745
2.3268623
2.1460786
2.1410315
2.3017182
2.1829557
2.0799155
2.1705554
2.064854
2.103933
2.2197216
2.197012
2.1572652
2.3573656
1.9815193
2.1768765
1.9851705
2.297097
2.1533425
1.9500157
1.9658244
2.1305041
2.1305168
2.135288
2.554948
2.1542914
2.0417786
1.9033307
2.172828
2.2347293
2.048949
1.9433818
2.177347
2.0856733
2.1574025
1.9356805
2.1735451
2.1085699
2.1560125
2.1610081
2.1073802
2.1519425
2.0883613
2.0851007
2.1246047
2.173697
2.408918
2.2089157
2.1086917
2.2672968
2.0050008
2.2712538
1.9624896
2.1510813
2.0054762
2.1472151
1.8827766
2.0528903
2.1366065
2.1727166
2.1798217
2.1261377
2.1143148
2.083239
2.2898316
2.162964
2.0441666
2.1041586
1.980486
1.9870085
2.2659159
2.2076259
1.9828482
2.2860036
2.0213728
2.0893905
2.0564744
1.8573906
2.209079
2.1043336
1.8694698
2.1258972
1.9235437
1.8220596
2.227104
2.3356466
2.0542367
1.9124506


HBox(children=(FloatProgress(value=0.0, max=822.0), HTML(value='')))

0.7454518
0.46855563
0.5548513
0.8362383
0.2936845
0.30242133
0.25033617
0.9357545
0.34422514
0.42841005
0.3200225
1.1187217
0.4119235
1.5383288
1.0368505
0.87722296
0.6318321
0.6578948
0.52778584
0.51064175
0.72528666
1.0349779
0.90428597
0.54714495
0.77162594
0.6446375
0.71766233
0.38341317
0.6895982
0.32177165
0.30249542
0.6385205
0.646574
0.44622242
0.6836249
0.59726894
0.48420373
0.4968291
0.79349583
0.65444654
0.85609233
0.64101285
0.47742364
0.62036437
0.38581795
0.61271554
0.8213922
0.54090625
0.5318576
0.38671836
0.6002632
0.73454094
0.25803503
0.93710965
1.1752175
0.67692417
0.6284657
0.4274125
0.9511967
0.5747295
0.8738112
1.007909
1.2556001
0.8271838
0.80522966
0.55251324
0.39953917
0.4788799
0.38284248
0.8357914
0.3046379
0.59341604
0.6270765
0.5173999
0.4495425
0.86364424
0.97073036
0.39061642
0.57214427
0.5757231
0.7293133
0.79609144
0.83587754
0.68183815
0.5836227
0.34639308
0.5107642
0.35031292
0.9544096
0.76234746
1.2772685
0.41299897
0.79968274
0.515108
0.3194165
1.1

HBox(children=(FloatProgress(value=0.0, max=822.0), HTML(value='')))

0.60926497
0.48790887
0.2374809
0.54523766
0.41636044
0.55931926
0.38962618
0.7026671
0.5730678
0.23795915
0.5583066
0.31204283
0.34979755
0.55922043
0.4555128
0.57098174
0.46287543
0.32249898
0.68784523
0.5175723
0.7130109
0.6272559
0.41682103
0.48041874
0.7129172
0.39974326
0.5417552
0.2611337
0.42729896
0.64916295
0.40180257
0.44584998
0.4432682
0.47060388
0.31142995
0.38177466
0.3247301
0.28707838
0.31679192
0.5047125
0.19427708
0.23996304
0.5946475
0.70958316
0.42410845
0.36129522
0.9831602
0.32967278
0.46227375
0.3856691
0.4319176
0.322929
0.28660238
0.44957647
0.5238036
0.3563679
0.53096175
0.2841939
0.616763
0.21434863
0.25248423
0.7982718
1.1218276
0.16992879
0.32381022
0.30299026
0.87482977
0.46944213
0.5012348
0.7007489
0.39704192
0.70300627
0.64656955
0.43728885
0.57772124
0.43245643
0.64103645
0.48850918
0.82260644
0.6900224
0.6224555
0.628151
0.487081
0.33086893
0.21485682
0.8808301
0.3482782
0.46113363
0.3435907
0.4813869
0.3736678
0.80953294
0.53457016
0.36386216
0.1550

HBox(children=(FloatProgress(value=0.0, max=822.0), HTML(value='')))

0.3717734
0.16605099
0.25578684
0.40705442
1.0635207
0.2775807
0.3458522
0.6907343
0.2623925
0.3548687
0.3058762
0.37138197
0.30686042
0.27531904
0.5777905
0.3194645
0.43966013
0.5693595
0.3402917
0.40803045
0.13258441
0.7007983
0.2169528
0.2747464
0.17283341
0.18873842
0.44723034
0.6003322
0.35089645
0.14753258
0.56706357
0.63623023
0.5557006
0.41773042
0.37575918
0.8934717
0.90054005
0.16523424
0.37996218
0.49934196
0.14318839
0.28232586
0.82066214
0.47124198
0.23310694
0.14926751
0.35393542
0.27573037
0.22415176
0.26260993
0.09005853
0.45061043
0.3627276
0.16317248
0.18467562
0.41039455
0.1444507
0.5443894
0.21793999
0.6334407
0.110206336
0.35877824
0.85555655
0.57815343
0.38707966
0.4148308
0.912637
0.5161525
0.0877906
0.7196961
0.25755787
0.32105532
0.39403716
0.16085084
0.06832553
0.5816737
0.20416021
0.21416391
0.5112302
0.30227163
0.47254756
0.8296356
0.8620707
0.36443955
0.41879156
0.4316747
0.43011063
0.16754141
0.6653446
0.105496235
0.57275754
0.76639944
0.35171404
0.3591030

HBox(children=(FloatProgress(value=0.0, max=822.0), HTML(value='')))

0.73005533
0.26735827
0.09224359
0.36742625
0.30417952
0.23322932
0.6129324
0.1637976
0.25744265
0.3550655
0.23618457
0.2692643
0.44195598
0.4134898
0.2137643
0.6076094
0.093312144
0.19681585
0.47892678
0.43873698
0.34311277
0.4259787
0.39347562
0.47767293
0.61442554
0.61249447
0.23112616
0.56008846
0.33129522
0.27353942
0.32033294
0.10115486
0.19036861
0.35739112
0.37730473
0.51666754
0.5927552
0.16456191
0.26048005
0.154973
0.17369553
1.087519
0.30915773
0.34851417
0.27821237
0.09638627
0.6487431
0.8119139
0.18483402
0.3318303
0.22342557
0.08966512
0.26219264
0.5969016
0.5109953
0.6364211
0.7209879
0.20952536
0.5237096
0.19921498
0.40077677
0.47064206
0.29144847
0.16331628
0.35302034
0.29419845
0.17010777
0.52849233
0.22387756
0.21119519
0.14566585
0.83003193
0.6018226
0.56356376
0.5481811
0.14135031
0.3129254
0.1774967
0.46158367
0.47550863
0.17205134
0.37737915
0.15260953
0.07676764
0.5838096
0.5461337
0.26146317
0.26813054
0.48773223
0.21688661
0.16462897
0.39982498
0.32427868
0.1

In [73]:
X_test = [' '.join(doc) for doc in test_docs]
X_test_emb = tokenizer(X_test, max_length=400, truncation=True, padding='max_length')
test_dataset = TextDataset(X_test_emb, np.zeros(len(X_test_emb['input_ids'])))


test_loader = DataLoader(test_dataset, batch_size=4)
y_pre = list()
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    pre = np.argmax(outputs[0].data.cpu().numpy(), axis=1)
    y_pre.extend(pre)


In [74]:
dic = {"Id": [], "Predicted": []}
for i, pred in enumerate(y_pre):
    dic["Id"].append(i)
    dic["Predicted"].append(ind2label[pred])

In [75]:
dic_df = pd.DataFrame.from_dict(dic)
dic_df.to_csv(os.path.join(data_path, 'predicted-bert-finetuned.csv'), index=False)

In [5]:
class CenterLoss(nn.Module):
    """Center loss.
    
    Reference:
    Wen et al. A Discriminative Feature Learning Approach for Deep Face Recognition. ECCV 2016.
    
    Args:
        num_classes (int): number of classes.
        feat_dim (int): feature dimension.
    """
    def __init__(self, num_classes=10, feat_dim=2, use_gpu=True):
        super(CenterLoss, self).__init__()
        self.num_classes = num_classes
        self.feat_dim = feat_dim
        self.use_gpu = use_gpu

        if self.use_gpu:
            self.centers = nn.Parameter(torch.randn(self.num_classes, self.feat_dim).cuda())
        else:
            self.centers = nn.Parameter(torch.randn(self.num_classes, self.feat_dim))

    def forward(self, x, labels):
        """
        Args:
            x: feature matrix with shape (batch_size, feat_dim).
            labels: ground truth labels with shape (batch_size).
        """
        batch_size = x.size(0)
        distmat = torch.pow(x, 2).sum(dim=1, keepdim=True).expand(batch_size, self.num_classes) + \
                  torch.pow(self.centers, 2).sum(dim=1, keepdim=True).expand(self.num_classes, batch_size).t()
        distmat.addmm_(1, -2, x, self.centers.t())

        classes = torch.arange(self.num_classes).long()
        if self.use_gpu: 
            classes = classes.cuda()
        labels = labels.unsqueeze(1).expand(batch_size, self.num_classes)
        mask = labels.eq(classes.expand(batch_size, self.num_classes))

        dist = distmat * mask.float()
        loss = dist.clamp(min=1e-12, max=1e+12).sum() / batch_size

        return loss

In [6]:
center_loss = CenterLoss(num_classes=10, feat_dim=2, use_gpu=True)


In [4]:
model = BertForSequenceClassification.from_pretrained(os.path.join(drive_path, 'finetuned_models'), num_labels=10)
