In [None]:
# Abhinav Gupta Test

In [74]:
# packages required for new environment
! pip install pandas
! pip install nltk
! pip install datasets
! pip install tqdm
! pip install ipywidgets
! pip install Pillow



In [75]:
#Libraries
import pandas as pd
import datasets
import nltk

# Dataset 1

In [80]:
# 1. tiny-imagenet
#   (a) Dataset Summary - Tiny ImageNet contains 100,000+ images of 200 classes (500 for each class) downsized to 64×64 colored images. Each class has 500 training images, 50 validation images, and 50 test images.
#   (b) Data Feature Dimensions -
#       i. Image: A PIL.Image.Image object containing the image.
#       ii. Label: an int classification label. -1 for the test set as the labels are missing. Check classes.py for the map of numbers and labels.

In [77]:
from datasets import load_dataset

tiny_imagenet_train = pd.DataFrame(load_dataset("zh-plus/tiny-imagenet", split="train"))
tiny_imagenet_valid = pd.DataFrame(load_dataset("zh-plus/tiny-imagenet", split="valid"))
print(tiny_imagenet_train.label.value_counts())
print(tiny_imagenet_valid.label.value_counts())
# Note - need to figure out test set because test split for tiny imagenet is not provided

label
0      500
137    500
127    500
128    500
129    500
      ... 
69     500
70     500
71     500
72     500
199    500
Name: count, Length: 200, dtype: int64
label
0      50
137    50
127    50
128    50
129    50
       ..
69     50
70     50
71     50
72     50
199    50
Name: count, Length: 200, dtype: int64


# Regularization Techniques for Dataset 1

In [78]:
# 1. L2 Regularization - modifies the loss function. Applied to both datasets.
# 2. Data Augmentation - modifies the data. For Dataset1 we plan to use RandomErasing
#              - RandomErasing is concerned about removing and randomly adding information on the
#               blank space, such as noise. For Dataset2 we plan to use Random Synonym Replacement -
#               Random Synonym Replacement is concerned about removing and replacing with a synonym.
# 3. MaxDropout - modifies training approach. Applied to both datasets.
# 4. Ensemble Regularization 1 - applying RandomErasing and MaxDropout together. Applied to Dataset1.

# Dataset 2

In [79]:
# 2. nltk-brown + nltk-treebank + nltk-conll2000
#   (a) Dataset Summary - The combination of these 3 datasets gives us a large corpus of
#                       textual data that can be used for training a model that performs sequence labeling with
#                       a total size of 72,000+ tagged sentences. The nltk library takes the base dataset and
#                       performs tokenization to prepare it for the task of sequence labeling.
#   (b) Data Feature Dimensions -
#         i. Input Sequence - A sentence in english.
#        ii. Output Sequence - POS tags of each word of the sentence.

In [80]:
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000
nltk.download('brown')
nltk.download('treebank')
nltk.download('conll2000')
nltk.download('universal_tagset')


[nltk_data] Downloading package brown to
[nltk_data]     /Users/hrishikesh/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     /Users/hrishikesh/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/hrishikesh/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/hrishikesh/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [81]:
treebank = treebank.tagged_sents(tagset='universal')
brown = brown.tagged_sents(tagset='universal')
conll2000 = conll2000.tagged_sents(tagset='universal')
print(treebank[0])
print(brown[0])
print(conll2000[0])

[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')]
[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')]
[('Confidence', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('pound', 'NOUN'), ('is', 'VERB'), ('widely', 'ADV'), ('expected', 'VERB'), ('to', 'PRT'), ('take', 'VERB'), ('another', 'DET'), ('sharp', 'ADJ'), ('dive', 'NOUN'), ('if

In [82]:
def create_dataset(data):
    sentences = []
    pos_tags = []

    for sequence in data:
        sentence = []
        tags = []
        for seq in sequence:         
            sentence.append(seq[0])
            tags.append(seq[1])
            
        sentences.append(sentence)
        pos_tags.append(tags)
    return pd.DataFrame(zip(sentences, pos_tags), columns=["sentences", "pos_tags"])

In [83]:
corpus_dataset = create_dataset(treebank + brown + conll2000)
corpus_dataset.head(5)

Unnamed: 0,sentences,pos_tags
0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NOUN, NOUN, ., NUM, NOUN, ADJ, ., VERB, VERB,..."
1,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[NOUN, NOUN, VERB, NOUN, ADP, NOUN, NOUN, ., D..."
2,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[NOUN, NOUN, ., NUM, NOUN, ADJ, CONJ, ADJ, NOU..."
3,"[A, form, of, asbestos, once, used, *, *, to, ...","[DET, NOUN, ADP, NOUN, ADV, VERB, X, X, PRT, V..."
4,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DET, NOUN, NOUN, ., NOUN, ., VERB, ADV, ADJ, ..."


# Regularization Techniques for Dataset 2

In [84]:
# 1. L2 Regularization - modifies the loss function. Applied to both datasets.
# 2. Data Augmentation - modifies the data. For Dataset1 we plan to use RandomErasing
#       - RandomErasing is concerned about removing and randomly adding information on the
#         blank space, such as noise. For Dataset2 we plan to use Random Synonym Replacement -
#         Random Synonym Replacement is concerned about removing and replacing with a synonym.
# 3. MaxDropout - modifies training approach. Applied to both datasets.
# 4. Ensemble Regularization 2 - applying RandomSynonymReplacement and MaxDropout together. Applied to Dataset2.