In [1]:

from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("sst", "default") #load dataset 
#Familiarize with the dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 2210
    })
})


In [3]:
#print first example of the training set
print(dataset['train'][:1])


{'sentence': ["The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal ."], 'label': [0.6944400072097778], 'tokens': ["The|Rock|is|destined|to|be|the|21st|Century|'s|new|``|Conan|''|and|that|he|'s|going|to|make|a|splash|even|greater|than|Arnold|Schwarzenegger|,|Jean-Claud|Van|Damme|or|Steven|Segal|."], 'tree': ['70|70|68|67|63|62|61|60|58|58|57|56|56|64|65|55|54|53|52|51|49|47|47|46|46|45|40|40|41|39|38|38|43|37|37|69|44|39|42|41|42|43|44|45|50|48|48|49|50|51|52|53|54|55|66|57|59|59|60|61|62|63|64|65|66|67|68|69|71|71|0']}


In [38]:
#splitting the dataset into training, val, and testing sets
train = {
    'sentence': dataset['train']['sentence'],
    'label': dataset['train']['label'],
    'tokens': dataset['train']['tokens']
}
val = {
    'sentence': dataset['validation']['sentence'],
    'label': dataset['validation']['label'],
    'tokens': dataset['validation']['tokens']
}
test = {
    'sentence': dataset['test']['sentence'],
    'label': dataset['test']['label'],
    'tokens': dataset['test']['tokens']
}

print(train['tokens'][:5])
print(val['tokens'][:5])
print(test['tokens'][:5])


["The|Rock|is|destined|to|be|the|21st|Century|'s|new|``|Conan|''|and|that|he|'s|going|to|make|a|splash|even|greater|than|Arnold|Schwarzenegger|,|Jean-Claud|Van|Damme|or|Steven|Segal|.", "The|gorgeously|elaborate|continuation|of|``|The|Lord|of|the|Rings|''|trilogy|is|so|huge|that|a|column|of|words|can|not|adequately|describe|co-writer\\/director|Peter|Jackson|'s|expanded|vision|of|J.R.R.|Tolkien|'s|Middle-earth|.", 'Singer\\/composer|Bryan|Adams|contributes|a|slew|of|songs|--|a|few|potential|hits|,|a|few|more|simply|intrusive|to|the|story|--|but|the|whole|package|certainly|captures|the|intended|,|er|,|spirit|of|the|piece|.', "You|'d|think|by|now|America|would|have|had|enough|of|plucky|British|eccentrics|with|hearts|of|gold|.", 'Yet|the|act|is|still|charming|here|.']
["It|'s|a|lovely|film|with|lovely|performances|by|Buy|and|Accorsi|.", 'No|one|goes|unindicted|here|,|which|is|probably|for|the|best|.', "And|if|you|'re|not|nearly|moved|to|tears|by|a|couple|of|scenes|,|you|'ve|got|ice|water|

### Data Preprocessing

In [5]:
#separating the data into classes
print(train['label'][0])
for i in range(len(train['sentence'])):
    if train['label'][i] >= 0 and train['label'][i] <= 0.2:
        train['label'][i] = 0
    elif train['label'][i] > 0.2 and train['label'][i] <= 0.4:
        train['label'][i] = 1
    elif train['label'][i] > 0.4 and train['label'][i] <= 0.6:
        train['label'][i] = 2
    elif train['label'][i] > 0.6 and train['label'][i] <= 0.8:
        train['label'][i] = 3
    elif train['label'][i] > 0.8 and train['label'][i] <= 1:
        train['label'][i] = 4
print(train['label'][0])    

0.6944400072097778
3


### Logisitc Regression

#### *Feature Representation*

In [17]:
#bigrams in dataset each have an index
bigrams = {}
bigram_vector = []
i = 0
for sentence in train['tokens']:
    vector = []
    words = sentence.split('|')
    for w1, w2 in zip(words[:-1], words[1:]):
        if (w1, w2) not in bigrams:
            bigrams[(w1, w2)] = i
            vector.append(bigrams[(w1, w2)])
            i += 1
        else:
            vector.append(bigrams[(w1, w2)])
    bigram_vector.append(vector)

#print first 10 bigrams
count = 0
for key, value in bigrams.items():
    print(key, value)
    count += 1
    if count == 10:
        break

print(f"\nLength: {len(bigrams)}") #number of bigrams
print(bigrams)
    

('The', 'Rock') 0
('Rock', 'is') 1
('is', 'destined') 2
('destined', 'to') 3
('to', 'be') 4
('be', 'the') 5
('the', '21st') 6
('21st', 'Century') 7
('Century', "'s") 8
("'s", 'new') 9

Length: 87247


In [42]:
#explore datagram_vectors of first sentence
print(train['tokens'][:1])
print(f"{bigram_vector[0]}\n")

#last sentence
print(train['tokens'][-1])
print(f"{bigram_vector[-1]}")

["The|Rock|is|destined|to|be|the|21st|Century|'s|new|``|Conan|''|and|that|he|'s|going|to|make|a|splash|even|greater|than|Arnold|Schwarzenegger|,|Jean-Claud|Van|Damme|or|Steven|Segal|."]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]

In|this|case|zero|.
[69495, 18116, 87246, 52783]


In [56]:
print(len(train['tokens'])) #number of sentences in the training set

print(len(bigram_vector)) #number of bigram vectors

8544
8544
<class 'list'>


In [54]:
#adjust datagram_vector to their corresponding labels
list_tuples = list(zip(bigram_vector, train['label']))

print(list_tuples)
print(list_tuples[-1])

[([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34], 0.6944400072097778), ([35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70], 0.833329975605011), ([71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 80, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 43, 105, 106], 0.625), ([107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124], 0.5), ([125, 126, 127, 128, 129, 130, 131], 0.7222200036048889), ([132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 13, 148, 145, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160], 0.833329975605011), ([161, 162, 163, 164, 165, 166, 167, 168, 169, 43, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180], 0.875), ([181, 43, 182, 183, 184, 185, 186, 187, 188, 189, 190, 19

#### *Algorithm Implementation:*

In [74]:
#convert bigram to vector numpy 
def get_input_vector(bigram_vector, len_bigram_corpus):
    input_vector = np.zeros(len_bigram_corpus + 1)
    input_vector[0] = 1
    for bigram in bigram_vector:
        input_vector[bigram + 1] = 1
    return input_vector

In [76]:
#check the function
print(get_input_vector(bigram_vector[0], len(bigrams)))
print(get_input_vector(bigram_vector[-1], len(bigrams)).shape)

[1. 1. 1. ... 0. 0. 0.]
(87248,)


In [79]:
#softmax function
def softmax(x):
    return (np.exp(x - np.max(x)) / np.sum(np.exp(x - max(x)), axis=1, keepdims=True))

