In [38]:

from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt

In [39]:
# Load the "default" configuration of the "sst" dataset.
dataset = load_dataset(path="sst", name="default")
# Show the splits with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 2210
    })
})


In [40]:
# Peek at the first training record. Slicing with [:1] yields a dict that maps
# each column name to a one-element list.
first_rows = dataset['train'][:1]
print(first_rows)


{'sentence': ["The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal ."], 'label': [0.6944400072097778], 'tokens': ["The|Rock|is|destined|to|be|the|21st|Century|'s|new|``|Conan|''|and|that|he|'s|going|to|make|a|splash|even|greater|than|Arnold|Schwarzenegger|,|Jean-Claud|Van|Damme|or|Steven|Segal|."], 'tree': ['70|70|68|67|63|62|61|60|58|58|57|56|56|64|65|55|54|53|52|51|49|47|47|46|46|45|40|40|41|39|38|38|43|37|37|69|44|39|42|41|42|43|44|45|50|48|48|49|50|51|52|53|54|55|66|57|59|59|60|61|62|63|64|65|66|67|68|69|71|71|0']}


In [41]:
# Copy the columns used below out of each split into plain dictionaries.
SPLIT_COLUMNS = ('sentence', 'label', 'tokens')


def extract_columns(split, columns=SPLIT_COLUMNS):
    """Return ``{column name: list of values}`` for one dataset split.

    Args:
        split: one split of ``dataset`` (e.g. ``dataset['train']``), indexable
            by column name.
        columns: names of the columns to copy out.

    Returns:
        A dict with one entry per requested column. Every column is
        materialised with ``list(...)`` so the caller owns a plain, mutable
        Python list; the label-binning step relies on item assignment
        (``train['label'][i] = ...``).
    """
    return {column: list(split[column]) for column in columns}


train = extract_columns(dataset['train'])
val = extract_columns(dataset['validation'])
test = extract_columns(dataset['test'])

# Sanity check: tokens are stored as one '|'-separated string per sentence.
print(train['tokens'][:5])
print(val['tokens'][:5])
print(test['tokens'][:5])


["The|Rock|is|destined|to|be|the|21st|Century|'s|new|``|Conan|''|and|that|he|'s|going|to|make|a|splash|even|greater|than|Arnold|Schwarzenegger|,|Jean-Claud|Van|Damme|or|Steven|Segal|.", "The|gorgeously|elaborate|continuation|of|``|The|Lord|of|the|Rings|''|trilogy|is|so|huge|that|a|column|of|words|can|not|adequately|describe|co-writer\\/director|Peter|Jackson|'s|expanded|vision|of|J.R.R.|Tolkien|'s|Middle-earth|.", 'Singer\\/composer|Bryan|Adams|contributes|a|slew|of|songs|--|a|few|potential|hits|,|a|few|more|simply|intrusive|to|the|story|--|but|the|whole|package|certainly|captures|the|intended|,|er|,|spirit|of|the|piece|.', "You|'d|think|by|now|America|would|have|had|enough|of|plucky|British|eccentrics|with|hearts|of|gold|.", 'Yet|the|act|is|still|charming|here|.']
["It|'s|a|lovely|film|with|lovely|performances|by|Buy|and|Accorsi|.", 'No|one|goes|unindicted|here|,|which|is|probably|for|the|best|.', "And|if|you|'re|not|nearly|moved|to|tears|by|a|couple|of|scenes|,|you|'ve|got|ice|water|

### Data Preprocessing

In [42]:
#separating the data into classes
# Inclusive upper score bound of each sentiment class; the position in the
# tuple is the class id: [0, 0.2] -> 0, (0.2, 0.4] -> 1, (0.4, 0.6] -> 2,
# (0.6, 0.8] -> 3, (0.8, 1] -> 4.
CLASS_UPPER_BOUNDS = (0.2, 0.4, 0.6, 0.8, 1.0)


def score_to_class(score):
    """Map a sentiment score in [0, 1] to its class id (0-4).

    Args:
        score: a float score, or an ``int`` class id produced by an earlier
            run of this cell.

    Returns:
        The class id. ``int`` inputs are returned unchanged so that re-running
        the cell is harmless; without this, an already-binned class ``1``
        would fall into the ``(0.8, 1]`` range and silently turn into ``4``.

    Raises:
        ValueError: if a non-int score lies outside [0, 1] (including NaN),
            instead of silently leaving a raw float among the class ids.
    """
    if isinstance(score, int):
        # Already converted (raw scores are floats, see the first print below).
        return score
    for class_id, upper_bound in enumerate(CLASS_UPPER_BOUNDS):
        if 0 <= score <= upper_bound:
            return class_id
    raise ValueError(f"sentiment score out of range [0, 1]: {score!r}")


# NOTE(review): only the training labels are converted in this cell; val and
# test still hold raw scores afterwards -- confirm they are converted before
# they are used for evaluation.
print(train['label'][0])
train['label'] = [score_to_class(score) for score in train['label']]
print(train['label'][0])

0.6944400072097778
3


### Logistic Regression

#### *Feature Representation*

In [44]:
# Count how often each pair of adjacent tokens occurs in the training sentences.
bigrams = {}
for sentence in train['tokens']:
    tokens = sentence.split('|')
    for position in range(len(tokens) - 1):
        pair = (tokens[position], tokens[position + 1])
        bigrams[pair] = bigrams.get(pair, 0) + 1

# Preview: the first ten bigrams in insertion order, with their counts.
for pair in list(bigrams)[:10]:
    print(pair, bigrams[pair])

# Number of distinct bigrams.
print(f"\nLength: {len(bigrams)}")
    

('The', 'Rock') 8
('Rock', 'is') 1
('is', 'destined') 1
('destined', 'to') 4
('to', 'be') 243
('be', 'the') 28
('the', '21st') 3
('21st', 'Century') 2
('Century', "'s") 1
("'s", 'new') 4

Length: 87247


In [45]:
# Frequencies of the bigrams ('a', 'solid') and ('21st', 'Century'), in that order.
for pair in (('a', 'solid'), ('21st', 'Century')):
    print(bigrams[pair])

# Number of sentences in the training set.
print(len(train['tokens']))

14
2
8544


In [None]:
#