In [1]:
from datasets import load_dataset

dataset = load_dataset("imdb")
dataset

Found cached dataset imdb (/home/coartix/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [18]:
import pandas as pd
from string import punctuation

In [22]:
pd.set_option('display.max_colwidth', 100)
df_train = pd.DataFrame(dataset['train'])
df_test = pd.DataFrame(dataset['test'])
df_unsupervised = pd.DataFrame(dataset['unsupervised'])
labels = df_train["label"].unique()

In [23]:
def preprocessingString(text: str) -> str:
    '''
        Preprocessing string
        Input:
            text: string
        Output:
            text: string
    '''
    text = text.lower().replace("<br />", " ")
    for punct in punctuation:
        if (not punct in str("-")):
            text = text.replace(punct, " ")
    return text

In [25]:
def parseLexicon(lexiconFile):
    lexicon = {}
    with open(lexiconFile, 'r') as f:
        for line in f:
            word, score = line.split('\t')[:2]
            if float(score) >= 1:
                lexicon[word] = 1
            elif float(score) <= -1:
                lexicon[word] = 0
    return lexicon

lexicon = parseLexicon('vader_lexicon.txt')
lexicon

{'$:': 0,
 '%-)': 0,
 "( '}{' )": 1,
 "('-:": 1,
 "(':": 1,
 '((-:': 1,
 '(*': 1,
 '(-*': 1,
 '(-:': 1,
 '(-:0': 1,
 '(-:o': 1,
 '(-:O': 1,
 '(-:|>*': 1,
 '(-;': 1,
 '(-;|': 1,
 '(8': 1,
 '(:': 1,
 '(:0': 1,
 '(:o': 1,
 '(:O': 1,
 '(;': 1,
 '(=': 1,
 '(?:': 1,
 '(^:': 1,
 '(^;': 1,
 '(^;0': 1,
 '(^;o': 1,
 '(o:': 1,
 ")':": 0,
 ")-':": 0,
 ')-:': 0,
 ')-:<': 0,
 ')-:{': 0,
 '):': 0,
 '):<': 0,
 '):{': 0,
 ');<': 0,
 '*-:': 1,
 '*-;': 1,
 '*:': 1,
 '*<|:-)': 1,
 '*\\0/*': 1,
 '*^:': 1,
 ',-:': 1,
 "---'-;-{@": 1,
 '--<--<@': 1,
 '.-:': 0,
 '..###-:': 0,
 '..###:': 0,
 '/-:': 0,
 '/:': 0,
 '/:<': 0,
 '/^:': 0,
 '/o:': 0,
 '0-|': 0,
 '0:)': 1,
 '0:-)': 1,
 '0:-3': 1,
 '0:03': 1,
 '0;^)': 1,
 '10q': 1,
 '1337': 1,
 '143': 1,
 '1432': 1,
 '14aa41': 1,
 '182': 0,
 '187': 0,
 '2g2b4g': 1,
 '2qt': 1,
 '3:(': 0,
 '3:-(': 0,
 '3:-)': 0,
 '4col': 0,
 '4q': 0,
 '5fs': 1,
 '8)': 1,
 '8-d': 1,
 '86': 0,
 '8d': 1,
 ':###..': 0,
 ":'(": 0,
 ":')": 1,
 ":'-(": 0,
 ":'-)": 1,
 ':(': 0,
 ':)': 1,
 ':*': 

In [26]:
import math

negateWords = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
     "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
     "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
     "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
     "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
     "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
     "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
     "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]

def getFeatures(text, lexicon):
    features = []
    features.append(1 if "no" in text else 0)
    features.append(text.count("i") + text.count('you'))
    features.append(1 if "!" in text else 0)
    features.append(math.log(len(text.split())))
    nb_pos = 0
    nb_neg = 0
    nb_negate = 0
    for word in text.split():
        if word in lexicon:
            if lexicon[word] == 1:
                nb_pos += 1
            else:
                nb_neg += 1
        if word in negateWords:
            nb_negate += 1
    features.append(nb_pos)
    features.append(nb_neg)
    #Bonus
    features.append(nb_negate)
    return features

getFeatures("blind date columbia pictures 1934 was a aint decent film but i have a few issues with this film first of all i don t fault the actors in this film at all but more or less i have a problem with the script also i understand that this film was made in the 1930 s and people were looking to escape reality but the script made ann sothern s character look weak she kept going back and forth between suitors and i felt as though she should have stayed with paul kelly s character in the end he truly did care about her and her family and would have done anything for her and he did by giving her up in the end to fickle neil hamilton who in my opinion was only out for a good time paul kelly s character although a workaholic was a man of integrity and truly loved kitty ann sothern as opposed to neil hamilton while he did like her a lot i didn t see the depth of love that he had for her character the production values were great but the script could have used a little work", lexicon)

[0, 58, 0, 5.2832037287379885, 11, 4, 1]