## Text Vectorization

### Lemmatization

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer  = WordNetLemmatizer()

In [2]:
lemmatizer?

In [7]:
lemmatizer.lemmatize('heavens')

'heaven'

In [10]:
text = "All models are wrong, but some are useful."

tokens = word_tokenize(text.lower())

lemmas = [lemmatizer.lemmatize(token) for token in tokens]

In [11]:
print(lemmas)

['all', 'model', 'are', 'wrong', ',', 'but', 'some', 'are', 'useful', '.']


In [12]:
" ".join(lemmas)

'all model are wrong , but some are useful .'

In [14]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

doc = nlp(text.lower())

lemmas = [token.lemma_ for token in doc]

print(" ".join(lemmas))

ModuleNotFoundError: No module named 'spacy'

In [None]:
#lemmatization using spaCy
import pandas as pd
import random             # in order to select a random review
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

data = pd.read_csv('/datasets/imdb_reviews_small.tsv', sep='\t')
corpus = data['review']

def lemmatize(text):

    # < write code here >
    text = nlp(text.lower())
    lemmas = [token.lemma_ for token in text]
    text_lemmas = " ".join(lemmas)
    return text_lemmas

# store the review index in the review_idx variable
# either as a random number or a fixed value, e.g. 2557 
#review_idx = random.randint(0, len(corpus)-1)
review_idx = 2557

review = corpus[review_idx]

print("The original text:", review)
print()
print("The lemmatized text:", lemmatize(review))

### Regular Expressions

In [1]:
import re

In [16]:
# pattern
# substitution — what each pattern match should be substituted with
# text — the text which the function scans for pattern matches
re.sub(pattern, substitution, text)

NameError: name 'pattern' is not defined

In [18]:
print("Hello!\n")

print(r"Hello!\n")

Hello!

Hello!\n


In [None]:
# a range of letters is indicated by a hyphen:
# a-z = abcdefghijklmnopqrstuvwxyz
r"[a-zA-Z]"

In [None]:
#find apostrophes as well
r"[a-zA-Z']"

In [4]:
# review text
text = """
I liked this show from the first episode I saw, which was the "Rhapsody in Blue" episode (for those that don't know what that is, the Zan going insane and becoming pau lvl 10 ep). Best visuals and special effects I've seen on a television series, nothing like it anywhere.
"""
text_sub = re.sub(r"[^a-zA-Z']", " ", text)

In [5]:
text_split = text_sub.split()

In [6]:
" ".join(text_split)

"I liked this show from the first episode I saw which was the Rhapsody in Blue episode for those that don't know what that is the Zan going insane and becoming pau lvl ep Best visuals and special effects I've seen on a television series nothing like it anywhere"

In [20]:
text = "            I   liked   this   show   "
text.split()

['I', 'liked', 'this', 'show']

In [21]:
" ".join(['I', 'liked', 'this', 'show'])

'I liked this show'

In [None]:
import random             # in order to select a random review
import pandas as pd

import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

data = pd.read_csv('/datasets/imdb_reviews_small.tsv', sep='\t')
corpus = data['review']

def clear_text(text):
    
    # < write code here >
    text = re.sub(r"[^a-zA-Z']", " ", text)
    text = text.split()
    text = " ".join(text)
    return text

def lemmatize(text):

    doc = nlp(text.lower())
    
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
        
    return ' '.join(lemmas)

# store the review index in the review_idx variable
# either as a random number or a fixed value, e.g. 2557 
review_idx = random.randint(0, len(corpus)-1)
# review_idx = 2557

review = corpus[review_idx]

print("The original text:", review)
print()
print("The lemmatized text:", lemmatize(clear_text(review)))

In [3]:
import pandas as pd
data = pd.read_csv('datasets/imdb_reviews_small_lemm_train.tsv', sep='\t')
corpus = data['review_lemm']

In [6]:
data.sample(10)

Unnamed: 0,tconst,original_title,review,review_lemm,pos
302,tt0277615,Bug,As a producer of indie movies and a harsh crit...,as a producer of indie movie and a harsh criti...,1
1127,tt0298482,Nothing,"Usually, I know after the first minute of a mo...",usually i know after the first minute of a mov...,0
265,tt0062751,Boom,"""Boom"" has garnered itself a something of a re...",boom have garner -PRON- a something of a reput...,0
1363,tt0114287,Rob Roy,This is the moving tale of Scotland's legendar...,this be the move tale of scotland 's legendary...,1
1695,tt0116277,The Fan,What a bad movie. I'm really surprised that De...,what a bad movie -PRON- be really surprised th...,0
2021,tt0049966,Written on the Wind,"On the surface, ""Written on the Wind"" is a lur...",on the surface write on the wind be a lurid gl...,1
409,tt0115907,City Hall,I read the negative comments before viewing th...,i read the negative comment before view this f...,1
863,tt0139388,It Had to Be You,"I'd have given this film a few stars, simply b...",-PRON- 'd have give this film a few star simpl...,1
1377,tt0114287,Rob Roy,From the excellent acting of an extremely impr...,from the excellent acting of an extremely impr...,1
161,tt0493424,Attack Force,Steven Seagal....how could you be a part of su...,steven seagal how could -PRON- be a part of su...,0


In [8]:
import re
data['review_norm'] = data['review'].apply(lambda x: re.sub(r"[^a-zA-Z]", " ", x).lower())

In [10]:
data.sample(20)

Unnamed: 0,tconst,original_title,review,review_lemm,pos,review_norm
797,tt0164052,Hollow Man,A Pentagon science team seem to have perfected...,a pentagon science team seem to have perfect a...,1,a pentagon science team seem to have perfected...
578,tt0101811,Enchanted April,For anyone who's judged others at first meetin...,for anyone who be judge other at first meeting...,1,for anyone who s judged others at first meetin...
104,tt0331175,Any Way the Wind Blows,This whirling movie looks more like a combinat...,this whirl movie look more like a combination ...,0,this whirling movie looks more like a combinat...
84,tt0076085,Una giornata particolare,Una giornata particolare is a film which has m...,una giornata particolare be a film which have ...,1,una giornata particolare is a film which has m...
778,tt0164052,Hollow Man,Paul Verhoeven finally bombed out on this one...,paul verhoeven finally bomb out on this one -P...,0,paul verhoeven finally bombed out on this one...
83,tt0076085,Una giornata particolare,Ettore Scola is one of the most important Ital...,ettore scola be one of the most important ital...,1,ettore scola is one of the most important ital...
493,tt0231775,Down to Earth,The main character Lance Barton gets killed an...,the main character lance barton get kill and t...,0,the main character lance barton gets killed an...
1844,tt0074811,Le locataire,Roman Polanski masterfully directs this sort o...,roman polanski masterfully direct this sort of...,1,roman polanski masterfully directs this sort o...
858,tt0139388,It Had to Be You,"Sure, this movie is sappy and sweet and full o...",sure this movie be sappy and sweet and full of...,1,sure this movie is sappy and sweet and full o...
900,tt0190524,Left Behind,Surprisingly enough does movie does have some ...,surprisingly enough do movie do have some rede...,0,surprisingly enough does movie does have some ...


### Bag of Words

In [None]:
import spacy
from collections import Counter

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

text = """For want of a nail the shoe was lost. For want of a shoe the horse was lost. For want of a horse the rider was lost."""

doc = nlp(text)

tokens = [token.lemma_ for token in doc if not token.is_punct]

bow = Counter(tokens)

vector = [bow[token] for token in sorted(bow)]

print(vector)

### N-Grams

In [17]:
import re

def generate_ngrams(s, n):
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [18]:
s = """
    Natural-language 56 processing (NLP) is an area of
    computer 34science and 66artificial intelligence
    12concerned with the 89interactions between computers
    and human (natural)33 languages.
"""

In [19]:
generate_ngrams(s, 5)

['\n natural language processing nlp',
 'natural language processing nlp is',
 'language processing nlp is an',
 'processing nlp is an area',
 'nlp is an area of\n',
 'is an area of\n computer',
 'an area of\n computer science',
 'area of\n computer science and',
 'of\n computer science and artificial',
 'computer science and artificial intelligence\n',
 'science and artificial intelligence\n concerned',
 'and artificial intelligence\n concerned with',
 'artificial intelligence\n concerned with the',
 'intelligence\n concerned with the interactions',
 'concerned with the interactions between',
 'with the interactions between computers\n',
 'the interactions between computers\n and',
 'interactions between computers\n and human',
 'between computers\n and human natural',
 'computers\n and human natural languages',
 'and human natural languages \n']

In [29]:
# Sample sentence
s = "one two three four five"

tokens = s.split(" ")
# tokens = ["one", "two", "three", "four", "five"]
print(tokens)

sequences = [tokens[i:] for i in range(3)]
# The above will generate sequences of tokens starting
# from different elements of the list of tokens.
# The parameter in the range() function controls
# how many sequences to generate.
#
# sequences = [
#   ['one', 'two', 'three', 'four', 'five'],
#   ['two', 'three', 'four', 'five'],
#   ['three', 'four', 'five']]
print(sequences)

bigrams = zip(*sequences)
print(list(bigrams))
# The zip function takes the sequences as a list of inputs
# (using the * operator, this is equivalent to
# zip(sequences[0], sequences[1], sequences[2]).
# Each tuple it returns will contain one element from
# each of the sequences.
# 
# To inspect the content of bigrams, try:
# print(list(bigrams))
# which will give the following:
#
# [
#   ('one', 'two', 'three'),
#   ('two', 'three', 'four'),
#   ('three', 'four', 'five')
# ]
#
# Note: even though the first sequence has 5 elements,
# zip will stop after returning 3 tuples, because the
# last sequence only has 3 elements. In other words,
# the zip function automatically handles the ending of
# the n-gram generation.

['one', 'two', 'three', 'four', 'five']
[['one', 'two', 'three', 'four', 'five'], ['two', 'three', 'four', 'five'], ['three', 'four', 'five']]
[('one', 'two', 'three'), ('two', 'three', 'four'), ('three', 'four', 'five')]


### Using NLTK

In [32]:
import re
from nltk.util import ngrams

s = s.lower()
s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
tokens = [token for token in s.split(" ") if token != ""]
output = list(ngrams(tokens, 3))

In [33]:
print(output)

[('one', 'two', 'three'), ('two', 'three', 'four'), ('three', 'four', 'five')]


### Creating a bag of words

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
count_vect = CountVectorizer()

In [37]:
corpus = [
    'for want of a nail the shoe be lose',
    'for want of a shoe the horse be lose',
    'for want of a horse the rider be lose',
    'for want of a rider the message be lose',
    'for want of a message the battle be lose',
    'for want of a battle the kingdom be lose',
    'and all for the want of a horseshoe nail'
]

In [38]:
#The counter extracts unique words from the corpus and counts how many times they appear in each text of the corpus. 
#The counter doesn't count separate letters.
# bow = bag of words
bow = count_vect.fit_transform(corpus)

In [39]:
bow.shape
#16 unique words, 7 rows

(7, 16)

In [41]:
print(bow.toarray())

[[0 0 0 1 1 0 0 0 1 0 1 1 0 1 1 1]
 [0 0 0 1 1 1 0 0 1 0 0 1 0 1 1 1]
 [0 0 0 1 1 1 0 0 1 0 0 1 1 0 1 1]
 [0 0 0 1 1 0 0 0 1 1 0 1 1 0 1 1]
 [0 0 1 1 1 0 0 0 1 1 0 1 0 0 1 1]
 [0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1]
 [1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 1]]


In [42]:
#The list of unique words in the bag is called a vocabulary. 
#It's stored in the counter and can be accessed by calling the get_feature_names() method:
count_vect.get_feature_names()

['all',
 'and',
 'battle',
 'be',
 'for',
 'horse',
 'horseshoe',
 'kingdom',
 'lose',
 'message',
 'nail',
 'of',
 'rider',
 'shoe',
 'the',
 'want']

In [43]:
#bigrams
count_vect = CountVectorizer(ngram_range=(2, 2))

In [46]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [47]:
#stopwords
from nltk.corpus import stopwords

In [48]:
stop_words = set(stopwords.words('english'))

In [50]:
count_vect = CountVectorizer(stop_words=stop_words)

In [54]:
bow2 = count_vect.fit_transform(corpus)


In [53]:
print(bow2.toarray())

[[0 0 0 0 1 0 1 0 1 1]
 [0 1 0 0 1 0 0 0 1 1]
 [0 1 0 0 1 0 0 1 0 1]
 [0 0 0 0 1 1 0 1 0 1]
 [1 0 0 0 1 1 0 0 0 1]
 [1 0 0 1 1 0 0 0 0 1]
 [0 0 1 0 0 0 1 0 0 1]]


In [55]:
count_vect.get_feature_names()

['battle',
 'horse',
 'horseshoe',
 'kingdom',
 'lose',
 'message',
 'nail',
 'rider',
 'shoe',
 'want']

In [None]:
import pandas as pd

# < write code here >
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

data = pd.read_csv('/datasets/imdb_reviews_small_lemm.tsv', sep='\t')
corpus = data['review_lemm']

# create a bag-of-words without checking for stop words
# < write code here >
count_vect = CountVectorizer()
bow = count_vect.fit_transform(corpus)
print("The BoW size with stop words:", bow.shape)

# create a bag-of-words with checking for stop words
# < write code here >
stop_words = set(stopwords.words('english'))
count_vect = CountVectorizer(stop_words=stop_words)
bow = count_vect.fit_transform(corpus)

print("The BoW size without stop words:", bow.shape)

In [None]:
import pandas as pd

# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer


data = pd.read_csv('/datasets/imdb_reviews_small_lemm.tsv', sep='\t')
corpus = data['review_lemm']

# create an n-gram with n=2 and store it in the n_gram variable

# < write code here >
count_vect = CountVectorizer(ngram_range=(2, 2))
n_gram = count_vect.fit_transform(corpus)

print("The size of 2-gram:", n_gram.shape)

### TF-IDF in sklearn

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
stop_words = set(stopwords.words('english'))
count_tf_idf = TfidfVectorizer(stop_words=stop_words)

In [58]:
tf_idf = count_tf_idf.fit_transform(corpus)

In [20]:
import pandas as pd

from nltk.corpus import stopwords as nltk_stopwords

# import TfidfVectorizer
# < write code here >
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.read_csv('datasets/imdb_reviews_small_lemm_train.tsv', sep='\t')
corpus = data['review_lemm']

stop_words = set(nltk_stopwords.words('english'))
count_tf_idf = TfidfVectorizer(stop_words=stop_words)

tf_idf = count_tf_idf.fit_transform(corpus)
# < write code here >

print("The TF-IDF matrix size:", tf_idf.shape)

The TF-IDF matrix size: (2027, 18036)


In [21]:
corpus.shape

(2027,)

In [2]:
import pandas as pd

from nltk.corpus import stopwords as nltk_stopwords

# import TfidfVectorizer
# < write code here >
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.read_csv('datasets/imdb_reviews_small_lemm_train.tsv', sep='\t')
corpus = data['review_lemm']
corpus

0       i see this movie last year in medium class and...
1       i must admit there be few book with correspond...
2       i think that the shot and light be very poor w...
3       a few week ago i read the classic george orwel...
4       i see this movie literally directly after fini...
                              ...                        
2022    director douglas sirk score again with this th...
2023    spoiler spoiler release in and consider quite ...
2024    fabulous film rent the dvd recently and be flo...
2025    rich alcoholic robert stack fall in love with ...
2026    director douglas sirk once say ` there be a ve...
Name: review_lemm, Length: 2027, dtype: object

In [4]:
stop_words = set(nltk_stopwords.words('english'))

In [5]:
count_tf_idf = TfidfVectorizer(stop_words=stop_words)

tf_idf = count_tf_idf.fit_transform(corpus)

print("The TF-IDF matrix size:", tf_idf.shape)

The TF-IDF matrix size: (2027, 18036)


In [16]:
tf_idf

<2027x18036 sparse matrix of type '<class 'numpy.float64'>'
	with 196664 stored elements in Compressed Sparse Row format>

In [6]:
data.columns

Index(['tconst', 'original_title', 'review', 'review_lemm', 'pos'], dtype='object')

In [8]:
data['pos']

0       0
1       0
2       0
3       0
4       0
       ..
2022    1
2023    1
2024    1
2025    1
2026    1
Name: pos, Length: 2027, dtype: int64

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

target = data['pos']
#features = corpus
features_train, features_valid, target_train, target_valid = train_test_split(
    tf_idf, target, test_size=0.2, random_state=0)

In [18]:
features_train

<1621x18036 sparse matrix of type '<class 'numpy.float64'>'
	with 159016 stored elements in Compressed Sparse Row format>

In [19]:
target_valid

1254    0
898     0
1187    0
37      1
1200    0
       ..
1359    1
516     0
122     0
641     1
457     1
Name: pos, Length: 406, dtype: int64

In [28]:
model = LogisticRegression(random_state=0, solver='liblinear')
model.fit(tf_idf, target)

LogisticRegression(random_state=0, solver='liblinear')

In [None]:
test_set = pd.read_csv('datasets/imdb_reviews_small_lemm_test.tsv', sep='\t')

In [31]:
test_target = test_set['review_lemm']

In [32]:
test_tf_idf = count_tf_idf.transform(test_target)

print("The TF-IDF matrix size:", test_tf_idf.shape)

The TF-IDF matrix size: (2220, 18036)


In [33]:
pred = model.predict(test_tf_idf)

In [35]:
pred.shape

(2220,)

In [36]:
test_set['pos'] = pred

In [37]:
test_set

Unnamed: 0,tconst,original_title,review,review_lemm,pos
0,tt0108999,...And the Earth Did Not Swallow Him,I rented this movie from a local library witho...,i rent this movie from a local library without...,0
1,tt0108999,...And the Earth Did Not Swallow Him,"The movie "". . . And The Earth Did not Swallow...",the movie and the earth do not swallow -PRON- ...,1
2,tt0108999,...And the Earth Did Not Swallow Him,I was very moved by the young life experiences...,i be very move by the young life experience of...,1
3,tt0108999,...And the Earth Did Not Swallow Him,"Recently finally available in DVD (11/11/08), ...",recently finally available in dvd severo p rez...,1
4,tt0063308,"Un minuto per pregare, un istante per morire",I saw this movie over 20 years ago and had rat...,i see this movie over year ago and have rather...,0
...,...,...,...,...,...
2215,tt0472278,Vampire Assassin,"Ron Hall pulls a triple threat as he writes, d...",ron hall pull a triple threat as -PRON- write ...,0
2216,tt0832971,Vanaja,Winning 26 out of the 28 awards it was nominat...,win out of the award -PRON- be nominate for th...,1
2217,tt0832971,Vanaja,Vanaja is a film of superlatives. It has an ex...,vanaja be a film of superlative -PRON- have an...,1
2218,tt0832971,Vanaja,This is not your typical Indian film. There is...,this be not -PRON- typical indian film there b...,1


In [38]:
test_set.to_csv('predictions')

## BERT & Preprocessing

In [8]:
import numpy as np
import torch
import transformers

In [9]:
#initialize the tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [10]:
#Convert the text into IDs of tokens, and the BERT tokenizer will return IDs of tokens rather than tokens
example = 'It is very handy to use transformers'
ids = tokenizer.encode(example, add_special_tokens=True)
print(ids)

[101, 2009, 2003, 2200, 18801, 2000, 2224, 19081, 102]


In [11]:
#BERT accepts vectors of a fixed length, e.g. of 512 tokens. 
n = 512

padded = np.array(ids[:n] + [0]*(n - len(ids)))

print(padded)

[  101  2009  2003  2200 18801  2000  2224 19081   102     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [15]:
#create a mask for the important tokens, indicating zero and non-zero values
#zeros do not carry significant information
attention_mask = np.where(padded != 0, 1, 0)
print(attention_mask.shape)

(512,)


In [17]:
attention_mask

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [19]:
import pandas as pd
import numpy as np
import torch
import transformers

data = pd.read_csv('datasets/imdb_reviews_small_lemm_train.tsv', sep='\t')

# initializing tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# texts to tokens
text = 'It is very handy to use transformers'
ids = tokenizer.encode(text.lower(), add_special_tokens=True)
ids = data['review'].apply(
 lambda x: tokenizer.encode(x.lower(), 
                             add_special_tokens=True, truncation=True, 
                             max_length=512))

# padding (appending zero's to the vector to make its length equal to n)
n = 512
padded_ids = []
for each in ids:
    padded = np.array(each[:n] + [0]*(n - len(each)))
    padded_ids.append(padded)

# creating the attention mask to distinguish tokens we are interested in
attention_mask = np.where(padded != 0, 1, 0)

In [33]:
attention_mask = np.where(padded_ids != 0, 1, 0)

In [34]:
print(attention_mask)

1


In [32]:
print(ids[2026])

[101, 2472, 5203, 2909, 2243, 2320, 2056, 1036, 2045, 1005, 1055, 1037, 2200, 2460, 3292, 2090, 2152, 2396, 1998, 11669, 1010, 1998, 11669, 2008, 3397, 13675, 16103, 2791, 2003, 2011, 2023, 2200, 3737, 20388, 2000, 2396, 1005, 1012, 2023, 4861, 11859, 2010, 5988, 6669, 1010, 1037, 2200, 4310, 2303, 1997, 2147, 2008, 2950, 4438, 2754, 17241, 1010, 6172, 1998, 2162, 3152, 1010, 2530, 2015, 1998, 1997, 2607, 1010, 2010, 3297, 11463, 7716, 14672, 2015, 1012, 2909, 2243, 1005, 1055, 11463, 7716, 14672, 2015, 2020, 1010, 2004, 1996, 2200, 2773, 27353, 1010, 16547, 2007, 2189, 1012, 1996, 2189, 4520, 1996, 4309, 2005, 2010, 3040, 3993, 2806, 1010, 1998, 2296, 6909, 1997, 2010, 8248, 1006, 2909, 2243, 2001, 2036, 1037, 5276, 1007, 3727, 1037, 3928, 3746, 2006, 1996, 3898, 1011, 2357, 1011, 10683, 1012, 2021, 2023, 7110, 1005, 1056, 2166, 2021, 2049, 6630, 1010, 2019, 20017, 1997, 2166, 1012, 2909, 2243, 2196, 2699, 2000, 2265, 4507, 1010, 2006, 1996, 10043, 1012, 3904, 1997, 1996, 5501, 1997, 

###  BERT Embeddings

In [35]:
corpus = data['review_lemm']

In [36]:
corpus

0       i see this movie last year in medium class and...
1       i must admit there be few book with correspond...
2       i think that the shot and light be very poor w...
3       a few week ago i read the classic george orwel...
4       i see this movie literally directly after fini...
                              ...                        
2022    director douglas sirk score again with this th...
2023    spoiler spoiler release in and consider quite ...
2024    fabulous film rent the dvd recently and be flo...
2025    rich alcoholic robert stack fall in love with ...
2026    director douglas sirk once say ` there be a ve...
Name: review_lemm, Length: 2027, dtype: object

In [37]:
# list of vector IDs (padded) and the list of attention masks
ids_list = []
attention_mask_list = []

max_length = 512

for input_text in corpus[:200]:
    ids = tokenizer.encode(input_text.lower(), add_special_tokens=True, truncation=True, max_length=max_length)
    padded = np.array(ids + [0]*(max_length - len(ids)))
    attention_mask = np.where(padded != 0, 1, 0)
    ids_list.append(padded)
    attention_mask_list.append(attention_mask)

In [None]:
import transformers
config = transformers.BertConfig.from_pretrained('bert-base-uncased')
model = transformers.BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…

In [39]:
#
#tqdm lilbrary displays progress of  operation
from tqdm.auto import tqdm

for i in tqdm(range(int(8e6))):
    pass

# the progress bar will appear

HBox(children=(FloatProgress(value=0.0, max=8000000.0), HTML(value='')))




In [None]:
#The BERT model creates embeddings in batches
batch_size = 100

In [None]:
#a loop for the batches
# creating an empty list of review embeddings 
embeddings = []

for i in tqdm(range(len(ids_list) // batch_size)):
    ...

In [None]:
#Transform the data into a tensor format. 
# putting together vectors of ids (of tokens) to a tensor
ids_batch = torch.LongTensor(ids_list[batch_size*i:batch_size*(i+1)])
# putting together vectors of attention masks to a tensor
attention_mask_batch = torch.LongTensor(attention_mask_list[batch_size*i:batch_size*(i+1)])

In [None]:
#Pass the data and the mask to the model to obtain embeddings for the batch:
batch_embeddings = model(ids_batch, attention_mask=attention_mask_batch)

In [None]:
#Use the no_grad() (no gradient) function to indicate that we don't need gradients in the torch library 
#it will make calculations faster
with torch.no_grad():
    batch_embeddings = model(ids_batch, attention_mask=attention_mask_batch)

In [None]:
#Extract the required elements from the tensor and add the list of all the embeddings:
# converting elements of tensor to numpy.array with the numpy() function
embeddings.append(batch_embeddings[0][:,0,:].numpy())

In [None]:
#Putting all the above together, we get this loop:
batch_size = 100

embeddings = []

for i in tqdm(range(len(ids_list) // batch_size)):
    
    ids_batch = torch.LongTensor(ids_list[batch_size*i:batch_size*(i+1)])
    attention_mask_batch = torch.LongTensor(attention_mask_list[batch_size*i:batch_size*(i+1)])

    with torch.no_grad():
        batch_embeddings = model(ids_batch, attention_mask=attention_mask_batch)

    embeddings.append(batch_embeddings[0][:,0,:].numpy())

In [None]:
#concatenate all the embeddings in a matrix of features:
features = np.concatenate(embeddings)

### The features are ready. Time to train the model!

In [40]:
padded_ids[0][:,0,:]

IndexError: too many indices for array: array is 1-dimensional, but 3 were indexed

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
cross_val_score?

#### Classification task

In [None]:
import numpy as np
import pandas as pd

import torch
import transformers

from tqdm.auto import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split