In [1]:
#Data Modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation

#Pytorch
import torch
from torch import nn,optim

In [2]:
# Reading the text files: the raw reviews and their sentiment labels.
with open('Data/reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()
with open('Data/labels.txt', 'r') as labels_file:
    labels = labels_file.read()

In [3]:
# Peek at the start of both raw strings, separated by a dotted divider.
print(reviews[:200], '......', labels[:62], sep='\n')

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  
......
positive
negative
positive
negative
positive
negative
positive


In [4]:
# Normalise case so the same word is never counted under two spellings.
reviews, labels = reviews.lower(), labels.lower()

### Data Pre-processing

In [5]:
# Removing punctuation.
# str.translate deletes every character listed in `punctuation` in one pass.
reviews = reviews.translate(str.maketrans('', '', punctuation))

print('\n' in reviews)

# Creating a list of reviews (one entry per line of the input text).
reviews_split = reviews.split('\n')
# Re-join with a space, not an empty string: otherwise the last word of one
# review fuses with the first word of the next whenever a line has no
# trailing whitespace, producing words that exist in no individual review.
reviews = ' '.join(reviews_split)

print('\n' in reviews)

# Creating a list of all the words in the file.
words = reviews.split()
words[:10]

True
False


['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']

### Encoding the words

In [6]:
from collections import Counter

# Count every word, then list the vocabulary from most to least frequent.
c = Counter(words)
vocab = [word for word, _ in c.most_common()]

# Creating a vocab_to_int dictionary for future reference.
# Ids start at 1, so 0 is never assigned to a real word.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))
print('Unique Words:', len(vocab_to_int))

# Creating a list of word ids for each review.
review_ints = [
    [vocab_to_int[token] for token in line.split()]
    for line in reviews_split
]


Unique Words: 74072


In [7]:
# Encoding the labels: one entry per line of the labels text.
labels = labels.split('\n')
# Only the exact string 'negative' maps to 0; every other entry
# (an empty string included) maps to 1.
encoded_labels = [int(label != 'negative') for label in labels]
encoded_labels[:10]

[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

In [8]:
# The number of encoded reviews and encoded labels should match.
print(len(review_ints), len(encoded_labels), sep='\n')

25001
25001


In [9]:
# Removing zero-length reviews (and the labels at the same positions).
print(f'no. of reviews before the processing:{len(review_ints)}')

zero_len_idx = [i for i, review in enumerate(review_ints) if len(review) == 0]
# Membership tests against a set are O(1); testing against the list for
# every index would make the two filters below quadratic.
_zero_len_lookup = set(zero_len_idx)
review_ints = [review for i, review in enumerate(review_ints) if i not in _zero_len_lookup]
encoded_labels = [label for i, label in enumerate(encoded_labels) if i not in _zero_len_lookup]

print(f'no. of reviews after the processing:{len(review_ints)}')

no. of reviews before the processing:25001
no. of reviews after the processing:25000


In [14]:
#Padding the reviews so that, all reviews have same lengths.
def pad_features(review_ints, seq_length=200):
    """Force every review to exactly ``seq_length`` entries.

    Reviews shorter than ``seq_length`` are left-padded with zeros; longer
    reviews keep only their first ``seq_length`` entries.

    Args:
        review_ints: iterable of reviews, each a sequence of integer ids.
        seq_length: target length of every returned row.

    Returns:
        A new list of lists, one row per input review, in the same order.
    """
    def _fit(review):
        # Too long: keep the leading seq_length entries.
        if len(review) > seq_length:
            return review[:seq_length]
        # Short enough: zeros go in front so the content stays right-aligned.
        return [0] * (seq_length - len(review)) + list(review)

    return [_fit(review) for review in review_ints]


In [15]:
# Sanity check: one review shorter and one longer than the target length.
temp_seq_len = 5
test_review_ints = [[1, 2, 3], list(range(1, 11))]
temp_features = pad_features(test_review_ints, temp_seq_len)
print(list(temp_features))

[[0, 0, 1, 2, 3], [1, 2, 3, 4, 5]]


In [16]:
seq_len = 250

features = pad_features(review_ints=review_ints, seq_length=seq_len)
# Print both counts so they can be compared. The earlier
# `print({len(review_ints)})` built a one-element set and printed `{25000}`
# instead of the number.
print(len(review_ints))
print(len(features))

25000
25000
