## Dataloading

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import re
import sys
import math
import numpy as np
import torch
import torch.utils.data as data

from matplotlib import pyplot as plt

from collections import Counter

import json
import pickle
import pandas as pd
from tqdm import tqdm

from string import punctuation

### Load data

In [None]:
# Root of the training split; the loop below reads the 'pos' and 'neg'
# sub-directories beneath it.
mypath = 'data/train'

f = []       # path of every review file, in load order
labels = []  # labels[i] belongs to f[i]: 1 for 'pos', 0 for 'neg'
for sentiment in ['pos', 'neg']:
    path = mypath + '/' + sentiment
    label = 1 if sentiment == 'pos' else 0
    for (dirpath, dirnames, filenames) in os.walk(path):
        # os.walk also descends into sub-directories, so the file lives under
        # `dirpath`, not necessarily directly under `path`.
        # Sorting keeps the load order reproducible across filesystems.
        f.extend(os.path.join(dirpath, filename)
                 for filename in sorted(filenames))
        labels.extend([label] * len(filenames))


reviews = []

for file in tqdm(f):
    with open(file, 'r', encoding='utf-8') as fb:
        # Flatten each review onto one line: the reviews are joined with '\n'
        # below and split on '\n' again in a later cell, so an embedded line
        # break would produce extra "reviews" and misalign them with `labels`.
        reviews.append(fb.read().replace('\n', ' '))

# Single corpus string, one review per line, in the same order as `labels`.
reviews = '\n'.join(reviews)

In [None]:
# Normalise case, then delete every character listed in string.punctuation.
# The translation table maps each punctuation character to "deleted".
reviews = reviews.lower()
all_text = reviews.translate(str.maketrans('', '', punctuation))

In [None]:
# Table of unwanted character sequences, each mapped to the empty string.
# NOTE(review): nothing in the visible cells reads `chars`; the actual
# clean-up is done by the ASCII filter below - confirm whether the table is
# still needed.
chars = dict.fromkeys(
    (
        '\xc2\x82',  # high code comma
        '\xc2\x84',  # high code double comma
        '\xc2\x85',  # triple dot
        '\xc2\x88',  # high carat
        '\xc2\x91',  # forward single quote
        '\xc2\x92',  # reverse single quote
        '\xc2\x93',  # forward double quote
        '\xc2\x94',  # reverse double quote
        '\xc2\x95',
        '\xc2\x96',  # high hyphen
        '\xc2\x97',  # double hyphen
        '\xc2\x99',
        '\xc2\xa0',
        '\xc2\xa6',  # split vertical bar
        '\xc2\xab',  # double less than
        '\xc2\xbb',  # double greater than
        '\xc2\xbc',  # one quarter
        '\xc2\xbd',  # one half
        '\xc2\xbe',  # three quarters
        '\xca\xbf',  # c-single quote
        '\xcc\xa8',  # modifier - under curve
        '\xcc\xb1',  # modifier - under line
    ),
    '',
)

# Replace every character whose code point is 128 or above with one space;
# ASCII characters (including '\n') are left untouched.
all_text = re.sub(r'[^\x00-\x7f]', ' ', all_text)

In [None]:
# Cut the cleaned corpus at each '\n' to get a list of review strings again.
reviews = all_text.split(sep='\n')

### Create Vocabulary

In [None]:
# Concatenate all reviews into one corpus string.
all_words = ' '.join(reviews)
# Pattern unchanged, now written as a raw string so '\w' is no longer an
# invalid escape sequence in the literal. It matches a literal '[x]' followed
# by two word characters.
# NOTE(review): presumably meant to strip '\xNN' residue - confirm. An earlier
# cell already removes string.punctuation (which includes '[', ']' and '\'),
# so this is expected to match nothing in this pipeline.
all_words = re.sub(r'\[x]\w\w', '', all_words)

# Tokenise with split() (any run of whitespace), exactly like the
# tokenisation cell does per review. split(' ') would create an
# empty-string "word" for every doubled space and would keep tokens glued
# together by tabs, so the vocabulary could miss words the tokeniser looks up.
word_counter = Counter(all_words.split())
word_list = list(word_counter)    # every distinct word
total_words = len(word_list)      # vocabulary size
# (word, count) pairs, most frequent first.
sorted_words = word_counter.most_common(total_words)
# Indices start at 1, so 0 is never assigned to a word.
word_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}

### Tokenize reviews

In [None]:
# Encode each review as the list of vocabulary indices of its
# whitespace-separated words. A word missing from word_to_int raises KeyError.
tokenized_reviews = []
for review in reviews:
    encoded = list(map(word_to_int.__getitem__, review.split()))
    tokenized_reviews.append(encoded)

# Quick sanity check: show the first four encoded reviews.
print(tokenized_reviews[:4])

In [None]:
# Number of tokens in each encoded review.
reviews_len = list(map(len, tokenized_reviews))
lens = pd.Series(reviews_len)

# Plot the length distribution, then leave the summary statistics as the
# cell's final expression so the notebook displays them.
lens.hist()
plt.show()
lens.describe()