In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import re

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm_notebook

# Load Data

In [33]:
def string_clean(string):
    """Normalise one raw line of text into lowercase, space-separated tokens.

    The substitutions below run in order and each match becomes one space:
    first every character other than ASCII letters, digits, ``( ) , ! ?``,
    apostrophe and backtick; then the contraction suffixes
    ``'s 've n't 're 'd 'll``; then the punctuation ``, ! ( ) ?``; finally
    runs of two or more whitespace characters. The result is stripped and
    lowercased.

    Lowercasing happens last, so the contraction patterns only match their
    lowercase spelling in the input.
    """
    # Order matters only in that the whitespace collapse must come last.
    patterns = (
        r"[^A-Za-z0-9(),!?\'\`]",
        r"\'s",
        r"\'ve",
        r"n\'t",
        r"\'re",
        r"\'d",
        r"\'ll",
        r",",
        r"!",
        r"\(",
        r"\)",
        r"\?",
        r"\s{2,}",
    )
    cleaned = string
    for pattern in patterns:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip().lower()

In [128]:
# Load the pre-trained GloVe vectors (the file name indicates glove.6B, 300-d).
# word_idx maps word -> integer id; embedding_dict maps id -> float32 vector.
# Ids are the 0-based file line number plus one, so id 0 is never assigned here.
embedding_dict = {}
word_idx = {}
with open("glove/glove.6B.300d.txt", 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # Each line is whitespace separated: the word first, then its values.
        values = line.split()
        word, vector = values[0], np.asarray(values[1:], "float32")
        word_idx[word] = i + 1
        # Reuse the id just stored so both tables always agree.
        embedding_dict[word_idx[word]] = vector

In [34]:
# Load Data
# A word is kept only if it occurs MORE than this many times across both
# polarity files.
tf_threshold = 5


def _read_polarity_file(path, term_counts):
    """Read one review per line from `path` and tokenize it with string_clean.

    Every token occurrence is tallied into `term_counts` (a dict, mutated in
    place) so counts accumulate across the files passed through this helper.

    Returns a list holding one token list per line of the file.
    """
    documents = []
    with open(path, 'r', encoding="ISO-8859-1") as f:
        for line in f:
            # split() rather than split(" "): a line that cleans down to ""
            # must yield [] and not [""], otherwise the empty string would be
            # counted (and possibly kept) as if it were a word.
            words = string_clean(line).split()
            for word in words:
                term_counts[word] = term_counts.get(word, 0) + 1
            documents.append(words)
    return documents


def _drop_rare_words(documents, term_counts, threshold):
    """Return `documents` without the words whose count is <= `threshold`."""
    return [[word for word in doc if term_counts[word] > threshold] for doc in documents]


tf = {}
positive_data = _read_polarity_file("Data/rt-polarity.pos", tf)
negative_data = _read_polarity_file("Data/rt-polarity.neg", tf)

# Filter only after BOTH files are read so the counts cover the whole corpus.
positive_data = _drop_rare_words(positive_data, tf, tf_threshold)
negative_data = _drop_rare_words(negative_data, tf, tf_threshold)

In [124]:
# Longest negative review, in tokens, after the rare-word filtering above.
max(len(doc) for doc in negative_data)

49

In [181]:
# One input channel, 100 filters; each filter covers 3 consecutive rows and
# the full 300-wide row. Padding of 1 along the row axis keeps the number of
# output positions equal to the number of input rows.
cnn1 = nn.Conv2d(
    in_channels=1,
    out_channels=100,
    kernel_size=(3, 300),
    stride=1,
    padding=(1, 0),
)

In [200]:
# Embed the first positive review: one GloVe vector per word that has an
# entry in word_idx (words without one are skipped).
first_review = positive_data[0]
example = [embedding_dict[word_idx[word]] for word in first_review if word in word_idx]
# Right-pad with all-zero vectors up to 50 rows; nothing is appended when the
# review already has 50 or more vectors.
pad_vector = np.zeros(300, dtype=np.float32)
example.extend([pad_vector] * (50 - len(example)))

In [201]:
# Stack the list of equally sized vectors row-wise into a single 2-D array.
example = np.vstack(example)

In [174]:
# Expect (50, 300): 50 padded token rows with 300 embedding values each.
example.shape

(50, 300)

In [203]:
# Convert the array in one step and then add the leading batch axis.
# Wrapping the ndarray in a Python list first makes torch.tensor walk the
# list element by element, which is slow and warns in recent PyTorch releases.
example = torch.tensor(example).unsqueeze(0)

In [205]:
# Expect a leading batch axis of size 1 in front of the (50, 300) matrix.
example.shape

torch.Size([1, 50, 300])

In [176]:
# `example` already has a leading batch axis, (1, 50, 300), so keep it and
# only insert the channel axis Conv2d expects: (batch, 1, tokens, embedding).
# Using shape[0] as the token count would collapse the rows into (1, 1, 1, 15000).
example.view(example.shape[0], 1, example.shape[1], -1)

tensor([[[[ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
          [-0.1492,  0.0212, -0.3424,  ...,  0.6468, -0.3724, -0.0851],
          [-0.1749,  0.2296,  0.2492,  ..., -0.2413, -0.4040,  0.0547],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]]])

In [182]:
# Feed Conv2d a (batch, 1, tokens, embedding) tensor. `example` already has
# the batch axis in front, so the token count is shape[1], not shape[0].
cnn1_out = cnn1(example.view(example.shape[0], 1, example.shape[1], -1))

In [183]:
# Expect (batch, filters, positions, 1): the row padding keeps all 50
# positions and the kernel spans the full embedding width, leaving width 1.
cnn1_out.shape

torch.Size([1, 100, 50, 1])

In [184]:
# A single pooling window covering all 50 positions of a width-1 feature map,
# i.e. a max over the whole position axis.
max_pool1 = nn.MaxPool2d(kernel_size=(50, 1), stride=1)

In [194]:
# Max over the 50 positions of every feature map; squeeze() then drops all
# size-1 axes (including the batch axis here), leaving one value per filter.
max_pool1(cnn1_out).squeeze()

tensor([ 0.1212,  0.4196,  0.2795,  0.3341,  0.2226,  0.6094,  0.1546,  0.2416,
         0.6674,  0.4087,  0.5708,  0.4205,  0.3769,  0.1018,  0.1418,  0.3450,
         0.1363,  0.2967,  0.3514,  0.3392,  0.6232,  0.3852,  0.3943,  0.6143,
         0.4259,  0.2466,  0.3566,  0.3468,  0.3936,  0.1709,  0.6027,  0.1455,
         0.5588,  0.0249,  0.6435,  0.4619,  0.6818,  0.5475,  0.4456,  0.4555,
         0.5190,  0.2502,  0.3175,  0.2492,  0.1273,  0.2588,  0.3416,  0.6059,
        -0.0201,  0.1620,  0.2049,  0.5310,  0.3901,  0.2099,  0.1339,  0.2594,
         0.2276,  0.2788,  0.1737,  0.3962,  0.3584,  0.2871,  0.1587,  0.3157,
         0.1932,  0.3522,  0.3275,  0.1692,  0.1718,  0.5287,  0.1732,  0.2781,
         0.2962,  0.3043,  0.3447,  0.6436,  0.3131,  0.2282,  0.1747,  0.3347,
         0.4435,  0.3825,  0.4271,  0.3654,  0.3627,  0.1829,  0.3176,  0.4835,
         0.2383,  0.6155,  0.3340,  0.5330,  0.4106,  0.3284,  0.3735,  0.4090,
         0.2674,  0.4157,  0.2181,  0.46

In [223]:
class CnnRandom(nn.Module):
    """CNN document classifier over a trainable ``nn.Embedding`` table.

    Token ids are embedded, passed through one convolution per entry of
    ``filter_sizes`` (each kernel spans the full embedding width), max-pooled
    over the token axis, concatenated, and mapped through dropout and a linear
    layer to one sigmoid probability per document.

    Args:
        num_filters: number of convolution branches; must equal
            ``len(filter_sizes)``.
        filter_sizes: kernel height, in tokens, of each branch.
        embedding_size: width of the word embeddings.
        number_words: vocabulary size; the table gets ``number_words + 1``
            rows, so ids ``0..number_words`` are valid.
        out_channels: feature maps produced by each branch.
        doc_length: fixed number of tokens per input document; it sizes the
            pooling windows, so inputs must have exactly this length.
        drop_rate: dropout probability.
    """

    def __init__(self, num_filters, filter_sizes, embedding_size, number_words, out_channels, doc_length, drop_rate=0.5):
        super(CnnRandom, self).__init__()
        assert num_filters == len(filter_sizes)
        self.embeddings = nn.Embedding(number_words + 1, embedding_size)
        # nn.ModuleList, not a plain list: only registered submodules show up
        # in parameters(), state_dict() and .to(device). With plain lists the
        # convolution weights would never reach the optimizer.
        self.cnns = nn.ModuleList([
            # Kernel shape follows the constructor arguments so the model is
            # not tied to 100 feature maps / 300-d embeddings.
            nn.Conv2d(1, out_channels, (filter_size, embedding_size), 1)
            for filter_size in filter_sizes
        ])
        self.pools = nn.ModuleList([
            # An un-padded convolution leaves doc_length + 1 - filter_size
            # positions; one window over all of them is a global max.
            nn.MaxPool2d((doc_length + 1 - filter_size, 1), 1)
            for filter_size in filter_sizes
        ])

        self.linear = nn.Linear(out_channels * num_filters, 1)
        self.drop = nn.Dropout(p=drop_rate)
        self.sigmoid = nn.Sigmoid()

    def forward(self, sents):
        """Return one probability per document.

        Args:
            sents: integer tensor of token ids whose first axis is the batch
                and second axis is the token position (a trailing size-1 axis
                is tolerated because of the ``-1`` in the reshape below).

        Returns:
            Float tensor of shape ``(batch,)`` with values in (0, 1).
        """
        X = self.embeddings(sents)
        batch_size = X.shape[0]
        # (batch, 1, tokens, embedding): a single input channel for Conv2d.
        X = X.view(batch_size, 1, X.shape[1], -1)
        # Flatten each pooled map to (batch, out_channels) explicitly; a bare
        # squeeze() would also drop the batch axis when batch_size == 1.
        X_features = [
            pool(cnn(X)).view(batch_size, -1)
            for cnn, pool in zip(self.cnns, self.pools)
        ]
        # Concatenate along the feature axis so every document keeps its row.
        X = torch.cat(X_features, 1)
        # Dropout regularises the pooled features; applying it to the single
        # output logit would randomly zero or rescale the prediction itself.
        X = self.linear(self.drop(X))
        return self.sigmoid(X).squeeze(-1)

In [226]:
# Three convolution branches (kernel heights 3, 4 and 5) with 100 feature maps
# each, over 300-d embeddings for the GloVe vocabulary and 50-token documents.
cr = CnnRandom(
    num_filters=3,
    filter_sizes=[3, 4, 5],
    embedding_size=300,
    number_words=len(embedding_dict),
    out_channels=100,
    doc_length=50,
)

In [227]:
# Encode the first positive review as vocabulary ids (words missing from
# word_idx are skipped) and right-pad with id 0 up to the 50 tokens the model
# was built for.
example = [word_idx[word] for word in positive_data[0] if word in word_idx]
example += [0] * (50 - len(example))
# Build the (1, 50) id batch straight from the list with an explicit int64
# dtype. Going through np.vstack would give a platform-dependent integer
# width (nn.Embedding expects long indices) and an extra trailing axis.
example = torch.tensor([example], dtype=torch.long)
cr(example)

tensor([0.2004], grad_fn=<SigmoidBackward>)