In [1]:
import torch
import pandas as pd
import numpy as np
from TextCNN import TextCNN
import gzip
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from Embedder import Embedder

In [2]:
FILENAME = "movies.txt"
COLUMNS = ["productId",
           "userId",
           "profileName",
           "helpfulness",
           "score",
           "time",
           "summary",
           "text"]
COL_NUM = 8
REVIEW_NUM = 25000
PADDED_LEN = 200
GLOVE_BINARY_PATH = "glove.6B.50d.txt"

In [3]:
df = pd.read_csv(FILENAME,
                 header=None,
                 delimiter="\n",
                 error_bad_lines=False,
                 skip_blank_lines=True,
                 encoding="latin-1",
                 nrows=COL_NUM*REVIEW_NUM)
df = pd.DataFrame(np.reshape(df.values,(REVIEW_NUM,COL_NUM)),columns=COLUMNS)

In [4]:
df["productId"] = df["productId"].str.replace("product/productId:","")
for col in COLUMNS[1:]:
    df[col] = df[col].str.replace("review/" + col + ":","")

In [5]:
df.head()

Unnamed: 0,productId,userId,profileName,helpfulness,score,time,summary,text
0,B003AI2VGA,A141HP4LYPWMSR,"Brian E. Erland ""Rainbow Sphinx""",7/7,3.0,1182729600,"""There Is So Much Darkness Now ~ Come For The...","Synopsis: On the daily trek from Juarez, Mexi..."
1,B003AI2VGA,A328S9RN3U5M68,Grady Harp,4/4,3.0,1181952000,Worthwhile and Important Story Hampered by Po...,THE VIRGIN OF JUAREZ is based on true events ...
2,B003AI2VGA,A1I7QGUDP043DG,"Chrissy K. McVay ""Writer""",8/10,5.0,1164844800,This movie needed to be made.,The scenes in this film can be very disquieti...
3,B003AI2VGA,A1M5405JH9THP9,golgotha.gov,1/1,3.0,1197158400,distantly based on a real tragedy,THE VIRGIN OF JUAREZ (2006)<br />directed by ...
4,B003AI2VGA,ATXL536YX71TR,"KerrLines ""&#34;Movies,Music,Theatre&#34;""",1/1,3.0,1188345600,"""What's going on down in Juarez and shining a...","Informationally, this SHOWTIME original is es..."


In [6]:
X = df["text"]
y = df["score"].astype("float").astype("int").values.reshape([-1,1])
y_one_hot = OneHotEncoder().fit_transform(y).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
#df[df["score"] == "review/helpfulness: 0/0"]


In [8]:
embedder = Embedder(None,50,PADDED_LEN,GLOVE_BINARY_PATH)
X_embedded = embedder.str_series_to_image(X)

25000


In [9]:
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import TensorDataset,DataLoader

In [10]:
LEARNING_RATE = 0.025
FC_LAYER = 512
CLASSES_LEN = 5
EMBEDDING_DIM = 50
CONV_FILTERS = 128
EPOCHS = 50
TRAIN_TEST_RATION = 0.2
TRAIN_SIZE = (1 - TRAIN_TEST_RATION) * len(X_embedded)
TEST_SIZE = (TRAIN_TEST_RATION) * len(X_embedded)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_embedded, y_one_hot, test_size=TRAIN_TEST_RATION)
X_train = X_train.reshape([-1,1,EMBEDDING_DIM,PADDED_LEN])
X_test = X_test.reshape([-1,1,EMBEDDING_DIM,PADDED_LEN])
X_train_tensor = torch.Tensor(X_train).to(device)
X_test_tensor = torch.Tensor(X_test).to(device)
y_train_tensor = torch.Tensor(y_train).long().to(device)
y_test_tensor = torch.Tensor(y_test).long().to(device)
#y_train_tensor = torch.Tensor(y_train).to(device)
#y_test_tensor = torch.Tensor(y_test).to(device)

train_dataset = TensorDataset(X_train_tensor,y_train_tensor)
test_dataset = TensorDataset(X_test_tensor,y_test_tensor)
trainloader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
testloader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [13]:
model = TextCNN(hidden_units=FC_LAYER,
                output_len=CLASSES_LEN,
                textcnn_filter_count=CONV_FILTERS,
                sentence_max_size=PADDED_LEN,
                word_embedding_dimension=EMBEDDING_DIM)
criterion = nn.CrossEntropyLoss()
#criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-2, weight_decay=1e-5)
model.to(device)

TextCNN(
  (conv3): Conv2d(1, 128, kernel_size=(50, 3), stride=(1, 1))
  (conv4): Conv2d(1, 128, kernel_size=(50, 4), stride=(1, 1))
  (conv5): Conv2d(1, 128, kernel_size=(50, 5), stride=(1, 1))
  (conv6): Conv2d(1, 128, kernel_size=(50, 8), stride=(1, 1))
  (conv7): Conv2d(1, 128, kernel_size=(50, 11), stride=(1, 1))
  (conv8): Conv2d(1, 128, kernel_size=(50, 20), stride=(1, 1))
  (Avg3_pool): AvgPool2d(kernel_size=(1, 198), stride=(1, 198), padding=0)
  (Avg4_pool): AvgPool2d(kernel_size=(1, 197), stride=(1, 197), padding=0)
  (Avg5_pool): AvgPool2d(kernel_size=(1, 196), stride=(1, 196), padding=0)
  (Avg6_pool): AvgPool2d(kernel_size=(1, 193), stride=(1, 193), padding=0)
  (Avg7_pool): AvgPool2d(kernel_size=(1, 190), stride=(1, 190), padding=0)
  (Avg8_pool): AvgPool2d(kernel_size=(1, 180), stride=(1, 180), padding=0)
  (Max3_pool): MaxPool2d(kernel_size=(1, 198), stride=(1, 198), padding=0, dilation=1, ceil_mode=False)
  (Max4_pool): MaxPool2d(kernel_size=(1, 197), stride=(1, 197),

In [14]:
def check_accuracy_classification(data_loader,model,name):
    correct = 0
    total = 0
    total = len(data_loader)
    testloader.dataset.tensors[0]
    with torch.no_grad():
        for data in data_loader:
            images, labels = data
            outputs = model(images)
            labels = torch.max(labels, 1)[1]
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the '+name +' reviews: %d %%' % (100 * correct / total))


In [15]:
def check_accuracy_mse(data_loader,model,name):
    batch_losses = []
    with torch.no_grad():
        for data in data_loader:
            images, labels = data
            outputs = model(images)
            loss = criterion(outputs, labels)
            batch_losses.append(loss.item())
    #print(batch_losses)
    mse = np.array(batch_losses).mean()
    print('MSE of the network on the '+name +' reviews: %f' % mse)

In [16]:
for epoch in range(EPOCHS):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        
        # forward + backward + optimize
        outputs = model(inputs)
        #loss = criterion(outputs,labels)
        loss = criterion(outputs, torch.max(labels, 1)[1])
        loss.backward()
        optimizer.step()
        # zero the parameter gradients
        optimizer.zero_grad()
        # print statistics
        running_loss += loss.item()
        #if i % 100 == 0:    # print every 2000 mini-batches
    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2))
    #print('epoch: [%d] loss: %.3f' % (epoch + 1, running_loss))
    check_accuracy_classification(trainloader,model,"train")
    check_accuracy_classification(testloader,model,"test")
#     check_accuracy_mse(trainloader,model,"train")
#     check_accuracy_mse(testloader,model,"test")
print('Finished Training')

  x = F.log_softmax(self.linear1(x))


[1,    20] loss: 14.999
Accuracy of the network on the train reviews: 55 %
Accuracy of the network on the test reviews: 54 %
[2,    20] loss: 12.359
Accuracy of the network on the train reviews: 56 %
Accuracy of the network on the test reviews: 55 %
[3,    20] loss: 12.137
Accuracy of the network on the train reviews: 56 %
Accuracy of the network on the test reviews: 55 %
[4,    20] loss: 11.675
Accuracy of the network on the train reviews: 57 %
Accuracy of the network on the test reviews: 55 %
[5,    20] loss: 11.019
Accuracy of the network on the train reviews: 58 %
Accuracy of the network on the test reviews: 55 %
[6,    20] loss: 10.079
Accuracy of the network on the train reviews: 62 %
Accuracy of the network on the test reviews: 56 %
[7,    20] loss: 9.153
Accuracy of the network on the train reviews: 65 %
Accuracy of the network on the test reviews: 56 %
[8,    20] loss: 8.163
Accuracy of the network on the train reviews: 70 %
Accuracy of the network on the test reviews: 54 %
[9

In [17]:
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = model(images)
        labels = torch.max(labels, 1)[1]
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy of the network on the '+ str(TEST_SIZE) +' test reviews: %d %%' % (100 * correct / total))


Accuracy of the network on the 5000.0 test reviews: 56 %


In [18]:
correct = 0
total = 0
with torch.no_grad():
    for data in trainloader:
        images, labels = data
        outputs = model(images)
        labels = torch.max(labels, 1)[1]
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy of the network on the ' + str(TRAIN_SIZE) + ' train reviews: %d %%' % (100 * correct / total))


Accuracy of the network on the 20000.0 train reviews: 97 %


In [19]:
#review = r"Informationally, this SHOWTIME original is essential viewing for enlightening audiences on the dangerous plight of women migrant workers from Los Angeles who are routinely and mysteriously murdered on the border between Juarez, Mexico and El Paso, Texas.Since 1993, hundreds of woman have turned up dead and mutilated and no one seems to either care or have an answer. The fear and death continue.<br /><br />Director Kevin Dobson has brought us Michael Fallon's screenplay THE VIRGIN OF JUAREZ to the screen as a fanciful/possible solution to what may be happening down in Juarez.Though this is but a film, the information about the unsolved murders and the religious fanaticism in the area is crucial.<br /><br />Suggested companion film would be AGNES OF GOD."
review = "the movie was perfect"
with torch.no_grad():
    res = model(torch.Tensor(embedder.str_to_image(review).reshape([1,1,EMBEDDING_DIM,PADDED_LEN])).long().to(device))
    print(res)

ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 512])

In [None]:
torch.max(labels, 1)[1]

In [None]:
model(inputs)
