In [1]:
import torch
import pandas as pd
import numpy as np
from TextCNN import TextCNN
import gzip
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from Embedder import Embedder

In [2]:
FILENAME = "movies.txt"
COLUMNS = ["productId",
           "userId",
           "profileName",
           "helpfulness",
           "score",
           "time",
           "summary",
           "text"]
COL_NUM = 8
REVIEW_NUM = 140000
PADDED_LEN = 200
GLOVE_BINARY_PATH = "glove.6B.50d.txt"

In [3]:
df = pd.read_csv(FILENAME,
                 header=None,
                 delimiter="\n",
                 error_bad_lines=False,
                 skip_blank_lines=True,
                 encoding="latin-1",
                 nrows=COL_NUM*REVIEW_NUM)
df = pd.DataFrame(np.reshape(df.values,(REVIEW_NUM,COL_NUM)),columns=COLUMNS)

In [4]:
df["productId"] = df["productId"].str.replace("product/productId:","")
for col in COLUMNS[1:]:
    df[col] = df[col].str.replace("review/" + col + ":","")

In [5]:
df.head()

Unnamed: 0,productId,userId,profileName,helpfulness,score,time,summary,text
0,B003AI2VGA,A141HP4LYPWMSR,"Brian E. Erland ""Rainbow Sphinx""",7/7,3.0,1182729600,"""There Is So Much Darkness Now ~ Come For The...","Synopsis: On the daily trek from Juarez, Mexi..."
1,B003AI2VGA,A328S9RN3U5M68,Grady Harp,4/4,3.0,1181952000,Worthwhile and Important Story Hampered by Po...,THE VIRGIN OF JUAREZ is based on true events ...
2,B003AI2VGA,A1I7QGUDP043DG,"Chrissy K. McVay ""Writer""",8/10,5.0,1164844800,This movie needed to be made.,The scenes in this film can be very disquieti...
3,B003AI2VGA,A1M5405JH9THP9,golgotha.gov,1/1,3.0,1197158400,distantly based on a real tragedy,THE VIRGIN OF JUAREZ (2006)<br />directed by ...
4,B003AI2VGA,ATXL536YX71TR,"KerrLines ""&#34;Movies,Music,Theatre&#34;""",1/1,3.0,1188345600,"""What's going on down in Juarez and shining a...","Informationally, this SHOWTIME original is es..."


In [6]:
X = df["text"]
y = df["score"].astype("float").astype("int").values.reshape([-1,1])
y_one_hot = OneHotEncoder().fit_transform(y).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
#df[df["score"] == "review/helpfulness: 0/0"]


In [8]:
embedder = Embedder(None,50,PADDED_LEN,GLOVE_BINARY_PATH)
X_embedded = embedder.str_series_to_image(X)

140000


In [9]:
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import TensorDataset,DataLoader

In [10]:
LEARNING_RATE = 0.025
FC_LAYER = 100
CLASSES_LEN = 5
EMBEDDING_DIM = 50
CONV_FILTERS = 512
EPOCHS = 100
TRAIN_TEST_RATION = 0.15
TRAIN_SIZE = (1 - TRAIN_TEST_RATION) * len(X_embedded)
TEST_SIZE = (TRAIN_TEST_RATION) * len(X_embedded)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_embedded, y_one_hot, test_size=TRAIN_TEST_RATION)
X_train = X_train.reshape([-1,1,PADDED_LEN,EMBEDDING_DIM])
X_test = X_test.reshape([-1,1,PADDED_LEN,EMBEDDING_DIM])
X_train_tensor = torch.Tensor(X_train).to(device)
X_test_tensor = torch.Tensor(X_test).to(device)
y_train_tensor = torch.Tensor(y_train).long().to(device)
y_test_tensor = torch.Tensor(y_test).long().to(device)
#y_train_tensor = torch.Tensor(y_train).to(device)
#y_test_tensor = torch.Tensor(y_test).to(device)

train_dataset = TensorDataset(X_train_tensor,y_train_tensor)
test_dataset = TensorDataset(X_test_tensor,y_test_tensor)
trainloader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
testloader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [13]:
model = TextCNN(hidden_units=FC_LAYER,
                output_len=CLASSES_LEN,
                textcnn_filter_count=CONV_FILTERS,
                sentence_max_size=PADDED_LEN,
                word_embedding_dimension=EMBEDDING_DIM)
criterion = nn.CrossEntropyLoss()
#criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),weight_decay=1e-5)
model.to(device)

TextCNN(
  (conv3): Conv2d(1, 512, kernel_size=(3, 50), stride=(1, 1))
  (conv4): Conv2d(1, 512, kernel_size=(4, 50), stride=(1, 1))
  (conv5): Conv2d(1, 512, kernel_size=(5, 50), stride=(1, 1))
  (Max3_pool): MaxPool2d(kernel_size=(198, 1), stride=(198, 1), padding=0, dilation=1, ceil_mode=False)
  (Max4_pool): MaxPool2d(kernel_size=(197, 1), stride=(197, 1), padding=0, dilation=1, ceil_mode=False)
  (Max5_pool): MaxPool2d(kernel_size=(196, 1), stride=(196, 1), padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1536, out_features=100, bias=True)
  (dropout1): Dropout(p=0.5)
  (linear1): Linear(in_features=100, out_features=5, bias=True)
)

In [14]:
X_train.shape

(119000, 1, 200, 50)

In [15]:
def check_accuracy_classification(data_loader,model,name):
    correct = 0
    total = 0
    total = len(data_loader)
    testloader.dataset.tensors[0]
    with torch.no_grad():
        for data in data_loader:
            images, labels = data
            outputs = model(images)
            labels = torch.max(labels, 1)[1]
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the '+name +' reviews: %d %%' % (100 * correct / total))


In [16]:
def check_accuracy_mse(data_loader,model,name):
    batch_losses = []
    with torch.no_grad():
        for data in data_loader:
            images, labels = data
            outputs = model(images)
            loss = criterion(outputs, labels)
            batch_losses.append(loss.item())
    #print(batch_losses)
    mse = np.array(batch_losses).mean()
    print('MSE of the network on the '+name +' reviews: %f' % mse)

In [27]:
for epoch in range(EPOCHS):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        
        # forward + backward + optimize
        outputs = model(inputs)
        #loss = criterion(outputs,labels)
        loss = criterion(outputs, torch.max(labels, 1)[1])
        loss.backward()
        optimizer.step()
        # zero the parameter gradients
        optimizer.zero_grad()
        # print statistics
        running_loss += loss.item()
        #if i % 100 == 0:    # print every 2000 mini-batches
    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2))
    #print('epoch: [%d] loss: %.3f' % (epoch + 1, running_loss))
    check_accuracy_classification(trainloader,model,"train")
    check_accuracy_classification(testloader,model,"test")
#     check_accuracy_mse(trainloader,model,"train")
#     check_accuracy_mse(testloader,model,"test")
print('Finished Training')

[1,   117] loss: 50.317
Accuracy of the network on the train reviews: 63 %
Accuracy of the network on the test reviews: 62 %
[2,   117] loss: 41.768
Accuracy of the network on the train reviews: 73 %
Accuracy of the network on the test reviews: 72 %
[3,   117] loss: 38.650
Accuracy of the network on the train reviews: 74 %
Accuracy of the network on the test reviews: 73 %
[4,   117] loss: 36.546
Accuracy of the network on the train reviews: 74 %
Accuracy of the network on the test reviews: 71 %
[5,   117] loss: 35.326
Accuracy of the network on the train reviews: 76 %
Accuracy of the network on the test reviews: 72 %
[6,   117] loss: 34.415
Accuracy of the network on the train reviews: 77 %
Accuracy of the network on the test reviews: 73 %
[7,   117] loss: 33.578
Accuracy of the network on the train reviews: 77 %
Accuracy of the network on the test reviews: 73 %
[8,   117] loss: 32.747
Accuracy of the network on the train reviews: 78 %
Accuracy of the network on the test reviews: 73 %


KeyboardInterrupt: 

In [None]:
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = model(images)
        labels = torch.max(labels, 1)[1]
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy of the network on the '+ str(TEST_SIZE) +' test reviews: %d %%' % (100 * correct / total))


In [None]:
correct = 0
total = 0
with torch.no_grad():
    for data in trainloader:
        images, labels = data
        outputs = model(images)
        labels = torch.max(labels, 1)[1]
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy of the network on the ' + str(TRAIN_SIZE) + ' train reviews: %d %%' % (100 * correct / total))


In [56]:
reviews = ["the movie was okay at best","the movie was trash","this is the greatest thing I have ever seen!"]
with torch.no_grad():
    for review in reviews:
        review_data = torch.Tensor(embedder.str_to_image(review).reshape([-1,1,PADDED_LEN,EMBEDDING_DIM])).long().to(device)
        res = model(review_data)
        print("review: " + review + ". rank: " + str(torch.argmax(res).item() + 1))

review: the movie was okay at best. rank: 3
review: the movie was trash. rank: 1
review: this is the greatest thing I have ever seen!. rank: 5


In [43]:
torch.max(labels, 1)[1]

tensor([4, 4, 1,  ..., 4, 1, 3])

In [None]:
model(inputs)
