In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
from data_loading_code import preprocess_pandas
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device is", device)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


device is cpu


In [2]:
#nltk.download('punkt')
#nltk.download('stopwords')
# get data, pre-process and split
data = pd.read_csv("amazon_cells_labelled.txt", delimiter='\t', header=None)
data.columns = ['Sentence', 'Class']
data['index'] = data.index                                          # add new column index
columns = ['index', 'Class', 'Sentence']
data = preprocess_pandas(data, columns)                             # pre-process
training_data, validation_data, training_labels, validation_labels = train_test_split( # split the data into training, validation, and test splits
    data['Sentence'].values.astype('U'),
    data['Class'].values.astype('int32'),
    test_size=0.10,
    random_state=0,
    shuffle=True
)

print(training_data)
print(type(training_data[0]))

# vectorize data using TFIDF and transform for PyTorch for scalability
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=50000, max_df=0.5, use_idf=True, norm='l2')
training_data = word_vectorizer.fit_transform(training_data)        # transform texts to sparse matrix
training_data = training_data.todense()                             # convert to dense matrix for Pytorch
vocab_size = len(word_vectorizer.vocabulary_)
validation_data = word_vectorizer.transform(validation_data)
validation_data = validation_data.todense()

train_x_tensor = torch.from_numpy(np.array(training_data)).type(torch.FloatTensor)
train_y_tensor = torch.from_numpy(np.array(training_labels)).long()
validation_x_tensor = torch.from_numpy(np.array(validation_data)).type(torch.FloatTensor)
validation_y_tensor = torch.from_numpy(np.array(validation_labels)).long()

['this allows the possibility of double booking for the same date and time after the first.'
 'my sister has one also and she loves it.'
 "the one big drawback of the mp player is that the buttons on the phone's front cover that let you pause and skip songs lock out after a few seconds."
 'the cutouts and buttons are placed perfectly.'
 'this is definitely a must have if your state does not allow cell phone usage while driving.'
 'these are fabulous!' 'nice sound.'
 "i can't use this case because the smell is disgusting."
 'i really like this product over the motorola because it is allot clearer on the ear piece and the mic.'
 'fast service.' 'i found this product to be waaay too big.'
 "it plays louder than any other speaker of this size; the price is so low that most would think the quality is lacking, however, it's not."
 'no buyers remorse on this one!.'
 'i had to go to a store and bought a new nokia phone which is working great.'
 'poor quality and service.'
 'the worst piece of 

In [3]:
print(train_x_tensor.shape)

torch.Size([900, 7277])


In [4]:
train_x_tensor[1].shape

torch.Size([7277])

In [5]:
print(data.iloc(0))

<pandas.core.indexing._iLocIndexer object at 0x000001E5BB763DE0>


In [6]:
dl = DataLoader(train_y_tensor, batch_size=5,num_workers=2)
it = iter(dl)
print(next(it))

tensor([0, 1, 0, 1, 1])


In [7]:
class ConcatDataset(torch.utils.data.Dataset):
    def __init__(self, datasetA, datasetB):
        self.datasetA = datasetA
        self.datasetB = datasetB

    def __getitem__(self, i):
        inp = self.datasetA[i]
        label = F.one_hot(self.datasetB[i],num_classes=2)
        return inp,label

    def __len__(self):
        return min(len(self.datasetA),len(self.datasetB))

In [8]:
train_ds = ConcatDataset(train_x_tensor,train_y_tensor)
val_ds = ConcatDataset(validation_x_tensor,validation_y_tensor)
train_loader = DataLoader(train_ds,batch_size=5)
val_loader = DataLoader(val_ds,batch_size=5)

In [9]:
class ANN(nn.Module):
    def __init__(self,num_classes):
        super().__init__()
        
        self.act = nn.LeakyReLU()

        # input: 7277        
        self.fc1 = nn.Linear(in_features=7277, out_features=1000) 
        self.fc2 = nn.Linear(in_features=1000, out_features=100)
        self.fc3 = nn.Linear(in_features=100, out_features=10)
        self.fc4 = nn.Linear(in_features=10, out_features=2)
        self.Softmax = nn.Softmax(dim=1)


    
    def forward(self, x):
        #FC Layer 1
        x = self.fc1(x)
        x = self.act(x)

        #FC Layer 2
        x = self.fc2(x)   
        x = self.act(x)    

        #FC Layer 3
        x = self.fc3(x)   
        x = self.act(x)    

        #FC Layer 4
        x = self.fc4(x)    
        
        #Softmax
        out = self.Softmax(x)

        return out

In [10]:
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs):
    best_val_loss = float('inf')
    best_model = model

    for epoch in range(num_epochs):
        print(f"Starting epoch {epoch+1} of {num_epochs}")
        model.train()
        train_loss = 0.0
        for batch_nr, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device).to(torch.float)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss = loss.to(device)

            if (batch_nr%20 == 0):
                print(f"Processing batch number {batch_nr+1} of {len(train_loader)}")
                print("current loss",loss.item())
                
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * inputs.size(0)
        
        train_loss /= len(train_loader.dataset)
    
    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device).to(torch.float)
            outputs = model(inputs)
        
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
    
    val_loss /= len(val_loader.dataset)
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Save the best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
    return best_model

In [11]:
LEARNING_RATE = 0.0001
EPOCHS = 3

print(device)
model = ANN(num_classes=2).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Train the model
trained_model = train_model(model, criterion, optimizer, train_loader, val_loader, EPOCHS)

cpu
Starting epoch 1 of 3
Processing batch number 1 of 180
current loss 0.6915819048881531
Processing batch number 21 of 180
current loss 0.6912641525268555
Processing batch number 41 of 180
current loss 0.6910771131515503
Processing batch number 61 of 180
current loss 0.6987403631210327
Processing batch number 81 of 180
current loss 0.6911470293998718
Processing batch number 101 of 180
current loss 0.6876646876335144
Processing batch number 121 of 180
current loss 0.6901980638504028
Processing batch number 141 of 180
current loss 0.6901556253433228
Processing batch number 161 of 180
current loss 0.6938864588737488
Starting epoch 2 of 3
Processing batch number 1 of 180
current loss 0.6878660917282104
Processing batch number 21 of 180
current loss 0.685591459274292
Processing batch number 41 of 180
current loss 0.6769968271255493
Processing batch number 61 of 180
current loss 0.6796138882637024
Processing batch number 81 of 180
current loss 0.6590439677238464
Processing batch number 101

In [13]:
while True:
    inp = input("Leave a review:")
    if inp=="exit":
        break
    tdata = word_vectorizer.transform([" ".join(word_tokenize(inp))])
    tdata = torch.Tensor(tdata.todense())
    tdata = torch.from_numpy(np.array(tdata)).type(torch.FloatTensor)
    print(tdata.shape)
    model.to(device)
    pred = model(tdata).tolist()
    if(pred[0][0]>0.5):
        print("You seem to dislike this thing")
    else:
        print("You seem to like this thing")

torch.Size([1, 7277])
You seem to dislike this thing
torch.Size([1, 7277])
You seem to like this thing
