In [0]:
import torch
from torchtext import data
from torchtext import datasets
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
import sklearn as sk
import glob
import dill

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from nltk import word_tokenize
import nltk
# Fetch NLTK's 'punkt' resource at import time.
# presumably needed by word_tokenize (used in breakEssay) — TODO confirm
nltk.download('punkt')
import re

import spacy
# spaCy pipeline registered under the name 'en'. In the visible cells only its
# tokenizer is used (nlp.tokenizer in extract_sentimentfeatures).
nlp = spacy.load('en')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [0]:
# Root folder holding the saved fields, model weights and the review CSV.
# File names are appended with plain string concatenation, so the trailing
# slash is required.
PATH = "/content/drive/My Drive/NNFL/"

In [0]:
# Restore the pickled text fields; later cells read their vocabularies
# (token -> index maps and pad tokens).
# NOTE(review): torch.load with pickle_module=dill unpickles arbitrary objects,
# so these files must come from a trusted source.
def _load_pickled(relative_path):
    """Unpickle the object stored at PATH + relative_path using dill."""
    return torch.load(PATH + relative_path, pickle_module=dill)

TEXT = _load_pickled('TEXT.pt')
txt_field_AGR = _load_pickled('Personality/AGR_txt.pt')
txt_field_CON = _load_pickled('Personality/CON_txt.pt')
txt_field_EXT = _load_pickled('Personality/EXT_txt.pt')
txt_field_NEU = _load_pickled('Personality/NEU_txt.pt')
txt_field_OPN = _load_pickled('Personality/OPN_txt.pt')

# Float-typed label field (not referenced again in the visible cells).
LABEL = data.LabelField(dtype=torch.float)

In [0]:
class sentimentCNN(nn.Module):
    """Two-stage 1-D CNN producing one raw score per input sequence.

    Pipeline (B = batch size):
      1. Embed the token ids and zero-pad the sequence axis to PADDED_LEN.
      2. Two parallel Conv1d branches (kernel sizes 4 and 5, 150 channels each),
         each followed by ReLU and a max over all positions.
      3. The 150 channel maxima of a branch are treated as a single-channel
         signal of length 150, max-pooled by 2 (-> 75), passed through
         ``conv2`` (1 -> 100 channels, kernel 3) with ReLU, and max-pooled
         over all positions, giving 100 values per branch.
      4. Both branches are concatenated (200 values) and passed through
         ``fc1`` (200 -> 100) and ``fc2`` (100 -> 1). No non-linearity is
         applied between or after the two linear layers.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        dropout: accepted for interface compatibility; not used by this class.
        pad_idx: index passed to ``nn.Embedding`` as ``padding_idx``.
    """

    # Every sequence is zero-padded to this many positions before the convolutions.
    PADDED_LEN = 3000

    def __init__(self, vocab_size, embedding_dim, dropout, pad_idx):
        super().__init__()
        # Layer names and creation order are kept so saved state dicts still load.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = embedding_dim, out_channels = 150, kernel_size = fs) for fs in [4,5]])
        self.conv2 = nn.Conv1d(in_channels = 1, out_channels = 100, kernel_size = 3)
        self.fc1 = nn.Linear(200, 100)
        self.fc2 = nn.Linear(100,1)

    def forward(self, text):
        """Score a batch of token-id sequences.

        Args:
            text: integer tensor of shape (B, seq_len) with seq_len <= PADDED_LEN.

        Returns:
            Tensor of shape (B, 1) holding the output of ``fc2``.

        Raises:
            ValueError: if seq_len exceeds PADDED_LEN.
        """
        # (B, seq_len, emb) -> (B, emb, seq_len): Conv1d expects channels first.
        embedded = self.embedding(text).permute(0, 2, 1)
        batch_size = embedded.size(0)
        sent_len = embedded.size(2)

        if sent_len > self.PADDED_LEN:
            raise ValueError(
                "sequence length %d exceeds the supported maximum of %d tokens"
                % (sent_len, self.PADDED_LEN)
            )

        # Right-pad the sequence axis with zeros. F.pad keeps the channel count,
        # dtype and device of `embedded`, so any embedding_dim works.
        zcat = F.pad(embedded, (0, self.PADDED_LEN - sent_len))

        branch_features = []
        for conv in self.convs:
            fmap = F.relu(conv(zcat))                    # (B, 150, positions)
            fmap = F.max_pool1d(fmap, fmap.shape[2])     # (B, 150, 1)
            fmap = fmap.permute(0, 2, 1)                 # (B, 1, 150)
            fmap = F.max_pool1d(fmap, 2)                 # (B, 1, 75)
            fmap = F.relu(self.conv2(fmap))              # (B, 100, 73)
            branch_features.append(F.max_pool1d(fmap, fmap.shape[2]))  # (B, 100, 1)

        final = torch.cat(branch_features, dim = 1).reshape(batch_size, 200)
        full1 = self.fc1(final)
        return self.fc2(full1)

class PersonalityCNN(nn.Module):
    """Two-stage 1-D CNN producing one raw score per input sequence.

    Pipeline (B = batch size):
      1. Transpose the input to (B, seq_len), embed it and zero-pad the
         sequence axis to PADDED_LEN.
      2. Three parallel Conv1d branches (kernel sizes 3, 4, 5; 80 channels
         each), each followed by ReLU and a max over all positions.
      3. The 80 channel maxima of a branch are treated as a single-channel
         signal of length 80, max-pooled by 2 (-> 40), passed through
         ``conv2`` (1 -> 100 channels, kernel 2) with ReLU, and max-pooled
         over all positions, giving 100 values per branch.
      4. The three branches are concatenated (300 values) and passed through
         ``fc1`` (300 -> 80) and ``fc2`` (80 -> 1).

    ``self.dropout`` is constructed but never applied in ``forward``.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        pad_idx: index passed to ``nn.Embedding`` as ``padding_idx``.
    """

    # Every sequence is zero-padded to this many positions before the convolutions.
    PADDED_LEN = 3000

    def __init__(self,vocab_size,embedding_dim,pad_idx):
        super().__init__()
        # Layer names and creation order are kept so saved state dicts still load.
        self.embedding = nn.Embedding(vocab_size,embedding_dim,padding_idx = pad_idx)
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = embedding_dim,out_channels = 80,kernel_size = fs) for fs in (3,4,5)])
        self.conv2 = nn.Conv1d(in_channels = 1,out_channels = 100,kernel_size = (2))
        self.fc1 = nn.Linear(300,80)
        self.fc2 = nn.Linear(80,1)
        self.dropout = nn.Dropout(0.5)

    def forward(self,text):
        """Score a batch of token-id sequences.

        Args:
            text: integer tensor of shape (seq_len, B) with seq_len <= PADDED_LEN;
                it is transposed to (B, seq_len) before the embedding lookup.

        Returns:
            Tensor of shape (B, 1) holding the output of ``fc2``.

        Raises:
            ValueError: if seq_len exceeds PADDED_LEN.
        """
        # (seq_len, B) -> (B, seq_len) -> (B, seq_len, emb) -> (B, emb, seq_len)
        embedded = self.embedding(text.T).permute(0,2,1)
        batch_size = embedded.size(0)
        seq_len = embedded.size(2)

        if seq_len > self.PADDED_LEN:
            raise ValueError(
                "sequence length %d exceeds the supported maximum of %d tokens"
                % (seq_len, self.PADDED_LEN)
            )

        # Right-pad the sequence axis with zeros. F.pad keeps the channel count,
        # dtype and device of `embedded`, so any embedding_dim works.
        zcat = F.pad(embedded, (0, self.PADDED_LEN - seq_len))

        branch_features = []
        for conv in self.convs:
            fmap = F.relu(conv(zcat))                    # (B, 80, positions)
            fmap = F.max_pool1d(fmap, fmap.shape[2])     # (B, 80, 1)
            fmap = fmap.permute(0,2,1)                   # (B, 1, 80)
            fmap = F.max_pool1d(fmap, 2)                 # (B, 1, 40)
            fmap = F.relu(self.conv2(fmap))              # (B, 100, 39)
            branch_features.append(F.max_pool1d(fmap, fmap.shape[2]))  # (B, 100, 1)

        final = torch.cat(branch_features, dim = 1).reshape(batch_size, 300)
        full1 = self.fc1(final)
        return self.fc2(full1)


In [0]:
# Build the six networks and move each one to `device`. The weights are still
# randomly initialised here; the trained parameters are loaded in a later cell.

# Positional arguments: vocab_size=10002, embedding_dim=300, dropout=0.5, pad_idx=1.
sentimentModel = sentimentCNN(10002, 300, 0.5, 1).to(device)

# All five personality networks share the same embedding width.
embedding_dim = 300

# For each trait, vocabulary size and padding index come from that trait's field.
input_dim_agr = len(txt_field_AGR.vocab)
pad_idx = txt_field_AGR.vocab.stoi[txt_field_AGR.pad_token]
modelAGR = PersonalityCNN(input_dim_agr, embedding_dim, pad_idx).to(device)

input_dim_con = len(txt_field_CON.vocab)
pad_idx = txt_field_CON.vocab.stoi[txt_field_CON.pad_token]
modelCON = PersonalityCNN(input_dim_con, embedding_dim, pad_idx).to(device)

input_dim_ext = len(txt_field_EXT.vocab)
pad_idx = txt_field_EXT.vocab.stoi[txt_field_EXT.pad_token]
modelEXT = PersonalityCNN(input_dim_ext, embedding_dim, pad_idx).to(device)

input_dim_neu = len(txt_field_NEU.vocab)
pad_idx = txt_field_NEU.vocab.stoi[txt_field_NEU.pad_token]
modelNEU = PersonalityCNN(input_dim_neu, embedding_dim, pad_idx).to(device)

input_dim_opn = len(txt_field_OPN.vocab)
pad_idx = txt_field_OPN.vocab.stoi[txt_field_OPN.pad_token]
modelOPN = PersonalityCNN(input_dim_opn, embedding_dim, pad_idx).to(device)

# Bare expression so the notebook still echoes the last model's layout.
modelOPN



PersonalityCNN(
  (embedding): Embedding(10002, 300, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(300, 80, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 80, kernel_size=(4,), stride=(1,))
    (2): Conv1d(300, 80, kernel_size=(5,), stride=(1,))
  )
  (conv2): Conv1d(1, 100, kernel_size=(2,), stride=(1,))
  (fc1): Linear(in_features=300, out_features=80, bias=True)
  (fc2): Linear(in_features=80, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [0]:
# Capture the output of every sub-module of the pre-trained models through
# forward hooks, so intermediate layers (e.g. 'fc1') can be read after a
# forward pass.
def _activation_recorder(store):
    """Return a hook factory whose hooks save detached module outputs in `store`.

    The factory takes a module name and returns a forward hook that writes
    ``output.detach()`` to ``store[name]`` each time the module runs.
    """
    def factory(name):
        def hook(model, input, output):
            store[name] = output.detach()
        return hook
    return factory

# One dictionary per network, keyed by module name.
sentimentActivation = {}
AGRActivation = {}
CONActivation = {}
EXTActivation = {}
NEUActivation = {}
OPNActivation = {}

# Hook factories, one per network, each bound to its own dictionary.
get_sentiactivation = _activation_recorder(sentimentActivation)
get_AGRactivation = _activation_recorder(AGRActivation)
get_CONactivation = _activation_recorder(CONActivation)
get_EXTactivation = _activation_recorder(EXTActivation)
get_NEUactivation = _activation_recorder(NEUActivation)
get_OPNactivation = _activation_recorder(OPNActivation)

# The sentiment network is hooked silently.
for name, layer in sentimentModel.named_modules():
    layer.register_forward_hook(get_sentiactivation(name))

# The personality networks also print each module as it is hooked.
for _model, _make_hook in (
    (modelAGR, get_AGRactivation),
    (modelCON, get_CONactivation),
    (modelEXT, get_EXTactivation),
    (modelNEU, get_NEUactivation),
    (modelOPN, get_OPNactivation),
):
    for name, layer in _model.named_modules():
        print(layer)
        layer.register_forward_hook(_make_hook(name))

PersonalityCNN(
  (embedding): Embedding(10002, 300, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(300, 80, kernel_size=(3,), stride=(1,))
    (1): Conv1d(300, 80, kernel_size=(4,), stride=(1,))
    (2): Conv1d(300, 80, kernel_size=(5,), stride=(1,))
  )
  (conv2): Conv1d(1, 100, kernel_size=(2,), stride=(1,))
  (fc1): Linear(in_features=300, out_features=80, bias=True)
  (fc2): Linear(in_features=80, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
Embedding(10002, 300, padding_idx=1)
ModuleList(
  (0): Conv1d(300, 80, kernel_size=(3,), stride=(1,))
  (1): Conv1d(300, 80, kernel_size=(4,), stride=(1,))
  (2): Conv1d(300, 80, kernel_size=(5,), stride=(1,))
)
Conv1d(300, 80, kernel_size=(3,), stride=(1,))
Conv1d(300, 80, kernel_size=(4,), stride=(1,))
Conv1d(300, 80, kernel_size=(5,), stride=(1,))
Conv1d(1, 100, kernel_size=(2,), stride=(1,))
Linear(in_features=300, out_features=80, bias=True)
Linear(in_features=80, out_features=1, bias=True)
Dropout(p=0.5

In [0]:
def extract_sentimentfeatures(model,sentence,min_len=5):
    """Run `sentence` through `model` and return the captured 'fc1' activation.

    The sentence is tokenised with ``nlp.tokenizer``, truncated to 3000 tokens,
    padded with '<pad>' up to `min_len` tokens, mapped to ids through
    ``TEXT.vocab.stoi`` and fed to the model as a batch of one.

    The return value is read from the module-level ``sentimentActivation``
    dictionary, which is filled by the forward hooks registered on
    ``sentimentModel``. If `model` is a network without those hooks, the entry
    returned is whatever was stored by an earlier call.

    Args:
        model: network to run; switched to eval mode.
        sentence: text to featurise.
        min_len: minimum number of tokens after padding.

    Returns:
        ``sentimentActivation['fc1']``; for ``sentimentCNN`` this is a detached
        tensor of shape (1, 100).
    """
    model.eval()
    tokens = [tok.text for tok in nlp.tokenizer(sentence)][:3000]
    if len(tokens) < min_len:
        tokens += ['<pad>'] * (min_len - len(tokens))
    indexed = [TEXT.vocab.stoi[t] for t in tokens]
    batch = torch.tensor(indexed, dtype=torch.long, device=device).unsqueeze(0)
    # Only the detached hook output is used, so no autograd graph is needed.
    with torch.no_grad():
        model(batch)
    return sentimentActivation['fc1']

def tokenize(s):
    """Split `s` into tokens.

    A token is either a run of word characters and apostrophes, or a single
    one of the marks . , ! ? ;  — every other character is discarded.
    """
    return [match.group() for match in re.finditer(r"[\w']+|[.,!?;]", s)]
    
def breakEssay(essay):
    """Split `essay` into chunks that end at '.', '!' or '?' or after 50 tokens.

    The essay is tokenised with ``word_tokenize``. Tokens are collected until a
    terminator ('.', '!' or '?') or the 50th token of the current chunk is
    reached; the chunk, including that token, is then emitted as the tokens
    joined by single spaces. A terminator that arrives while the current chunk
    is empty is dropped.

    Tokens left over after the last emitted chunk are returned as a final
    chunk. (They used to be silently discarded.)

    Args:
        essay: text to split.

    Returns:
        List of chunk strings. If no chunk was emitted at all, the list holds
        the original `essay` string unchanged.
    """
    terminators = (".", "!", "?")
    max_tokens = 50

    chunks = []
    pending = []  # tokens of the chunk currently being built
    for word in word_tokenize(essay):
        closes_chunk = word in terminators or len(pending) + 1 == max_tokens
        if not closes_chunk:
            pending.append(word)
            continue
        if pending:
            chunks.append(" ".join(pending + [word]))
            pending = []
        # else: a terminator with nothing before it is skipped

    if not chunks:
        # Nothing was split off: hand back the text as it came in.
        chunks.append(essay)
    elif pending:
        # Keep the words that follow the last terminator / 50-token cut.
        chunks.append(" ".join(pending))
    return chunks

def extract_personalityfeatures(text,model,txt,keydict,min_len=5):
    """Run `text` through `model` and return the captured 'fc1' activation.

    The text is split with ``tokenize``, truncated to 3000 tokens, padded with
    '<pad>' up to `min_len` tokens, mapped to ids through ``txt.vocab.stoi``
    and fed to the model as a (seq_len, 1) tensor.

    Args:
        text: text to featurise.
        model: network to run; switched to eval mode.
        txt: field whose ``vocab.stoi`` maps tokens to ids.
        keydict: dictionary that the model's forward hooks write to; its 'fc1'
            entry is returned after the forward pass.
        min_len: minimum number of tokens after padding.

    Returns:
        ``keydict['fc1']``; for ``PersonalityCNN`` with its hooks registered
        this is a detached tensor of shape (1, 80).
    """
    model.eval()
    tokens = tokenize(text)[:3000]
    if len(tokens) < min_len:
        tokens += ['<pad>'] * (min_len - len(tokens))
    indexed = [txt.vocab.stoi[t] for t in tokens]
    # Column layout (seq_len, batch=1): this is what the model receives.
    column = torch.tensor(indexed, dtype=torch.long, device=device).unsqueeze(1)
    # Only the hook output is used, so no autograd graph is needed.
    with torch.no_grad():
        model(column)
    return keydict['fc1']

In [0]:
# Load the trained weights into the six networks.
# NOTE(review): pickle_module=dill unpickles arbitrary objects, so these
# checkpoint files must come from a trusted source.
def _load_weights(model, relative_path):
    """Load the state dict stored at PATH + relative_path into `model`.

    ``map_location=device`` remaps the stored tensors onto the device chosen
    for this session, so a checkpoint saved from a GPU run can also be loaded
    when only the CPU is available.

    Returns the result of ``model.load_state_dict`` (the key-matching report).
    """
    state = torch.load(PATH + relative_path, pickle_module=dill, map_location=device)
    return model.load_state_dict(state)

_load_weights(sentimentModel, 'sentimentModel.pt')
_load_weights(modelAGR, 'Personality/AGR.pt')
_load_weights(modelCON, 'Personality/CON.pt')
_load_weights(modelEXT, 'Personality/EXT.pt')
_load_weights(modelNEU, 'Personality/NEU.pt')
# Last expression of the cell: the notebook echoes its key-matching report.
_load_weights(modelOPN, 'Personality/OPN.pt')

<All keys matched successfully>

In [0]:
# Amazon reviews table. The cells below read its 'Reviews', 'Stars' and 'Type' columns.
_reviews_csv = PATH + 'reviews_w_stars.csv'
df = pd.read_csv(_reviews_csv)

In [0]:
# Extract sentiment and personality features from every Amazon review.
# Each review yields one row: the sentiment network's 'fc1' activation followed
# by the 'fc1' activations of the AGR, CON, EXT, NEU and OPN networks, in that
# order, concatenated along axis 1.
_personality_extractors = (
    (modelAGR, txt_field_AGR, AGRActivation),
    (modelCON, txt_field_CON, CONActivation),
    (modelEXT, txt_field_EXT, EXTActivation),
    (modelNEU, txt_field_NEU, NEUActivation),
    (modelOPN, txt_field_OPN, OPNActivation),
)

feat = []
for _raw_review in df['Reviews']:
    # Convert once per review; non-string cells (e.g. missing values) become text.
    _review_text = str(_raw_review)
    _parts = [extract_sentimentfeatures(sentimentModel, _review_text).cpu().numpy()]
    for _trait_model, _trait_field, _trait_store in _personality_extractors:
        _parts.append(
            extract_personalityfeatures(_review_text, _trait_model, _trait_field, _trait_store)
            .cpu()
            .numpy()
        )
    feat.append(np.concatenate(_parts, axis = 1))

In [0]:
# Stack the per-review feature rows into a table with 500 feature columns
# (labelled 0-499), then attach the star rating as column 500 and the review
# type as column 501.
_feature_matrix = np.array(feat).reshape(-1, 500)
featdf = pd.DataFrame(_feature_matrix)
featdf[500], featdf[501] = df['Stars'], df['Type']

           0         1         2         3    ...       498       499  500  501
0     0.208791  1.040853 -1.041554 -1.404897  ... -0.269785 -0.034902    1    1
1    -0.409257 -4.044721 -1.625169  2.561371  ... -0.498094  0.863928    1    1
2     0.127271 -3.306749 -1.006750  1.578318  ... -0.649528  0.135815    1    1
3     0.288411 -2.402842 -1.291371  0.591093  ...  0.063336 -0.097582    1    1
4    -0.068595 -0.579789 -0.773463 -0.013373  ... -0.388713  0.233158    5    1
...        ...       ...       ...       ...  ...       ...       ...  ...  ...
1249  0.252947  3.535869 -0.279142 -2.632548  ... -0.208582  0.472243    4    0
1250  0.241757  2.252195 -1.353155 -2.207019  ... -0.157103  0.509930    5    0
1251  0.813260  8.406722 -1.591205 -6.307390  ... -0.449360  0.632835    5    0
1252  0.345078  7.167039 -1.121790 -5.279661  ... -0.445722  0.585099    5    0
1253  0.069935  2.689093 -0.344016 -1.276075  ... -0.421279  0.679173    5    0

[1254 rows x 502 columns]


In [0]:
# Support-vector classifier on the extracted features.
# Predictors are the first 501 columns (the 500 network features plus the star
# rating in column 500); the target is column 501 (review type).
X, y = featdf.iloc[:, :501], featdf[501]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
clf = SVC(gamma = 'scale').fit(X_train, y_train)

# Evaluate on the held-out 10 %.
y_pred = clf.predict(X_test)
for _heading, _report in (
    ('Confusion Matrix:', confusion_matrix(y_test, y_pred)),
    ('Classification Report:', classification_report(y_test, y_pred)),
):
    print(_heading)
    print(_report)
print('Accuracy:', accuracy_score(y_test, y_pred))

Confusion Matrix:
[[73  5]
 [10 38]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91        78
           1       0.88      0.79      0.84        48

    accuracy                           0.88       126
   macro avg       0.88      0.86      0.87       126
weighted avg       0.88      0.88      0.88       126

Accuracy: 0.8809523809523809
