# Imports & Configuration

In [None]:
!pip install wordsegment
!pip install num2words
!pip install skorch

import os
import string
import nltk
import torch
import re
import spacy
import time
import torch
import torch.nn as nn
import random
import numpy as np
import seaborn as sns
import pandas as pd
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import wordsegment
import sklearn
from google.colab import drive
from imblearn.over_sampling import SMOTE
from skorch.callbacks import Callback
from imblearn.over_sampling import ADASYN
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.optim import SGD
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from skorch import NeuralNetClassifier
from tqdm import tqdm_notebook
from skorch.callbacks import EpochScoring
from num2words import num2words
from sklearn.metrics import f1_score, recall_score, precision_score, make_scorer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors
from scipy.stats import randint
from gensim.models import FastText
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from skorch.callbacks import EarlyStopping
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

%matplotlib inline

nltk.download("all")
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

pd.set_option('display.max_colwidth', None)

In [2]:
# setting the seed for reproducability
def set_seed(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

set_seed(42)

In [3]:
# switch to gpu if available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [4]:
# connect to the drive, remove if used on local device
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preparation

## Loading Data

In [6]:
# Loading communication data, change this path when using this code on your local device
df = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/communication_data.xlsm')

## Cleaning Data

In [7]:
# lowercasing
df['Content'] = df['Content'].apply(lambda word: " ".join(word.lower() for word in word.split()))

In [8]:
# concatenating and reorganizing data
df = df.groupby('NegotiationID').agg({
    'NegoOutcome': 'first',
    'Content': lambda x: ' '.join(x)
}).reset_index()

In [9]:
# Tokenization
spacy.require_gpu()
nlp = spacy.load("en_core_web_sm")

# splitting words connected by punctuation
split_punctuation = string.punctuation + '€'
split_punctuation_pattern = r'\w*(?:['+split_punctuation+']+\w*)+'
split_appended_pattern = pattern = r'(?<=[a-z])(?=[A-Z])'

# converting numberes to text form
number_pattern = r'\d+'

# segmenting words that have been accidentally written together
wordsegment.load()

def split_connected_words(text):
    split_text = re.sub(split_punctuation_pattern, lambda x: re.sub(r'['+split_punctuation+']+', lambda y: ' ' + y.group(0) + ' ', x.group(0)), text)
    return split_text

def convert_numeric(text):
    converted_text = re.sub(number_pattern, lambda x: num2words(int(x.group(0))), text)
    return converted_text

def word_segment(token_list):
    segmented_words = []
    for word in token_list:
        segmented_words.extend(wordsegment.segment(word))
    return segmented_words

def tokenize_row(text):
    return word_tokenize(text)

# applying changes and tokenizing
df['Content'] = df['Content'].apply(split_connected_words)
df['Content'] = df['Content'].apply(convert_numeric)
df['Content'] = df['Content'].apply(tokenize_row)
df['Content'] = df['Content'].apply(word_segment)

In [10]:
# Normalization

# removing defined punctuation
exclude_punctuation = '!?$%'
custom_punctuation = ''.join([char for char in string.punctuation if char not in exclude_punctuation])

# removing stopwords
stop_words = set(stopwords.words('english'))

def remove_punctuation(tokens):
    return [token for token in tokens if token not in custom_punctuation]

def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

# applying changes
df['Content'] = df['Content'].apply(remove_punctuation)
df['Content'] = df['Content'].apply(remove_stopwords)

In [11]:
# lemmatization
spacy.require_gpu()
spacy_lemmatizer = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def lemmatize_row(tokens):
    return [token.lemma_ for token in nlp(" ".join(tokens))]


df['Content'] = df['Content'].apply(lemmatize_row)

## Creating Embeddings

In [12]:
# storing all sample labels in ordered array
labels = []
for index, row in df.iterrows():
    if 'FinalAccept' in str(row.values):
        labels.append(0)
    else:
        labels.append(1)

# storing all tokenized samples in ordered array, building vocabulary and finding maximum sentence length
tokenized_samples = []
word2index = {}
max_length = 0

# adding indexes for padding and unknown tokens for vocabulary
word2index['<pad>'] = 0
word2index['<unk>'] = 1

index = 2
for row in range(len(df)):
    tokenized_sample = df.iloc[row]['Content']
    tokenized_samples.append(tokenized_sample)

    # add new tokens to vocabulary
    for token in tokenized_sample:
        if token not in word2index:
            word2index[token] = index
            index += 1
    max_length = max(max_length, len(tokenized_sample))

In [13]:
# padding each sequence to the maximum sentence length
# storing all tokenized samples in an ordered array, with indixes as tokens
indexed_samples = []
for tokenized_sample in tokenized_samples:

    tokenized_sample += ['<pad>'] * (max_length - len(tokenized_sample))

    indexed_sample = [word2index.get(token) for token in tokenized_sample]
    indexed_samples.append(indexed_sample)

In [14]:
# loading pretrained vectors and creating embeddings with FastText
def load_pretrained_vectors(word2index, vectorfile, target_dim):
    file = open(vectorfile, 'r', encoding='utf-8', newline="\n", errors="ignore")
    n, d = map(int, file.readline().split())

    # initializing random embeddings
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2index), d))
    embeddings[word2index['<pad>']] = np.zeros((d,))

    words_found = []

    # loading pretrained vectors
    count = 0
    for line in file:
        tokens = line.rstrip().split(' ')
        word = tokens[0]
        if word in word2index:
            count += 1
            embeddings[word2index[word]] = np.array(tokens[1:], dtype=np.float32)

    print(f"There are {count} / {len(word2index)} pretrained vectors found.")

    return embeddings

target_dim = 300
embeddings = load_pretrained_vectors(word2index, '/content/drive/MyDrive/Colab Notebooks/crawl-300d-2M-subword.vec', target_dim)
embeddings = torch.tensor(embeddings)


''' Use this for Word2Vec instead
def load_pretrained_vectors(word2index, model_file, target_dim):
    # Load the Word2Vec model
    word2vec_model = KeyedVectors.load_word2vec_format(model_file, binary=True)

    # Get the dimension of the embeddings
    d = word2vec_model.vector_size

    # Initialize random embeddings
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2index), d))
    embeddings[word2index['<pad>']] = np.zeros((d,))

    words_found = []

    # Load pretrained vectors
    count = 0
    for word, index in word2index.items():
        if word in word2vec_model:
            count += 1
            embeddings[index] = word2vec_model[word]

    print(f"There are {count} / {len(word2index)} pretrained vectors found.")

    return embeddings

target_dim = 300
embeddings = load_pretrained_vectors(word2index, '/content/drive/MyDrive/Colab Notebooks/GoogleNews-vectors-negative300.bin', target_dim)
embeddings = torch.tensor(embeddings)
'''

There are 9936 / 10195 pretrained vectors found.


' Use this for Word2Vec instead\ndef load_pretrained_vectors(word2index, model_file, target_dim):\n    # Load the Word2Vec model\n    word2vec_model = KeyedVectors.load_word2vec_format(model_file, binary=True)\n\n    # Get the dimension of the embeddings\n    d = word2vec_model.vector_size\n\n    # Initialize random embeddings\n    embeddings = np.random.uniform(-0.25, 0.25, (len(word2index), d))\n    embeddings[word2index[\'<pad>\']] = np.zeros((d,))\n\n    words_found = []\n\n    # Load pretrained vectors\n    count = 0\n    for word, index in word2index.items():\n        if word in word2vec_model:\n            count += 1\n            embeddings[index] = word2vec_model[word]\n\n    print(f"There are {count} / {len(word2index)} pretrained vectors found.")\n\n    return embeddings\n\ntarget_dim = 300\nembeddings = load_pretrained_vectors(word2index, \'/content/drive/MyDrive/Colab Notebooks/GoogleNews-vectors-negative300.bin\', target_dim)\nembeddings = torch.tensor(embeddings)\n'




# Long Short Term Memory Network

## Hyperparameters

In [15]:
# Initialization of Hyperparameters.
# These parameteres are only for initialization for sklearn
# The model will refer to the defined search space in sklearn for training

embed_dim = 300
max_norm = 5
freeze_embeddings = False
vocab_size = None

# Architecture
dropout_rate = 0.5

# Optimizer
learning_rate = 0.25
rho = 0.9

# Activation function
activation_function = F.relu

# Loss function

# Calculate weights inversely proportional to class frequencies
positive_samples = 518
negative_samples = 105
total_samples = positive_samples + negative_samples

weight_negative = total_samples / (2 * negative_samples)
weight_positive = total_samples / (2 * positive_samples)
class_weights = torch.tensor([weight_negative, weight_positive], dtype=torch.float)
class_weights.to(device)

# Pooling function
pooling_function = F.max_pool1d

# lstm parameters
dimension = 128
num_layers = 1

## Model Definition

In [16]:
# Defining LSTM for Text Classification
class LSTM(nn.Module):

    def __init__(self,
                 pretrained_embedding,
                 freeze_embedding,
                 vocab_size,
                 dropout,
                 max_norm,
                 embed_dim,
                 num_classes,
                 num_layers,
                 dimension):

        super(LSTM, self).__init__()


        # defining embedding layer
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                         freeze=freeze_embedding)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                         embedding_dim=self.embed_dim,
                                         padding_idx=0,
                                         max_norm=self.max_norm)

        self.dimension = dimension

        # defining nn architecture
        self.lstm = nn.LSTM(input_size=self.embed_dim,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=False)

        self.drop = nn.Dropout(p=dropout)

        self.fc = nn.Linear(dimension, num_classes)


    def forward(self, indexed_samples):

        x_embed=None

        # get embeddings from input_ids
        # Output shape: (b, max_len, embed_dim)
        x_embed = self.embedding(indexed_samples).float()

        # receive output
        # ouput shape: (b, max_length, hidden_size)
        output, _ = self.lstm(x_embed)
        last_hidden_state = output[:, -1, :]

        # compute logits
        logits = self.fc(self.drop(last_hidden_state))

        return logits

## Executing, Tuning, and Validating

In [17]:
# initialize model, these parameters will not affect the hyperoptimization loop, sklearn needs an initialized model.
lstm = LSTM(embeddings, freeze_embeddings, vocab_size, dropout_rate, max_norm, embed_dim, 2, num_layers, dimension)

# wrap model in sklearn wrapper for hyperparameter tuning
wrappedModel = NeuralNetClassifier(
    lstm,
    criterion=nn.CrossEntropyLoss(weight=class_weights),
    optimizer= optim.Adadelta,
    max_epochs=20,
    lr=0.1,
    device= device,
    callbacks=[
    EarlyStopping(patience=10, monitor='valid_loss', lower_is_better=True)
    ]
)

# define hyperparameter searchspace, adjust at your will
param_grid = {
    'lr':[0.0001,0.001, 0.01],
    'batch_size':[16,32,64],
    'module__dropout':[0,0.3,0.5],
    'module__pretrained_embedding':[embeddings],
    'module__freeze_embedding':[freeze_embeddings],
    'module__vocab_size':[vocab_size],
    'module__max_norm':[10],
    'module__embed_dim':[embed_dim],
    'module__num_classes':[2],
    'module__num_layers':[num_layers],
    'module__dimension': [32, 64, 128, 256, 512]
}

# define scorers for model results
precision_scorer = make_scorer(precision_score)
recall_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
accuracy_scorer = make_scorer(accuracy_score)

# Define Cross Fold Validation, adjust the number in the next block if changed
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define Random Search and Number of Iterations
random_search = RandomizedSearchCV(wrappedModel, param_grid, cv=cv, scoring={'precision': precision_scorer,'recall': recall_scorer,'f1': f1_scorer, 'accuracy':accuracy_scorer}, refit='f1', verbose=1, n_iter=1)

In [None]:
# perform automatic hyperparameter tuning with RandomSearchCV and print the results
random_search.fit(np.asarray(indexed_samples), np.asarray(labels))

best_score = random_search.best_score_
print("average f1 score of best best configuration over all of its folds: ", best_score)

# Access the average precision from the results
average_precision = random_search.cv_results_['mean_test_precision'][random_search.best_index_]
print("Average Precision for the best hyperparameter selection: ", average_precision)

# Access the average precision from the results
average_recall = random_search.cv_results_['mean_test_recall'][random_search.best_index_]
print("Average Recall for the best hyperparameter selection: ", average_recall)

# Access the average precision from the results
average_accuracy = random_search.cv_results_['mean_test_accuracy'][random_search.best_index_]
print("Average accuracy for the best hyperparameter selection: ", average_accuracy)

# Assuming 10 folds, adjust the number accordingly
f1_scores_for_folds = [random_search.cv_results_[f'split{i}_test_f1'][random_search.best_index_] for i in range(10)]

# Print the F1 scores for each fold
totalscore=0
for fold, f1_score in enumerate(f1_scores_for_folds):
    totalscore+=f1_score
    print(f"Fold {fold + 1} F1 Score: {f1_score}")

print(totalscore/10)

best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

In [None]:
# Save the model if wanted
best_model_crossfold = random_search.best_estimator_

torch.save(best_model_crossfold, '/content/drive/MyDrive/Colab Notebooks/YOUR_MODEL_NAME.pth')