In [1]:
# imports and config
import pandas as pd
import string
import nltk
import re
from num2words import num2words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\darko\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\darko\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# loading the data
df = pd.read_excel('communication_data.xlsm')

In [3]:
# lowercasing
df['Content'] = df['Content'].apply(lambda word: " ".join(word.lower() for word in word.split()))

In [4]:
# concatenating and reorganizing data
df = df.groupby('NegotiationID').agg({
    'NegoOutcome': 'first',
    'Content': lambda x: ' '.join(x[:-1])
}).reset_index()

In [5]:
# tokenization

# regex pattern to split words connected by punctuation
split_punctuation = string.punctuation + '€'
split_punctuation_pattern = r'\w*(?:['+split_punctuation+']+\w*)+'
number_pattern = r'\d+'

def split_connected_words(text):
    split_text = re.sub(split_punctuation_pattern, lambda x: re.sub(r'['+split_punctuation+']+', lambda y: ' ' + y.group(0) + ' ', x.group(0)), text)
    return split_text

def convert_numeric(text):
    converted_text = re.sub(number_pattern, lambda  x: num2words(int(x.group(0))), text)
    return converted_text
    
def tokenize_row(text):
    return word_tokenize(text)

df['Content'] = df['Content'].apply(split_connected_words)
df['Content'] = df['Content'].apply(convert_numeric)
df['tokenized_content'] = df['Content'].apply(tokenize_row)

In [6]:
# normalization

# removing punctuation
exclude_punctuation = '!?$%'
custom_punctuation = ''.join([char for char in string.punctuation if char not in exclude_punctuation])

def remove_punctuation(tokens):
    return [token for token in tokens if token not in custom_punctuation]

df['tokenized_content'] = df['tokenized_content'].apply(remove_punctuation)

# removing stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

df['tokenized_content'] = df['tokenized_content'].apply(remove_stopwords)


# convert numeric characters to text
def convert_numeric(tokens):
    updated_tokens = []
    for token in tokens:
        if token.isdigit():
            written_word = num2words(int(token))
            updated_tokens.append(written_word)
        else:
            updated_tokens.append(token)
    return updated_tokens

df['tokenized_content'] = df['tokenized_content'].apply(convert_numeric)

In [7]:
# lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_row(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

df['tokenized_content'] = df['tokenized_content'].apply(lemmatize_row)

In [11]:
print(df)

     NegotiationID  NegoOutcome  \
0               55  FinalAccept   
1               61  FinalAccept   
2               62  FinalReject   
3               63  FinalReject   
4               64  FinalAccept   
..             ...          ...   
618          68196  FinalAccept   
619          68224  FinalAccept   
620          68252  FinalAccept   
621          68280  FinalAccept   
622          68308  FinalAccept   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [None]:
# take tokenized texts, build vocabulary and find maximum sentence length
max_length = 0
tokenized_texts = []
word2index = {}

# add indexes for padding and unknown tokens
word2index['<pad>'] = 0
word2index['<unk>'] = 1

# build vocab from corpus starting from index 2
index = 2
for row in range(len(df)):
    # store every sample in an array
    tokenized_sample = df.iloc[i]['tokenized_content']
    tokenized_texts.append(tokenized_sample)
    
    # add new tokens to vocabulary
    for token in tokenized_sample:
        if token not in word2index:
            word2index[token] = index
            index += 1
    max_length = max(max_length, len(tokenized_sample))
    
    



    


In [8]:
# load word embeddings
word2vec_path = 'GoogleNews-vectors-negative300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [9]:
# mapping tokens to word embeddings
embedding_dimension = 300

def map_token_to_embedding(tokens):
    token_to_embedding = {}
    
    for token in tokens:
        if token in word2vec_model:
            token_to_embedding[token] = word2vec_model[token]
        else:
            token_to_embedding[token] = [0.0] * embedding_dimension
    return token_to_embedding

df['token_to_embedding'] = df['tokenized_content'].apply(map_token_to_embedding)