**Load the required packages and data**

In [1]:
# Load the required packages 
import numpy as np
import csv
import spacy
from collections import Counter
import math

2023-04-08 08:19:17.213100: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-08 08:19:17.213143: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-08 08:19:17.916603: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-04-08 08:19:36.148680: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-04-08 08:19:36.162780: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: ca

In [2]:
# Load the data 
with open("data/prototyping_sentences.tsv") as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        print(row)

['sentence    structure   direct_object   indirect_object']
['The driver gave the bracelet to the grandpa', 'DO', 'bracelet', 'grandpa']
['The driver gave the grandpa the bracelet', 'PO', 'bracelet', 'grandpa']
['The veteran gave the flag to the uncle', 'DO', 'flag', 'uncle']
['The veteran gave the uncle the flag', 'PO', 'flag', 'uncle']
['The guy gave the cotton to the soldier', 'DO', 'cotton', 'soldier']
['The guy gave the soldier the cotton', 'PO', 'cotton', 'soldier']
['The astronaut gave the mustard to the boy', 'DO', 'mustard', 'boy']
['The astronaut gave the boy the mustard', 'PO', 'mustard', 'boy']
['The referee gave the clarinet to the commander', 'DO', 'clarinet', 'commander']
['The referee gave the commander the clarinet', 'PO', 'clarinet', 'commander']
['The teenager gave the pliers to the human', 'DO', 'pliers', 'human']
['The teenager gave the human the pliers', 'PO', 'pliers', 'human']
['The boy gave the loaf to the man', 'DO', 'loaf', 'man']
['The boy gave the man the l

###########################################################################################################

In [3]:
def get_sentence_structure(sentence):
    sentence_structure = None
    assert sentence_structure in {'DO', 'PO', None}
    return sentence_structure

###################################################################################################################

In [4]:
nlp = spacy.load('language_modeling_env/lib/python3.7/site-packages/en_core_web_sm/en_core_web_sm-3.5.0')

def classify_sentence_structure(sentence):
    doc = nlp(sentence)
    for token in doc:
        if token.dep_ == 'dobj':
            dobj = token
            print(dobj)
        elif token.dep_ == 'dative':
            iobj = token
            print(iobj)
        elif token.dep_ == 'pobj' and token.head.pos_ == 'ADP':
            prep = token.head
            pobj = token
    if 'dobj' in locals() and 'dative' in locals():
        return 'Double Object'
    elif 'dobj' in locals() and 'pobj' in locals():
        return 'Prepositional Object'
    else:
        return 'Unknown'

In [5]:
SENTENCE = 'The bride gave the lever to the fireman'

classify_sentence_structure(SENTENCE)

lever
to


'Prepositional Object'

###################################################################################################################

In [7]:
text = """
The data scientist hurriedly wrote some code on their Linux workstation to get everything completed before the deadline. 
"""

doc = nlp(text)
text

'\nThe data scientist hurriedly wrote some code on their Linux workstation to get everything completed before the deadline. \n'

In [9]:
[token.dep_ for token in doc]

['dep',
 'det',
 'compound',
 'nsubj',
 'advmod',
 'ROOT',
 'det',
 'dobj',
 'prep',
 'poss',
 'compound',
 'pobj',
 'aux',
 'advcl',
 'dobj',
 'acl',
 'prep',
 'det',
 'pobj',
 'punct',
 'dep']

In [10]:
print("Nouns:", [token.lemma_ for token in doc if token.pos_ == "NOUN"])

Nouns: ['data', 'scientist', 'code', 'workstation', 'deadline']


In [11]:
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])

Noun phrases: ['\nThe data scientist', 'some code', 'their Linux workstation', 'everything', 'the deadline']


###################################################################################################################

# Task 1

In [12]:
def clf_stc(sentence):
    doc = nlp(sentence)
    #Find the dependency matchers
    tokens_matchers = [matchers.dep_ for matchers in doc]
    
    if 'dative' in tokens_matchers and 'dobj' in tokens_matchers:
        # Now check if it is a PO
        if 'pobj'in tokens_matchers:
            #confirm again 
            if tokens_matchers.index("dobj") < tokens_matchers.index("dative"):
                sentence_structure = "PO"
        else:
            # Check for possible DO
            tokens_matchers.index("dative") < tokens_matchers.index("dobj")
            sentence_structure = "DO"
            
    else:
        sentence_structure =  None
        
    return sentence_structure

In [13]:
# testing 
SENTENCE = 'The bride gave the lever to the fireman'

clf_stc(SENTENCE)

'PO'

In [14]:
# Extract direct objects 
def extract_direct_object(sentence):
    # split the sentence into list
    tokens = sentence.split()
    
    doc = nlp(sentence)
    tokens_matchers = [matchers.dep_ for matchers in doc]
    
    # get the position of the direct object
    dobj_index = tokens_matchers.index("dobj")
    # now find the object
    direct_object = tokens[dobj_index]
    return direct_object

In [15]:
SENTENCE = 'The bride gave the fireman the lever'
extract_direct_object(SENTENCE)

'lever'

In [17]:
# Extract indirect objects 
def extract_indirect_object(sentence):
    doc = nlp(sentence)
    tokens_matchers = [matchers.dep_ for matchers in doc]
    
    # get the sentence structure
    sentence_structure = clf_stc(sentence)
    if sentence_structure == 'DO':
        indobj_index = tokens_matchers.index('dative')
        indirect_object = doc[indobj_index]
    elif sentence_structure == 'PO':
        indobj_index = tokens_matchers.index('pobj')
        indirect_object = doc[indobj_index]
    else:
        pass

    return indirect_object

In [11]:
clf_stc("yeah and she eats people food that's another thing do you do you feed your dog people food or")

'DO'

In [12]:
extract_direct_object("yeah and she eats people food that's another thing do you do you feed your dog people food or")

'food'

In [13]:
extract_indirect_object("yeah and she eats people food that's another thing do you do you feed your dog people food or")

people

In [14]:
doc = nlp("yeah and she eats people food that's another thing do you do you feed your dog people food or")
tokens = [token.dep_ for token in doc]
tokens

['ROOT',
 'cc',
 'nsubj',
 'ccomp',
 'dative',
 'dobj',
 'nsubj',
 'relcl',
 'det',
 'attr',
 'aux',
 'nsubj',
 'aux',
 'nsubj',
 'ROOT',
 'poss',
 'compound',
 'dative',
 'dobj',
 'cc']

# Task 2

In [15]:
import nltk

In [16]:
def extract_feature_1(noun_phrase, sentence):
    return len(nltk.word_tokenize(noun_phrase))



In [17]:
SENTENCE = "The bride gave the fireman the lever"

extract_feature_1("fireman", SENTENCE)

1

In [18]:
def extract_feature_2(noun_phrase, sentence):
    pos_tags = [pos for (word, pos) in nltk.pos_tag(nltk.word_tokenize(noun_phrase))]
    return ' '.join(pos_tags)



In [19]:
extract_feature_2("the fireman", SENTENCE)

'DT NN'

In [20]:
def extract_feature_3(noun_phrase, sentence):
    words = nltk.word_tokenize(noun_phrase)
    word_freqs = Counter(words)
    total_words = len(words)
    log_mean_freq = math.log(sum(word_freqs.values()) / total_words)
    log_median_freq = math.log(sorted(word_freqs.values())[total_words // 2])
    return log_mean_freq



In [21]:
extract_feature_3("fireman", SENTENCE)

0.0

In [22]:
def concatenate_features(features1, features2):
    features = [features1, features2]
    #for i in range(len(features1)):
        #features.append(features1[i] + features2[i])
    return features

In [23]:
features1 = extract_feature_1("the fireman", SENTENCE)
features2 = extract_feature_2("the fireman", SENTENCE)

concatenate_features(features1, features2)

[2, 'DT NN']

# Task 3

In [29]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Define a list of sentences
sentences = ['This is an example sentence', 'Each sentence is converted to an embedding']

# Compute embeddings for all sentences
embeddings = model.encode(sentences)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)

Sentence: This is an example sentence
Embedding: [-3.93100172e-01  3.88627388e-02  1.98742402e+00 -1.36893511e-01
  1.93089962e-01  3.74967873e-01  1.15455069e-01  3.02821845e-01
  2.32356340e-01 -1.23268202e-01 -2.69240737e-01  4.10017759e-01
 -2.14587912e-01  1.45401850e-01  4.17345434e-01 -2.67233312e-01
 -2.92259753e-01 -1.81809917e-01  9.90740120e-01 -7.87549138e-01
 -7.95889422e-02  7.74835706e-01 -3.67454588e-01 -1.04439938e+00
  3.26537907e-01 -8.63254726e-01  3.20690483e-01 -1.12830329e+00
 -4.59388793e-01 -4.49143909e-02  6.30563051e-02 -6.13953233e-01
  3.75281990e-01 -1.02702759e-01  8.16331059e-02  2.59928375e-01
  4.26196903e-01 -1.09227616e-02  1.49220303e-01  2.61053026e-01
  8.91623557e-01 -5.76651037e-01  9.52781141e-01  1.79338083e-01
 -9.76019919e-01 -6.75556302e-01 -7.54613817e-01  3.20075303e-01
 -3.51041794e-01 -7.56071448e-01 -1.71005225e+00  3.14682156e-01
  3.91978115e-01  7.78529167e-01 -4.78423923e-01  4.90125418e-01
  4.12305444e-01 -1.45893705e+00  2.32474

In [40]:
import nltk
from nltk.corpus import wordnet
import random

def alter_sentence(sentence):
    # Tokenize the sentence
    tokens = nltk.word_tokenize(sentence)
    
    # Get the part of speech of each word in the sentence
    pos_tags = nltk.pos_tag(tokens)
    
    # Choose a random word to replace
    word_to_replace = random.choice(pos_tags)[0]
    
    # Get the synset of the word
    synset = wordnet.synsets(word_to_replace)
    
    # Choose a random synset
    random_synset = random.choice(synset)
    
    # Get a random word from the chosen synset
    random_word = random_synset.lemmas()[random.randint(0, len(random_synset.lemmas())-1)].name()
    
    # Replace the chosen word with the random word
    altered_sentence = ' '.join([random_word if word == word_to_replace else word for word in tokens])
    
    return altered_sentence

In [41]:
# Define the original sentence
sentence = "The quick brown fox jumps over the lazy guy."

# Produce an altered version of the sentence
altered_sentence = alter_sentence(sentence)

# Print the original and altered sentences
print("Original sentence:", sentence)
print("Altered sentence:", altered_sentence)

Original sentence: The quick brown fox jumps over the lazy guy.
Altered sentence: The quick brown fox jumps over the lazy guy .
