# Lexical Semantics

### 1. Sentence segmentation of a text file

In [None]:
 with open('../07_linguistic_preprocessing/webnlg-test.txt') as infile:
            text = infile.read()
            
import nltk
from nltk.tokenize import sent_tokenize

corpus = sent_tokenize(text)
corpus

### Create a vector with 10 dimensions, all of value 0

In [None]:
import numpy as np

# Length of the demo vector.
n_dims = 10
# np.zeros builds an array of the requested length in which every entry is 0.
vector = np.zeros(n_dims)
print(vector)

In [None]:
import pandas as pd

vocab = ["a", "b"]
print(vocab)
vocab_len = len(vocab)

# Square table of zeros with one row and one column per vocabulary entry;
# dtype=np.int16 stores the cells as 16-bit integers.
zero_counts = np.zeros((vocab_len, vocab_len))
df = pd.DataFrame(zero_counts, index=vocab, columns=vocab, dtype=np.int16)
df.head()

### Creating a dictionary using defaultdict

This avoids having Python throw a `KeyError` when you try to get an item with a key that is not currently in the dictionary: the missing key is created on the fly with a default value.

In [None]:
from collections import defaultdict

# int() returns 0, so a key that is not present yet starts out with the value 0.
d = defaultdict(int)
print(d.items())

# "the" is missing: reading it creates the entry with the default 0, which is then incremented.
d["the"] = d["the"] + 1
print(d.items())

### Create a dictionary mapping each token in a corpus to a distinct integer

In [None]:
import collections

texts = ["john runs.", "Mary swims"]

# Create an empty dictionary with collections.defaultdict.
# The default factory returns the current size of the vocabulary, so every token
# looked up for the first time is assigned the next unused integer.
token2int = collections.defaultdict(lambda: len(token2int))

# Reserve index 0 for the end-of-sequence marker.
token2int['<eos>'] = 0

# Look up every whitespace-separated token of every text. The lookup alone is what
# inserts unseen tokens, so a plain loop is used instead of building a throwaway list.
# The loop variable is deliberately not called `text`, so it does not overwrite the
# corpus text loaded earlier in the notebook.
for sentence in texts:
    for token in sentence.split():
        token2int[token]  # a missing key is added with value len(token2int)

token2int

### Creating, visualising and inspecting a document-term matrix

In [None]:
# Toy corpus: one short document per string.
# Every element is followed by a comma: two adjacent string literals with no comma
# between them are silently concatenated into a single document.
docs = [
    "Shakespeare wrote plays",
    "Shakespeare wrote poems",
    "Hugo wrote novels",
    "Verne wrote novels",
    "Rimbaud wrote poems",
    "John read science",
    "Peter read books",
]

from sklearn.feature_extraction.text import CountVectorizer

# Count vectorizer restricted to single words (unigrams).
# stop_words='english' makes it leave English stop words out of the vocabulary.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))

# Learn the vocabulary from the strings in `docs` and count each term per document
doc_term_matrix = vectorizer.fit_transform(docs)

# Show the document / token matrix.
# todense() is a method: it converts the result into a dense matrix for printing
dense_doc_term = doc_term_matrix.todense()
print(dense_doc_term)

In [None]:
# Show the doc-term matrix as an array (rows are documents, columns are terms)
print(doc_term_matrix.toarray())

In [None]:
# Get the vocabulary extracted by fit_transform.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to the old
# one on earlier releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
feature_names

In [None]:
# Get the token-to-int dictionary built when the vectorizer was fitted
vectorizer.vocabulary_

In [None]:
# Get the integer index assigned to the token 'hugo'
# (vectorizer.vocabulary_ is keyed by token, so an unknown token raises KeyError)
vectorizer.vocabulary_['hugo']

### Create document-token matrix for a larger corpus

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Read the whole file into a single string.
# The encoding is passed explicitly so the result does not depend on the platform default.
# NOTE(review): assumes the WebNLG text file is UTF-8 encoded — confirm.
with open('../07_linguistic_preprocessing/webnlg-test.txt', encoding='utf-8') as infile:
    text = infile.read()

# Split the raw text into a list of sentence strings.
corpus = sent_tokenize(text)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count vectorizer restricted to single words (unigrams).
# stop_words='english' makes it leave English stop words out of the vocabulary.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))

# Apply the vectorizer to the corpus to obtain the document / token matrix
doc_token_matrix = vectorizer.fit_transform(corpus)

# Show the document / token matrix.
# todense() is a method: it converts the result into a dense matrix for printing
dense_doc_token = doc_token_matrix.todense()
print(dense_doc_token)

In [None]:
# Get the number of rows and columns.
# Because it is a document/token matrix, the shape is (nb of docs, nb of tokens)
doc_token_matrix.shape

In [None]:
# Integer index assigned to the token 'mata' in the fitted vocabulary
vectorizer.vocabulary_['mata']

### Applying SVD decomposition to a matrix

[svd method](https://numpy.org/doc/stable/reference/generated/numpy.linalg.svd.html)

In [None]:
# Create the token co-occurrence matrix as (tokens x docs) times (docs x tokens).
# `@` always means matrix multiplication, whereas `*` only does so for scipy's legacy
# sparse *matrix* classes and is element-wise for sparse *arrays*.
token_cooc_matrix = doc_token_matrix.T @ doc_token_matrix
# Set the diagonal to 0: those cells pair each token with itself, which is not a co-occurrence
token_cooc_matrix.setdiag(0)
# Print out the token co-occurrence matrix in dense format
print(token_cooc_matrix.todense())

In [None]:
# The token co-occurrence matrix is square: its shape is (nb of tokens, nb of tokens)
token_cooc_matrix.shape

In [None]:
import numpy as np
import math

# Number of leading columns of U kept as word-vector dimensions.
N_COMPONENTS = 50

# `matrix` is the word co-occurrence matrix in dense form.
# toarray() is used rather than todense(): todense() returns an np.matrix, whose rows
# stay 2-D when indexed, whereas with a plain ndarray each row of U is a 1-D word vector.
matrix = token_cooc_matrix.toarray()
# On large corpora, make sure to use the full_matrices=False (reduced SVD) option,
# else processing will be very slow.
U, S, Vt = np.linalg.svd(matrix, full_matrices=False)
# Keep only the first N_COMPONENTS dimensions of U as word vectors (one row per token)
U = U[:, :N_COMPONENTS]

### Get vector for word 'mata'

In [None]:
# Look up the vocabulary index of 'mata' and print the corresponding row of U
print(U[vectorizer.vocabulary_['mata']])

### Storing the word co-occurrence matrix in a pandas DataFrame

In [None]:
import pandas as pd

# Get the tokens.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to the old
# one on earlier releases.
if hasattr(vectorizer, "get_feature_names_out"):
    names = list(vectorizer.get_feature_names_out())
else:
    names = vectorizer.get_feature_names()

# Create a pandas frame whose content is the token co-occurrence matrix
# and whose row and column headers are the tokens.
# The sparse matrix is converted to a dense array before being handed to pandas.
df = pd.DataFrame(data=token_cooc_matrix.toarray(), columns=names, index=names)
df.head()

In [None]:
# Build a dictionary mapping each token (a row label of df) to its row position
vocab_size = len(df.index)
word2index = {token: position for position, token in enumerate(df.index)}


In [None]:
# Print out the vector for "mata": word2index gives its position, which selects a row of U
print(U[word2index['mata']])

**Create a dictionary mapping tokens to integers**

In [None]:
tokens = ["a", "b"]

# Number the tokens 0, 1, ... in list order
{token: position for position, token in enumerate(tokens)}