## Exercise 6 - Vector space model

First name: Brian
<br>
Last name: Schweigler
<br>
Matriculation number: 16-102-071


##### Q1: Represent each play by a vector with only the tf component. You can apply some preprocessing before generating this vector representation.
General imports and setup:

In [37]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import re
import numpy as np
import lxml.etree
import os
from scipy import stats
import nltk
import nltk.tokenize
import collections

np.random.seed(6)  # for reproducibility

PUNCT_RE = re.compile(r'[^\w\s]+$')


def is_punct(string):
    """Check if STRING is a punctuation marker or a sequence of
       punctuation markers.
    """
    return PUNCT_RE.match(string) is not None


def preprocess_text(text, language='French', lowercase=True):
    if lowercase:
        text = text.lower()
    if language == 'French':
        text = re.sub("-", " ", text)
        text = re.sub("l'", "le ", text)
        text = re.sub("d'", "de ", text)
        text = re.sub("c'", "ce ", text)
        text = re.sub("j'", "je ", text)
        text = re.sub("m'", "me ", text)
        text = re.sub("qu'", "que ", text)
        text = re.sub("'", " ' ", text)
        text = re.sub("quelqu'", "quelque ", text)
        text = re.sub("aujourd'hui", "aujourdhui", text)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
    tokens = nltk.tokenize.word_tokenize(text, language=language)
    tokens = [token for token in tokens if not is_punct(token)]
    return tokens


subgenres = ('Comédie', 'Tragédie', 'Tragi-comédie')
plays, titles, genres = [], [], []
authors, years = [], []

for fn in os.scandir('theatre-classique'):
    # Only include XML files
    if not fn.name.endswith('.xml'):
        continue
    tree = lxml.etree.parse(fn.path)
    genre = tree.find('//genre')
    title = tree.find('//title')
    author = tree.find('//author')
    year = tree.find('//date')
    if genre is not None and genre.text in subgenres:
        lines = []
        for line in tree.xpath('//l|//p'):
            lines.append(' '.join(line.itertext()))
        text = '\n'.join(lines)
        plays.append(text)
        genres.append(genre.text)
        titles.append(title.text)
        authors.append(author.text)
        if year is not None:
            years.append(year.text)

print(len(plays), len(genres), len(titles), len(authors), len(years))



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
498 498 498 498 208


In [38]:
#nltk.download('punkt')
plays_tok = [preprocess_text(play, 'French') for play in plays]

def extract_vocabulary(tokenized_corpus, min_count=1, max_count=float('inf')):
    vocabulary = collections.Counter()
    for document in tokenized_corpus:
        vocabulary.update(document)
    vocabulary = {word for word, count in vocabulary.items()
                  if count >= min_count and count <= max_count}
    return sorted(vocabulary)

vocabulary = extract_vocabulary(plays_tok, min_count=2)
len(vocabulary)

37824

In [39]:
# Representation in a doc x term matrix for the French plays
def corpus2dtm(tokenized_corpus, vocabulary):
    "Transform a tokenized corpus into a document-term matrix"
    document_term_matrix = []
    for document in tokenized_corpus:
        document_counts = collections.Counter(document)
        row = [document_counts[word] for word in vocabulary]
        document_term_matrix.append(row)
    return document_term_matrix

# building the doc/term matrix in a few seconds for the French example
document_term_matrix = np.array(corpus2dtm(plays_tok, vocabulary))
print(f"document-term matrix with "
      f"|D| = {document_term_matrix.shape[0]} documents and "
      f"|V| = {document_term_matrix.shape[1]} words.")


document-term matrix with |D| = 498 documents and |V| = 37824 words.


Now we can calculate the tf component, which is the occurrence frequency of a term in the whole corpus. For this, we stem the words.

In [40]:
import operator
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
fr = SnowballStemmer('french')

stemmed_vocabulary = [fr.stem(word) for word in vocabulary]

counts = dict()

for word in stemmed_vocabulary:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf_dict = counts
for key, value in counts.items():
    tf_dict[key] = value/len(stemmed_vocabulary)

tf_dict

{'a': 2.643824027072758e-05,
 'aa': 2.643824027072758e-05,
 'aah': 2.643824027072758e-05,
 'aaron': 2.643824027072758e-05,
 'ab': 2.643824027072758e-05,
 'abaiss': 0.00042301184433164127,
 'abaissement': 5.287648054145516e-05,
 'abaissent': 2.643824027072758e-05,
 'abandon': 0.0006609560067681895,
 'abandonnent': 2.643824027072758e-05,
 'abandonnon': 2.643824027072758e-05,
 'abat': 5.287648054145516e-05,
 'abatt': 0.0001321912013536379,
 'abattent': 2.643824027072758e-05,
 'abattr': 2.643824027072758e-05,
 'abattu': 7.931472081218273e-05,
 'abattus': 2.643824027072758e-05,
 'abbess': 2.643824027072758e-05,
 'abbé': 5.287648054145516e-05,
 'abces': 2.643824027072758e-05,
 'abdened': 2.643824027072758e-05,
 'abder': 2.643824027072758e-05,
 'abdiqu': 7.931472081218273e-05,
 'abdit': 2.643824027072758e-05,
 'abdérit': 2.643824027072758e-05,
 'abec': 2.643824027072758e-05,
 'abeil': 5.287648054145516e-05,
 'abencerrag': 2.643824027072758e-05,
 'abendax': 2.643824027072758e-05,
 'abez': 2.64

##### Q2: For each genre, it is possible to generate a “profile”, in the form of a single vector representing the entire set of plays corresponding to this genre. Build such a profile for each of the three genres (Comedy, Tragedy and Tragicomedy).



In [41]:
plays_com, titles_com, genres_com = [], [], []
authors_com, years_com = [], []

plays_tra, titles_tra, genres_tra = [], [], []
authors_tra, years_tra = [], []

plays_tra_com, titles_tra_com, genres_tra_com = [], [], []
authors_tra_com, years_tra_com = [], []
for fn in os.scandir('theatre-classique'):
    # Only include XML files
    if not fn.name.endswith('.xml'):
        continue
    tree = lxml.etree.parse(fn.path)
    genre = tree.find('//genre')
    title = tree.find('//title')
    author = tree.find('//author')
    year = tree.find('//date')
    if genre is not None and genre.text == 'Comédie':
        lines = []
        for line in tree.xpath('//l|//p'):
            lines.append(' '.join(line.itertext()))
        text = '\n'.join(lines)
        plays_com.append(text)
        genres_com.append(genre.text)
        titles_com.append(title.text)
        authors_com.append(author.text)
        if year is not None:
            years_com.append(year.text)
    elif genre is not None and genre.text == 'Tragédie':
        lines = []
        for line in tree.xpath('//l|//p'):
            lines.append(' '.join(line.itertext()))
        text = '\n'.join(lines)
        plays_tra.append(text)
        genres_tra.append(genre.text)
        titles_tra.append(title.text)
        authors_tra.append(author.text)
        if year is not None:
            years_tra.append(year.text)
    elif genre is not None and genre.text == 'Tragi-comédie':
        lines = []
        for line in tree.xpath('//l|//p'):
            lines.append(' '.join(line.itertext()))
        text = '\n'.join(lines)
        plays_tra_com.append(text)
        genres_tra_com.append(genre.text)
        titles_tra_com.append(title.text)
        authors_tra_com.append(author.text)
        if year is not None:
            years_tra_com.append(year.text)

plays_com_tok = [preprocess_text(play, 'French') for play in plays_com]
plays_tra_tok = [preprocess_text(play, 'French') for play in plays_tra]
plays_tra_com_tok = [preprocess_text(play, 'French') for play in plays_tra_com]
vocabulary_com = extract_vocabulary(plays_com_tok, min_count=2)
vocabulary_tra = extract_vocabulary(plays_tra_tok, min_count=2)
vocabulary_tra_com = extract_vocabulary(plays_tra_com_tok, min_count=2)

In [42]:
stemmed_vocabulary_com = [fr.stem(word) for word in vocabulary_com]
counts = dict()
for word in stemmed_vocabulary_com:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf_dict_com = counts
for key, value in counts.items():
    tf_dict_com[key] = value / len(stemmed_vocabulary_com)
tf_dict_com

{'a': 3.389141191622043e-05,
 'aah': 3.389141191622043e-05,
 'ab': 3.389141191622043e-05,
 'abaiss': 6.778282383244086e-05,
 'abaissement': 3.389141191622043e-05,
 'abandon': 0.0006100454144919677,
 'abandonnent': 3.389141191622043e-05,
 'abandonnon': 3.389141191622043e-05,
 'abat': 6.778282383244086e-05,
 'abatt': 3.389141191622043e-05,
 'abattr': 3.389141191622043e-05,
 'abattu': 6.778282383244086e-05,
 'abattus': 3.389141191622043e-05,
 'abbess': 3.389141191622043e-05,
 'abbé': 6.778282383244086e-05,
 'abces': 3.389141191622043e-05,
 'abder': 3.389141191622043e-05,
 'abdérit': 3.389141191622043e-05,
 'abec': 3.389141191622043e-05,
 'abeil': 3.389141191622043e-05,
 'abez': 3.389141191622043e-05,
 'abhorr': 6.778282383244086e-05,
 'abject': 3.389141191622043e-05,
 'abjur': 0.00013556564766488172,
 'ablativo': 3.389141191622043e-05,
 'abois': 3.389141191622043e-05,
 'abol': 6.778282383244086e-05,
 'abomin': 0.00010167423574866129,
 'abond': 0.00016945705958110215,
 'abon': 3.3891411916

In [43]:
stemmed_vocabulary_tra = [fr.stem(word) for word in vocabulary_tra]
counts = dict()
for word in stemmed_vocabulary_tra:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf_dict_tra = counts
for key, value in counts.items():
    tf_dict_tra[key] = value / len(stemmed_vocabulary_tra)
tf_dict_tra

{'a': 5.0020008003201284e-05,
 'aaron': 5.0020008003201284e-05,
 'ab': 5.0020008003201284e-05,
 'abaiss': 0.0007002801120448179,
 'abaissement': 0.00010004001600640257,
 'abaissent': 5.0020008003201284e-05,
 'abandon': 0.0010504201680672268,
 'abandonnent': 5.0020008003201284e-05,
 'abandonnon': 5.0020008003201284e-05,
 'abat': 5.0020008003201284e-05,
 'abatt': 0.00015006002400960383,
 'abattr': 5.0020008003201284e-05,
 'abattu': 0.00010004001600640257,
 'abattus': 5.0020008003201284e-05,
 'abbé': 5.0020008003201284e-05,
 'abdened': 5.0020008003201284e-05,
 'abdiqu': 0.00015006002400960383,
 'abdit': 5.0020008003201284e-05,
 'abencerrag': 5.0020008003201284e-05,
 'abendax': 5.0020008003201284e-05,
 'abhorr': 0.00035014005602240897,
 'abincerrag': 5.0020008003201284e-05,
 'abject': 0.00010004001600640257,
 'abjur': 0.0002501000400160064,
 'abner': 5.0020008003201284e-05,
 'abois': 5.0020008003201284e-05,
 'abol': 0.00020008003201280514,
 'abomin': 0.00010004001600640257,
 'abond': 0.000

In [44]:
stemmed_vocabulary_tra_com = [fr.stem(word) for word in vocabulary_tra_com]
counts = dict()
for word in stemmed_vocabulary_tra_com:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf_dict_tra_com = counts
for key, value in counts.items():
    tf_dict_tra_com[key] = value / len(stemmed_vocabulary_tra_com)
tf_dict_tra_com

{'a': 9.220839096357768e-05,
 'abaiss': 0.0005532503457814661,
 'abaissement': 9.220839096357768e-05,
 'abandon': 0.0009220839096357768,
 'abandonnent': 9.220839096357768e-05,
 'abat': 9.220839096357768e-05,
 'abattr': 9.220839096357768e-05,
 'abattu': 0.00018441678192715537,
 'abattus': 9.220839096357768e-05,
 'abhorr': 9.220839096357768e-05,
 'abject': 0.00018441678192715537,
 'abois': 9.220839096357768e-05,
 'abomin': 9.220839096357768e-05,
 'abond': 0.00027662517289073305,
 'abord': 0.00036883356385431073,
 'abordon': 9.220839096357768e-05,
 'abri': 9.220839096357768e-05,
 'abreg': 0.00036883356385431073,
 'absenc': 9.220839096357768e-05,
 'absent': 0.00018441678192715537,
 'absolu': 0.00027662517289073305,
 'absolus': 9.220839096357768e-05,
 'absous': 9.220839096357768e-05,
 'absout': 9.220839096357768e-05,
 'abus': 0.0010142923005993546,
 'abusent': 9.220839096357768e-05,
 'abîm': 0.00027662517289073305,
 'accabl': 0.00036883356385431073,
 'accablent': 9.220839096357768e-05,
 'ac

##### Q3: How many terms with a weight strictly larger than 0 do you have in each text genre profile?

This is simply the lengths of the tf vectors (note that we stemmed the terms!).

In [45]:
print("Terms for comedy: ", len(tf_dict_com))
print("Terms for tragedy: ", len(tf_dict_tra))
print("Terms for tragi-comedy: ", len(tf_dict_tra_com))

Terms for comedy:  14179
Terms for tragedy:  8902
Terms for tragi-comedy:  5288


##### Q4: Select randomly 10 plays for each text genre. Represent each play by a vector.

In [46]:
import random

plays_com_tok_rand = [preprocess_text(play, 'French') for play in random.choices(plays_com, k=10)]
plays_tra_tok_rand = [preprocess_text(play, 'French') for play in random.choices(plays_tra, k=10)]
plays_tra_com_tok_rand = [preprocess_text(play, 'French') for play in random.choices(plays_tra_com, k=10)]
vocabulary_com_rand = extract_vocabulary(plays_com_tok_rand, min_count=2)
vocabulary_tra_rand = extract_vocabulary(plays_tra_tok_rand, min_count=2)
vocabulary_tra_com_rand = extract_vocabulary(plays_tra_com_tok_rand, min_count=2)

stemmed_vocabulary_tra_rand = [fr.stem(word) for word in vocabulary_tra_rand]
counts = dict()
for word in vocabulary_tra_rand:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf_dict_tra_rand = counts
for key, value in counts.items():
    tf_dict_tra_rand[key] = value / len(vocabulary_tra_rand)
tf_dict_tra_rand

{'a': 0.00016795431642593214,
 'abaissement': 0.00016795431642593214,
 'abaisser': 0.00016795431642593214,
 'abandonnai': 0.00016795431642593214,
 'abandonne': 0.00016795431642593214,
 'abandonnent': 0.00016795431642593214,
 'abandonner': 0.00016795431642593214,
 'abandonnez': 0.00016795431642593214,
 'abandonné': 0.00016795431642593214,
 'abandonnée': 0.00016795431642593214,
 'abattre': 0.00016795431642593214,
 'abattu': 0.00016795431642593214,
 'abattue': 0.00016795431642593214,
 'abbé': 0.00016795431642593214,
 'abhorre': 0.00016795431642593214,
 'abjure': 0.00016795431642593214,
 'abjurer': 0.00016795431642593214,
 'abolir': 0.00016795431642593214,
 'abominable': 0.00016795431642593214,
 'abonde': 0.00016795431642593214,
 'abord': 0.00016795431642593214,
 'abordant': 0.00016795431642593214,
 'aborder': 0.00016795431642593214,
 'abréger': 0.00016795431642593214,
 'absence': 0.00016795431642593214,
 'absent': 0.00016795431642593214,
 'absente': 0.00016795431642593214,
 'absolu': 0.00

In [47]:
stemmed_vocabulary_com_rand = [fr.stem(word) for word in vocabulary_com_rand]
counts = dict()
for word in vocabulary_com_rand:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf_dict_com_rand = counts
for key, value in counts.items():
    tf_dict_com_rand[key] = value / len(vocabulary_com_rand)
tf_dict_com_rand

{'a': 0.0002098635886673662,
 'abaisse': 0.0002098635886673662,
 'abandonne': 0.0002098635886673662,
 'abandonner': 0.0002098635886673662,
 'abattre': 0.0002098635886673662,
 'abattu': 0.0002098635886673662,
 'abattue': 0.0002098635886673662,
 'abhorre': 0.0002098635886673662,
 'abondance': 0.0002098635886673662,
 'abord': 0.0002098635886673662,
 'aborder': 0.0002098635886673662,
 'aboumékarès': 0.0002098635886673662,
 'abri': 0.0002098635886673662,
 'absence': 0.0002098635886673662,
 'absent': 0.0002098635886673662,
 'absente': 0.0002098635886673662,
 'absenter': 0.0002098635886673662,
 'absents': 0.0002098635886673662,
 'absolu': 0.0002098635886673662,
 'absolument': 0.0002098635886673662,
 'abuse': 0.0002098635886673662,
 'abuser': 0.0002098635886673662,
 'abuserais': 0.0002098635886673662,
 'académie': 0.0002098635886673662,
 'accabler': 0.0002098635886673662,
 'accablée': 0.0002098635886673662,
 'accepter': 0.0002098635886673662,
 'accepterai': 0.0002098635886673662,
 'acceptez': 

In [48]:
stemmed_vocabulary_tra_com_rand = [fr.stem(word) for word in vocabulary_tra_com_rand]
counts = dict()
for word in vocabulary_tra_com_rand:
    if word in counts:
        counts[word] += 1
    else:
        counts[word] = 1

tf_dict_tra_com_rand = counts
for key, value in counts.items():
    tf_dict_tra_com_rand[key] = value / len(vocabulary_tra_com_rand)
tf_dict_tra_com_rand

{'a': 0.00016949152542372882,
 'abaisse': 0.00016949152542372882,
 'abaissement': 0.00016949152542372882,
 'abaisser': 0.00016949152542372882,
 'abandonnant': 0.00016949152542372882,
 'abandonne': 0.00016949152542372882,
 'abandonner': 0.00016949152542372882,
 'abandonnez': 0.00016949152542372882,
 'abandonné': 0.00016949152542372882,
 'abattre': 0.00016949152542372882,
 'abattu': 0.00016949152542372882,
 'abattus': 0.00016949152542372882,
 'abhorre': 0.00016949152542372882,
 'abois': 0.00016949152542372882,
 'abominable': 0.00016949152542372882,
 'abonde': 0.00016949152542372882,
 'abord': 0.00016949152542372882,
 'aborder': 0.00016949152542372882,
 'abri': 0.00016949152542372882,
 'abréger': 0.00016949152542372882,
 'abrégé': 0.00016949152542372882,
 'absence': 0.00016949152542372882,
 'absent': 0.00016949152542372882,
 'absolu': 0.00016949152542372882,
 'absolue': 0.00016949152542372882,
 'absolument': 0.00016949152542372882,
 'absolus': 0.00016949152542372882,
 'abuser': 0.00016949

##### Q5: For each text genre and play, how many terms with a weight strictly larger than 0 do you have in the vector?

This is again simply the lengths of the tf vectors as all words occur at least twice.

In [49]:
print("Terms for comedy: ", len(tf_dict_com))
print("Terms for tragedy: ", len(tf_dict_tra))
print("Terms for tragi-comedy: ", len(tf_dict_tra_com))

Terms for comedy:  14179
Terms for tragedy:  8902
Terms for tragi-comedy:  5288


##### Q6: For each text genre and play, how many terms with a weight strictly equal to 1 do you have in the vector?

With tf, none of them will have a weight of 1, but we can look at the max values:


In [50]:
# print(max(tf_dict_com.items(), key=operator.itemgetter(1))[0])
# max(tf_dict_com.values())
[k for k,v in tf_dict_com.items() if v == max(tf_dict_com.values())]

['jou']

In [51]:
# max(tf_dict_tra.items(), key=operator.itemgetter(1))[0]
# max(tf_dict_tra.values())
[k for k,v in tf_dict_tra.items() if v == max(tf_dict_tra.values())]

['jou']

In [52]:
# max(tf_dict_tra_com.items(), key=operator.itemgetter(1))[0]
# max(tf_dict_tra_com.values())
[k for k,v in tf_dict_tra_com.items() if v == max(tf_dict_tra_com.values())]

['don']