## Exercise 5 - Probabilistic models

First name: Brian
<br>
Last name: Schweigler
<br>
Matriculation number: 16-102-071


##### Q1: Represent each play by a vector with only the tf component. You can apply some preprocessing before generating this vector representation.
General imports and setup:

In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import re
import numpy as np
import lxml.etree
import os
from scipy import stats
import nltk
import nltk.tokenize
import collections

np.random.seed(6)  # for reproducibility

PUNCT_RE = re.compile(r'[^\w\s]+$')


def is_punct(string):
    """Check if STRING is a punctuation marker or a sequence of
       punctuation markers.
    """
    return PUNCT_RE.match(string) is not None

def preprocess_text(text, language='French', lowercase=True):
    if lowercase:
        text = text.lower()
    if (language == 'French'):
        text = re.sub("-", " ", text)
        text = re.sub("l'", "le ", text)
        text = re.sub("d'", "de ", text)
        text = re.sub("c'", "ce ", text)
        text = re.sub("j'", "je ", text)
        text = re.sub("m'", "me ", text)
        text = re.sub("qu'", "que ", text)
        text = re.sub("'", " ' ", text)
        text = re.sub("quelqu'", "quelque ", text)
        text = re.sub("aujourd'hui", "aujourdhui", text)
    tokens = nltk.tokenize.word_tokenize(text, language=language)
    tokens = [token for token in tokens if not is_punct(token)]
    return tokens


subgenres = ('Comédie', 'Tragédie', 'Tragi-comédie')
plays, titles, genres = [], [], []
authors, years = [], []

for fn in os.scandir('theatre-classique'):
    # Only include XML files
    if not fn.name.endswith('.xml'):
        continue
    tree   = lxml.etree.parse(fn.path)
    genre  = tree.find('//genre')
    title  = tree.find('//title')
    author = tree.find('//author')
    year   = tree.find('//date')
    if genre is not None and genre.text in subgenres:
        lines = []
        for line in tree.xpath('//l|//p'):
            lines.append(' '.join(line.itertext()))
        text = '\n'.join(lines)
        plays.append(text)
        genres.append(genre.text)
        titles.append(title.text)
        authors.append(author.text)
        if year is not None:
            years.append(year.text)

print (len(plays), len(genres), len(titles), len(authors), len(years))



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
498 498 498 498 208


In [5]:
#nltk.download('punkt')
plays_tok = [preprocess_text(play, 'French') for play in plays]

def extract_vocabulary(tokenized_corpus, min_count=1, max_count=float('inf')):
    vocabulary = collections.Counter()
    for document in tokenized_corpus:
        vocabulary.update(document)
    vocabulary = {word for word, count in vocabulary.items()
                  if count >= min_count and count <= max_count}
    return sorted(vocabulary)

vocabulary = extract_vocabulary(plays_tok, min_count=2)
len(vocabulary)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Brian\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


38410

In [9]:
# Representation in a doc x term matrix for the French plays
def corpus2dtm(tokenized_corpus, vocabulary):
    "Transform a tokenized corpus into a document-term matrix"
    document_term_matrix = []
    for document in tokenized_corpus:
        document_counts = collections.Counter(document)
        row = [document_counts[word] for word in vocabulary]
        document_term_matrix.append(row)
    return document_term_matrix

# building the doc/term matrix in a few seconds for the French example
document_term_matrix = np.array(corpus2dtm(plays_tok, vocabulary))
print(f"document-term matrix with "
      f"|D| = {document_term_matrix.shape[0]} documents and "
      f"|V| = {document_term_matrix.shape[1]} words.")


document-term matrix with |D| = 498 documents and |V| = 38410 words.


Now we can calculate the tf component, which is the occurrence frequency of a term in the whole corpus

In [None]:
# TODO

##### Q2: For each genre, it is possible to generate a “profile”, in the form of a single vector representing the entire set of plays corresponding to this genre. Build such a profile for each of the three genres (Comedy, Tragedy and Tragicomedy).



In [None]:
plays_com, titles_com, genres_com = [], [], []
authors_com, years_com = [], []

plays_tra, titles_tra, genres_tra = [], [], []
authors_tra, years_tra = [], []

plays_tra_com, titles_tra_com, genres_tra_com = [], [], []
authors_tra_com, years_tra_com = [], []
for fn in os.scandir('theatre-classique'):
    # Only include XML files
    if not fn.name.endswith('.xml'):
        continue
    tree   = lxml.etree.parse(fn.path)
    genre  = tree.find('//genre')
    title  = tree.find('//title')
    author = tree.find('//author')
    year   = tree.find('//date')
    if genre is not None and genre.text == 'Comédie':
        lines = []
        for line in tree.xpath('//l|//p'):
            lines.append(' '.join(line.itertext()))
        text = '\n'.join(lines)
        plays_com.append(text)
        genres_com.append(genre.text)
        titles_com.append(title.text)
        authors_com.append(author.text)
        if year is not None:
            years_com.append(year.text)
    elif genre is not None and genre.text == 'Tragédie':
        lines = []
        for line in tree.xpath('//l|//p'):
            lines.append(' '.join(line.itertext()))
        text = '\n'.join(lines)
        plays_tra.append(text)
        genres_tra.append(genre.text)
        titles_tra.append(title.text)
        authors_tra.append(author.text)
        if year is not None:
            years_tra.append(year.text)
    elif genre is not None and genre.text == 'Tragi-comédie':
        lines = []
        for line in tree.xpath('//l|//p'):
            lines.append(' '.join(line.itertext()))
        text = '\n'.join(lines)
        plays_tra_com.append(text)
        genres_tra_com.append(genre.text)
        titles_tra_com.append(title.text)
        authors_tra_com.append(author.text)
        if year is not None:
            years_tra_com.append(year.text)



##### Q3: How many terms with a weight strictly larger than 0 do you have in each text genre profile?

Her we will use the TF IDF for the weighting.

Intuition says this should simply be 1'000?



##### Q4: Select randomly 10 plays for each text genre. Represent each play by a vector.

Hamilton could resemble a Gaussian distribution, (centered around a length of 2250), but for Madison it does not really match.

##### Q5: For each text genre and play, how many terms with a weight strictly larger than 0 do you have in the vector?



blab

##### Q6: For each text genre and play, how many terms with a weight strictly equal to 1 do you have in the vector?

blab