# Data import
## Question 0 - Get common wikidata occupations

> Write a sparql query that retrieves the top 100 occupations on wikidata (wikidata property P106).

You may use the interface https://query.wikidata.org/ to try different queries. Here are some example sparql queries: https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples

In [1]:
query = """
SELECT ?o WHERE { ?p wdt:P106 ?o. } GROUP BY ?o ORDER BY DESC(COUNT(?p)) LIMIT 100
"""

The following assertion should pass if your answer is correct.

In [3]:
import requests

occupations = ['Q82955', 'Q937857', 'Q36180', 'Q33999', 'Q1650915', 'Q1028181', 'Q1930187', 'Q177220', 'Q1622272', 'Q49757', 'Q36834', 'Q40348', 'Q47064', 'Q639669', 'Q10800557', 'Q201788', 'Q2526255', 'Q43845', 'Q28389', 'Q42973', 'Q10871364', 'Q39631', 'Q193391', 'Q482980', 'Q483501', 'Q11513337', 'Q3665646', 'Q12299841', 'Q19204627', 'Q16533', 'Q81096', 'Q11774891', 'Q188094', 'Q1281618', 'Q333634', 'Q189290', 'Q250867', 'Q33231', 'Q2259451', 'Q42603', 'Q628099', 'Q37226', 'Q2309784', 'Q901', 'Q2066131', 'Q6625963', 'Q10798782', 'Q2374149', 'Q170790', 'Q4610556', 'Q185351', 'Q486748', 'Q3055126', 'Q753110', 'Q4964182', 'Q169470', 'Q158852', 'Q1234713', 'Q14089670', 'Q10873124', 'Q3282637', 'Q593644', 'Q947873', 'Q13414980', 'Q131524', 'Q11338576', 'Q15117302', 'Q488205', 'Q14467526', 'Q183945', 'Q10843402', 'Q13382576', 'Q13141064', 'Q214917', 'Q855091', 'Q644687', 'Q19595175', 'Q121594', 'Q2865819', 'Q16010345', 'Q1231865', 'Q2405480', 'Q350979', 'Q3400985', 'Q13365117', 'Q10833314', 'Q3621491', 'Q15981151', 'Q212980', 'Q16145150', 'Q1792450', 'Q15296811', 'Q15627169', 'Q2306091', 'Q4263842', 'Q806798', 'Q5716684', 'Q2516866', 'Q3387717', 'Q131512']

def evalSparql(query):
    return requests.post('https://query.wikidata.org/sparql', data=query, headers={
        'content-type': 'application/sparql-query',
        'accept': 'application/json',
        'user-agent': 'User:Tpt'
    }).json()['results']['bindings']

myOccupations = [val['o']['value'].replace('http://www.wikidata.org/entity/', '') 
                 for val in evalSparql(query)]
for i in occupations:
    if i not in  myOccupations:
        print(i)
for i in myOccupations:
    if i not in  occupations:
        print(i)
assert(frozenset(occupations) == frozenset(myOccupations))

Q3387717
Q212238


AssertionError: 

## Occupations labels

We load the labels of the occupations from Wikidata

In [105]:
occupations_label = {}

query = """
SELECT DISTINCT ?o ?oLabel 
WHERE { 
    VALUES ?o { %s } 
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}"""% ' '.join('wd:' + o for o in occupations)
# print(query)
for result in evalSparql(query):
    occupations_label[result['o']['value'].replace('http://www.wikidata.org/entity/', '')] = result['oLabel']['value']

print(occupations_label)
# evalSparql(query)

{'Q82955': 'politician', 'Q121594': 'professor', 'Q177220': 'singer', 'Q169470': 'physicist', 'Q170790': 'mathematician', 'Q81096': 'engineer', 'Q201788': 'historian', 'Q188094': 'economist', 'Q212980': 'psychologist', 'Q214917': 'playwright', 'Q131524': 'entrepreneur', 'Q183945': 'record producer', 'Q193391': 'diplomat', 'Q189290': 'military officer', 'Q185351': 'jurist', 'Q350979': 'zoologist', 'Q483501': 'artist', 'Q482980': 'author', 'Q333634': 'translator', 'Q158852': 'conductor', 'Q486748': 'pianist', 'Q488205': 'singer-songwriter', 'Q250867': 'Catholic priest', 'Q593644': 'chemist', 'Q639669': 'musician', 'Q644687': 'illustrator', 'Q628099': 'association football manager', 'Q855091': 'guitarist', 'Q937857': 'association football player', 'Q947873': 'television presenter', 'Q806798': 'banker', 'Q1028181': 'painter', 'Q753110': 'songwriter', 'Q1234713': 'theologian', 'Q1281618': 'sculptor', 'Q1622272': 'university teacher', 'Q1792450': 'art historian', 'Q1650915': 'researcher', 'Q

We load *all* the labels of the occupations from Wikipedia

In [106]:
occupations_labels = {k: [v] for k, v in occupations_label.items()}

query = """
SELECT ?o ?altLabel 
WHERE {
  VALUES ?o { %s }
  ?o skos:altLabel ?altLabel . FILTER (lang(?altLabel) = "en")
}""" % ' '.join('wd:' + o for o in occupations) 
# print(query)
for result in evalSparql(query):
    occupations_labels[result['o']['value'].replace('http://www.wikidata.org/entity/', '')].append(result['altLabel']['value'])

print(occupations_labels)
# evalSparql(query)

{'Q82955': ['politician', 'political leader', 'polit.', 'political figure'], 'Q121594': ['professor', 'Prof.'], 'Q177220': ['singer', 'vocalist'], 'Q169470': ['physicist'], 'Q170790': ['mathematician'], 'Q81096': ['engineer'], 'Q201788': ['historian', 'historiographer', 'historians'], 'Q188094': ['economist'], 'Q212980': ['psychologist'], 'Q214917': ['playwright', 'scriptwriter', 'dramatist', 'Playwright, dramatist', 'playwrite'], 'Q131524': ['entrepreneur'], 'Q183945': ['record producer', 'music producer'], 'Q193391': ['diplomat'], 'Q189290': ['military officer', 'officer', 'army officer'], 'Q185351': ['jurist'], 'Q350979': ['zoologist', 'zooligist'], 'Q483501': ['artist'], 'Q482980': ['author'], 'Q333634': ['translator'], 'Q158852': ['conductor', 'Conducting'], 'Q486748': ['pianist'], 'Q488205': ['singer-songwriter', 'singer songwriter', 'singer/songwriter', 'singersongwriter'], 'Q250867': ['Catholic priest', 'Roman Catholic priest', 'Catholic presbyter', 'Roman Catholic presbyter'],

## Wikipedia articles

Here we load the training and the testing sets. To save memory space we use a generator that will read the file each time we iterate over the training or the testing examples.

In [107]:
import gzip
import json

def loadJson(filename):
    with gzip.open(filename, 'rt') as fp:
        for line in fp:
            yield json.loads(line)

class MakeIter(object):
    def __init__(self, generator_func, **kwargs):
        self.generator_func = generator_func
        self.kwargs = kwargs
    def __iter__(self):
        return self.generator_func(**self.kwargs)

training_set = MakeIter(loadJson, filename='wiki-train.json.gz')
testing_set = MakeIter(loadJson, filename='wiki-test.json.gz')

# Extract occupations from summaries

## Task 1 - Dictionnary extraction

> Using ```occupations_labels``` dictionnary, identify all occupations for each articles. Complete the function below to evaluate the accuracy of such approach. It will serve as a baseline.

In [108]:
def predict_dictionnary(example, occupations_labels):
    ## example['summary'] contains the summary of the article
    ## Code here
    summary = example['summary'].lower()
#     print(summary)
    hits = set()
#     print(words)
    for (k,v) in occupations_label.items():
        if summary.find(v) != -1:
            hits.add(k)
#             print(v)
#     print("-------------")
#     print(hits)
    
    return hits
    
def evaluate_dictionnary(training_set, occupations_labels):
    nexample = 0
    accuracy = 0.
    prediction = None
    for example in training_set:
#         print(example)
        prediction = predict_dictionnary(example, occupations_labels)
        p = frozenset(prediction)
        g = frozenset(example['occupations'])
        accuracy += 1.*len(p & g) / len(p | g)
        nexample += 1
    return accuracy / nexample

evaluate_dictionnary(training_set, occupations_labels)

0.3682742364200718

## Task 2 - Simple neural network

We load the articles "summary" and we take the average of the word vectors.
This is done with spacy loaded with the fast text vectors.
To do the installation/loading [takes 8-10 minutes, dl 1.2Go]
```
pip3 install spacy
wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/cc.en.300.vec.gz
python3 -m spacy init-model en /tmp/en_vectors_wiki_lg --vectors-loc cc.en.300.vec.gz
rm cc.en.300.vec.gz
```

In [109]:
import spacy
nlp = spacy.load('/tmp/en_vectors_wiki_lg')

def vectorize(dataset, nlp):
    result = {}
    for example in dataset:
        doc = nlp(example['summary'], disable=['parser', 'tagger'])
        result[example['title']] = {}
        result[example['title']]['vector'] = doc.vector
        if 'occupations' in example:
            result[example['title']]['occupations'] = example['occupations']
    return result

vectorized_training = vectorize(training_set, nlp)
vectorized_testing = vectorize(testing_set, nlp)
nlp = None

In [110]:
print(vectorized_training['George_Washington']['vector'])

[-1.45162819e-02 -2.45802402e-02 -4.59302496e-03 -4.09372151e-02
 -4.47662771e-02 -4.18604538e-03 -3.15232435e-03 -1.44802360e-02
 -1.68499984e-02 -3.69651243e-03 -1.16255814e-02  1.43651171e-02
  2.02674349e-03 -5.88953542e-03 -2.17011590e-02  1.02302311e-02
 -2.49313917e-02 -5.65232616e-03 -2.25581434e-02  8.29069968e-03
 -1.44069805e-03  2.25197673e-02 -6.81395701e-04 -1.37232570e-02
 -1.26674427e-02 -3.35569866e-02  1.10627888e-02 -2.37208814e-03
 -2.30000000e-02  7.58616179e-02 -5.03487710e-04 -2.51116175e-02
  9.26511642e-03 -2.52558179e-02 -1.51058156e-02 -9.51627828e-03
  1.17523270e-02  1.22441910e-03  1.08139520e-03  3.39302444e-03
  2.20116391e-03  1.46860480e-02 -1.43686021e-02  5.76395402e-03
  1.74162779e-02 -4.76220921e-02 -1.72569733e-02 -1.49988411e-02
 -1.77732538e-02  1.58907007e-02 -7.23255938e-03  2.43825577e-02
 -2.73104683e-02 -3.67430188e-02 -1.48802334e-02 -1.34825567e-02
 -3.14348824e-02  1.95930228e-02 -6.68605033e-04 -9.24302172e-03
  1.56976283e-04 -1.65674

In [111]:
print(vectorized_training['George_Washington'])

{'vector': array([-1.45162819e-02, -2.45802402e-02, -4.59302496e-03, -4.09372151e-02,
       -4.47662771e-02, -4.18604538e-03, -3.15232435e-03, -1.44802360e-02,
       -1.68499984e-02, -3.69651243e-03, -1.16255814e-02,  1.43651171e-02,
        2.02674349e-03, -5.88953542e-03, -2.17011590e-02,  1.02302311e-02,
       -2.49313917e-02, -5.65232616e-03, -2.25581434e-02,  8.29069968e-03,
       -1.44069805e-03,  2.25197673e-02, -6.81395701e-04, -1.37232570e-02,
       -1.26674427e-02, -3.35569866e-02,  1.10627888e-02, -2.37208814e-03,
       -2.30000000e-02,  7.58616179e-02, -5.03487710e-04, -2.51116175e-02,
        9.26511642e-03, -2.52558179e-02, -1.51058156e-02, -9.51627828e-03,
        1.17523270e-02,  1.22441910e-03,  1.08139520e-03,  3.39302444e-03,
        2.20116391e-03,  1.46860480e-02, -1.43686021e-02,  5.76395402e-03,
        1.74162779e-02, -4.76220921e-02, -1.72569733e-02, -1.49988411e-02,
       -1.77732538e-02,  1.58907007e-02, -7.23255938e-03,  2.43825577e-02,
       -2.7310

In [112]:
# We encode the data

import numpy as np

inputs = np.array([vectorized_training[article]['vector'] for article in vectorized_training])
outputs = np.array([[(1 if occupation in vectorized_training[article]['occupations'] else 0)
                    for occupation in occupations ] for article in vectorized_training])

In [113]:
print(len(outputs[0]))
print(vectorized_training['George_Washington'])

100
{'vector': array([-1.45162819e-02, -2.45802402e-02, -4.59302496e-03, -4.09372151e-02,
       -4.47662771e-02, -4.18604538e-03, -3.15232435e-03, -1.44802360e-02,
       -1.68499984e-02, -3.69651243e-03, -1.16255814e-02,  1.43651171e-02,
        2.02674349e-03, -5.88953542e-03, -2.17011590e-02,  1.02302311e-02,
       -2.49313917e-02, -5.65232616e-03, -2.25581434e-02,  8.29069968e-03,
       -1.44069805e-03,  2.25197673e-02, -6.81395701e-04, -1.37232570e-02,
       -1.26674427e-02, -3.35569866e-02,  1.10627888e-02, -2.37208814e-03,
       -2.30000000e-02,  7.58616179e-02, -5.03487710e-04, -2.51116175e-02,
        9.26511642e-03, -2.52558179e-02, -1.51058156e-02, -9.51627828e-03,
        1.17523270e-02,  1.22441910e-03,  1.08139520e-03,  3.39302444e-03,
        2.20116391e-03,  1.46860480e-02, -1.43686021e-02,  5.76395402e-03,
        1.74162779e-02, -4.76220921e-02, -1.72569733e-02, -1.49988411e-02,
       -1.77732538e-02,  1.58907007e-02, -7.23255938e-03,  2.43825577e-02,
       -2.

> Using keras, define a sequential neural network with two layers. Use categorical_crossentropy as a loss function and softmax as the activation function of the output layer

You can look into the documentation here: https://keras.io/getting-started/sequential-model-guide/

In [114]:
from tensorflow import keras
# from keras.models import Sequential
# from keras.layers import Dense
## Compile the model here
model = keras.models.Sequential([
    keras.layers.Dense(200, input_shape=(300,)),
    keras.layers.Activation('relu'),
    keras.layers.Dense(100),
    keras.layers.Activation('softmax'),
])
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [115]:
## Then train the model on ```inputs``` and ```outputs```
model.fit(inputs,outputs)

Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x12da80b38>

> Complete the function predict: output the list of occupations where the corresponding neuron on the output layer of our model has a value > 0.1

In [116]:
def predict(model, article_name, vectorized_dataset):
    prediction = []
    ## Code here
#     print(vectorized_training['Elvis_Presley']['vector'])
    result = model.predict(vectorized_dataset[article_name]['vector'].reshape(1,300)).reshape(100,1)
    for i in range(0,len(result)):
        if result[i] > 0.1:
            prediction.append(occupations[i])
    return prediction

# print(predict(model, 'Elvis_Presley', vectorized_training))
# # should be {'Q177220'}
# print(inputs[:1].shape)
# result = model.predict(vectorized_training['George_Washington']['vector'].reshape(1,300)).reshape(100,1)
# # print((result>0.1).*)
# for i in range(0,len(result)):
#     if result[i] > 0.1:
#         print(occupations[i])
# print(result.shape)
# print(occupations)
# print(vectorized_training['George_Washington']['occupations'])

In [117]:
print(vectorized_training['Elvis_Presley']['vector'].shape)
# model.predict(vectorized_training['Elvis_Presley']['vector'])

(300,)


In [118]:
def evaluate_nn(vectorized_training, model):
    nexample = 0
    accuracy = 0.
    prediction = None
    for article_name in vectorized_training:
        prediction = predict(model, article_name, vectorized_training)
        p = frozenset(prediction)
        g = frozenset(vectorized_training[article_name]['occupations'])
        accuracy += 1.*len(p & g) / len(p | g)
        nexample += 1
    return accuracy / nexample
evaluate_nn(vectorized_training, model)

0.5933197136122408

## Task 3 - Your approach

> Propose your own approach (extend previous examples or use original approaches) to improve the accuracy for this task. Apply it to the testing set and put the result as a json file with your submission.

In [119]:
from tensorflow import keras
# from keras.models import Sequential
# from keras.layers import Dense
## Compile the model here
model = keras.models.Sequential([
    keras.layers.Dense(280, input_shape=(300,)),
    keras.layers.Activation('relu'),
    keras.layers.Dense(100),
    keras.layers.Activation('softmax'),
])
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [120]:
model.fit(inputs,outputs)

Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x110c45c18>

In [125]:
def predict(model, article_name, vectorized_dataset):
    prediction = []
    ## Code here
#     print(vectorized_training['Elvis_Presley']['vector'])
    result = model.predict(vectorized_dataset[article_name]['vector'].reshape(1,300)).reshape(100,1)
    for i in range(0,len(result)):
        if result[i] > 0.2:
            prediction.append(occupations[i])
    return prediction

In [126]:
def evaluate_nn(vectorized_training, model):
    nexample = 0
    accuracy = 0.
    prediction = None
    for article_name in vectorized_training:
        prediction = predict(model, article_name, vectorized_training)
        p = frozenset(prediction)
        g = frozenset(vectorized_training[article_name]['occupations'])
        accuracy += 1.*len(p & g) / len(p | g)
        nexample += 1
    return accuracy / nexample
evaluate_nn(vectorized_training, model)

0.6452663868286385

In [127]:
import spacy
nlp = spacy.load('/tmp/en_vectors_wiki_lg')
training_set_copy = training_set
def vectorize(dataset, nlp):
    result = {}
    for example in dataset:
        doc = nlp(example['summary'], disable=['parser', 'tagger'])
        result[example['title']] = {}
        result[example['title']]['vector'] = doc.vector
        if 'occupations' in example:
            result[example['title']]['occupations'] = example['occupations']
    return result

In [128]:
nlp = spacy.load('/tmp/en_vectors_wiki_lg')

In [129]:
import spacy
import nltk
training_set_copy = training_set
def vectorize1(dataset, nlp):
    result = {}
    for example in training_set_copy:
    #     print(example)
        summary = example['summary']
        new_sentence = ""
        tokens = nltk.word_tokenize(summary)
        t = nltk.pos_tag(tokens)
    #     print(t)
        for i in t:
            if i[1] in ["NN","NNS","NNP","NNPS"]:
                new_sentence += i[0]
                new_sentence += " "
        example['summary'] = new_sentence
        doc = nlp(example['summary'], disable=['parser', 'tagger'])
        result[example['title']] = {}
        result[example['title']]['vector'] = doc.vector
        if 'occupations' in example:
            result[example['title']]['occupations'] = example['occupations']
    return result
vectorized_training1 = vectorize1(training_set_copy, nlp)

In [130]:
import numpy as np

inputs1 = np.array([vectorized_training1[article]['vector'] for article in vectorized_training1])
outputs1 = np.array([[(1 if occupation in vectorized_training1[article]['occupations'] else 0)
                    for occupation in occupations ] for article in vectorized_training1])

In [131]:
from tensorflow import keras
# from keras.models import Sequential
# from keras.layers import Dense
## Compile the model here
model1 = keras.models.Sequential([
    keras.layers.Dense(200, input_shape=(300,)),
    keras.layers.Activation('relu'),
    keras.layers.Dense(100),
    keras.layers.Activation('softmax'),
])
model1.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [132]:
model1.fit(inputs1,outputs1)

Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x117249c18>

In [137]:
def predict(model, article_name, vectorized_dataset):
    prediction = []
    ## Code here
#     print(vectorized_training['Elvis_Presley']['vector'])
    result = model.predict(vectorized_dataset[article_name]['vector'].reshape(1,300)).reshape(100,1)
    for i in range(0,len(result)):
        if result[i] > 0.25:
            prediction.append(occupations[i])
    return prediction

In [138]:
def evaluate_nn(vectorized_training, model):
    nexample = 0
    accuracy = 0.
    prediction = None
    for article_name in vectorized_training:
        prediction = predict(model, article_name, vectorized_training)
        p = frozenset(prediction)
        g = frozenset(vectorized_training[article_name]['occupations'])
        accuracy += 1.*len(p & g) / len(p | g)
        nexample += 1
    return accuracy / nexample
evaluate_nn(vectorized_training1, model1)

0.6624109967474497

***IMPORTANT*** Output format of requested file 'results.json.gz': each line must be a json string representing a dictionnary:
> ```{ 'title': THE_ARTICLE_NAME, 'prediction': [THE_LIST_OF_OCCUPATIONS]}```

In [14]:
# For example if testset_solutions is a dictionnary: article_name (key) -> prediction_list (value) use this function:
def export(testset_solutions):
    with gzip.open('results.json.gz', 'wt') as output:
        for article in testset_solutions:
            output.write(json.dumps({'title':article, 'prediction':testset_solutions[article]}) + "\n")