## Auteur: Vincent Gauthier

## Imports

In [1]:
from nltk import sent_tokenize, word_tokenize, pos_tag, FreqDist
from nltk.corpus import stopwords
import pandas as pd
import nltk
import collections
import csv
import urllib
import networkx as nx
import numpy as np
import scipy as sp
from numpy import linalg as LA

In [2]:
# Fetch the NLTK stop-word corpus that get_keywords reads its English
# stop-word list from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vgauthier/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Fonction d'extraction simple de mots-clés dans un texte

In [3]:
def get_keywords(text, limit=10):
    """Return the most frequent content words of ``text`` as a single string.

    The text is split with ``nltk.wordpunct_tokenize``; only purely
    alphabetic tokens are kept and lower-cased, and English stop words
    ('of', 'the', ...) are discarded.  The remaining words are ranked by
    decreasing frequency.

    Requires the NLTK 'stopwords' corpus (``nltk.download('stopwords')``).

    Args:
        text: Raw text to analyse.
        limit: Maximum number of keywords to return.  Zero or a negative
            value yields an empty string.

    Returns:
        Up to ``limit`` keywords joined by ", " (fewer when the text has
        fewer distinct content words, "" when it has none).
    """
    # The previous loop appended a keyword before checking the limit, so a
    # non-positive limit still returned one keyword.
    if limit <= 0:
        return ""
    tokens = nltk.wordpunct_tokenize(text)
    # Keep only real words (drops punctuation and numbers), normalised to
    # lower case so that 'China' and 'china' are counted together.
    words = [w.lower() for w in tokens if w.isalpha()]
    # A set gives constant-time membership tests; the name avoids shadowing
    # the `stopwords` module imported at the top of the notebook.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    content = [w for w in words if w not in stop_words]
    # Count each remaining word and keep the `limit` most frequent ones.
    freq = FreqDist(content)
    return ", ".join(word for word, _ in freq.most_common(limit))

## Test keywords

In [4]:
# Smoke test: print the top keywords of a single article.
# The directory is spelled "wikipedia" (lower case) like the other data paths
# in this notebook, so the cell also works on case-sensitive filesystems.
# NOTE(review): assumes the article files are UTF-8 encoded - confirm.
with open("../Data/wikipedia/plaintext_articles/Zheng_He.txt", 'r', encoding="utf-8") as textfile:
    text = textfile.read()
# The file is no longer needed once its content has been read.
print(get_keywords(text))

chinese, zheng, china, map, ming, emperor, ships, voyages, muslim, malacca


## Parse wikipedia dataset 

In [None]:
from urllib import parse

# Build the article table: one [PageID, decoded title, encoded title, keywords]
# row per article listed in articles.tsv, plus `wiki_id`, a lookup from the
# encoded title to its PageID.
# newline="" is the csv module's recommended way to open its input; the
# explicit encoding keeps decoding independent of the platform locale.
# NOTE(review): assumes the dataset files are UTF-8 encoded - confirm.
with open("../Data/wikipedia/articles.tsv", encoding="utf-8", newline="") as articlefile:
    wikipedia = []
    wiki_id = {}
    idp = 1  # PageIDs are assigned sequentially, starting at 1
    # The file is tab-separated, like links.tsv.
    reader = csv.reader(articlefile, delimiter='\t')
    for row in reader:
        # Skip blank lines, empty first fields and '#' comment lines
        # (startswith cannot raise IndexError on an empty field).
        if not row or not row[0] or row[0].startswith("#"):
            continue
        # The article text lives in a file named after the encoded title.
        with open("../Data/wikipedia/plaintext_articles/" + row[0] + ".txt", 'r', encoding="utf-8") as textfile:
            text = textfile.read()
        keywords = get_keywords(text)
        wiki_article = [idp, parse.unquote(row[0]), row[0], keywords]
        wiki_id[row[0]] = idp
        idp += 1
        wikipedia.append(wiki_article)

## Sauvegarde la base de données

In [None]:
# Turn the collected article rows into a table indexed by PageID, store it as
# a pickle (protocol 4) and preview the first rows.
wikipedia = pd.DataFrame(
    wikipedia,
    columns=['PageID', 'Page Title', 'Page Title Encoded', 'Keywords'],
).set_index(['PageID'])
wikipedia.to_pickle("../Data/wikipedia/wikipedia.pkl", protocol=4)
wikipedia.head()

In [23]:
# Also export the article table as gzip-compressed Parquet and as CSV
# (index=True keeps the index in the CSV output).
wikipedia.to_parquet('../Data/wikipedia/wikipedia.parquet.gzip', compression='gzip')
wikipedia.to_csv('../Data/wikipedia/wikipedia.csv', index=True)

## Sauvegarde le graphe des liens entre les pages Wikipedia

In [10]:
# Build the directed link graph: one node per article (keyed by PageID) and
# one edge per (source, target) pair listed in links.tsv.
G = nx.DiGraph()
# Every article becomes a node carrying its decoded title, so pages that never
# appear in links.tsv are still present and labelled.  The title comes from
# the 'Page Title' column, which holds the URL-decoded article name.
for idx, title in wikipedia['Page Title'].items():
    G.add_node(idx, title=title)
# NOTE(review): assumes links.tsv is UTF-8 encoded - confirm.
with open('../Data/wikipedia/links.tsv', encoding="utf-8", newline="") as links:
    reader = csv.reader(links, delimiter='\t')
    for row in reader:
        # Skip blank lines, empty first fields and '#' comment lines.
        if not row or not row[0] or row[0].startswith("#"):
            continue
        # Both columns hold encoded titles; map them to PageIDs via wiki_id.
        G.add_edge(wiki_id[row[0]], wiki_id[row[1]])

In [11]:
import pickle

# nx.write_gpickle was deprecated in NetworkX 2.6 and removed in 3.0.  Dumping
# the graph with the standard pickle module works on every NetworkX version.
with open("../Data/wikipedia/wikipedia.pickle", "wb") as graph_file:
    pickle.dump(G, graph_file, pickle.HIGHEST_PROTOCOL)

In [12]:
# Export the link graph in GraphML format.
nx.write_graphml(G, "../Data/wikipedia/wikipedia.graphml")