In [1]:
!pip3 install wikipedia-api 

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [1]:
import torch
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('punkt')
import pickle
import io
import os
import numpy
import wikipediaapi
import re
import time
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /home/azza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
class LoadingWikiPages() : 
    def __init__(self, dataset_name = None):
        self.headers = requests.utils.default_headers()
        self.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        self.wikipedia = wikipediaapi.Wikipedia(language='en',extract_format=wikipediaapi.ExtractFormat.WIKI, headers=self.headers)
        self.dataset = self.read_data_file(dataset_name)
        self.wiki_languages = ['en','ar','es','fr','ru','pt']
        self.wiki_pages = self.get_wikipedia_pages(dataset_name)

    def read_data_file(self, dataset_name):
        '''
        This method reads the dataset file, and also rename some columns names to be consistent with other datasets
        Input : name of the dataset, it could be wikipediaSimilarity353, or WikiSRS_relatedness or WikiSRS_similarity
        Output: a dataframe for the corresponding input dataset name
        '''
        
        if dataset_name == "wikipediaSimilarity353":
            data = pd.read_csv("raw_data/wikipediaSimilarity353.csv")
            data['titleA'] = data['titleA'].replace(['Production, costs, and pricing'],'Production')#no wikipedia page for 'Production, costs, and pricing'
        elif dataset_name == "WikiSRS_relatedness" or dataset_name == "WikiSRS_similarity":
            data = pd.read_csv("raw_data/"+dataset_name+".csv", sep='\t') 
            data = data.drop(['RawScores', 'StdDev'], axis = 1)
            data.rename(columns = {'Term1':'termA', 'String1':'titleA',
                                   'Term2':'termB', 'String2':'titleB',
                                   'Mean' :'relatedness'}, inplace = True)
        return data

    def clean_text(self, text):
        '''
        This method does a light text cleaning, such as removing punctuation and special characters like \ and \n
        '''
        
        text = re.sub("\n", "", text)
        text = re.sub("\'", "", text)
        new_text = re.sub('[!*(-);\':"]', "", text)
        return new_text

    def save_data_in_file(self, dataset, dataset_name):
        '''
        This method saves the generated multilingual data in a file 
        '''
        
        data = pd.DataFrame(dataset)
        data.to_csv('data/multilingual_'+datasetw_name+'.csv')
        return
    
    def get_wikipedia_pages(self, dataset_name):
        '''
        This method extract the english wikipedia page for each entity title in the dataset.
        Then, it pulls the links of the same wikipedia title pages in different languages.
        Finally, it extracts the wikipeia articles for each link. 
        Input : dataset name.
        output : a list of wikipedia pages in different languages for all entities in the datset. 
        '''
        
        pages = []
        pages_titles = []
        for index, row in tqdm(self.dataset.iterrows(), desc="Extracting Multilingual Pages"):
            if row['titleA'] not in pages_titles :
                page1 = self.wikipedia.page(row['titleA'])
                try:
                    page1_content = page1.text
                except:
                    time.sleep(15)
                    print("Try again after sleeping for 15 seconds")
                    page1_content = page1.text
                if len(page1_content) > 1000: 
                    pages.append({"title": page1.title, "lang": "en", "content": self.clean_text(page1_content)})
                    pages_titles.append(row['titleA'])
                    soup = BeautifulSoup(urlopen(page1.fullurl))
                    links = [(el.get('lang'), el.get('title')) for el in soup.select('li.interlanguage-link > a')]
                    for lang, title in links:
                        if lang in self.wiki_languages:
                            page_title = title.split(u' – ')[0]
                            wiki1 = wikipediaapi.Wikipedia(language=lang,extract_format=wikipediaapi.ExtractFormat.WIKI)
                            page = wiki1.page(page_title)
                            try:
                                p_content = page.text
                            except:
                                time.sleep(15)
                                print("Try again after sleeping for 15 seconds")
                                p_content = page.text
                            if len(p_content)>1000:
                                pages.append({"title": page1.title,"lang": lang, "content": p_content})
                                time.sleep(2)
                    
                            

            if row['titleB'] not in pages_titles :
                page2 = self.wikipedia.page(row['titleB'])
                try:
                    page2_content = page2.text
                except:
                    time.sleep(15)
                    print("Try again after sleeping for 15 seconds")
                    page2_content = page2.text
                if len(page2_content) > 1000: 
                    pages.append({"title": page2.title, "lang": "en", "content": self.clean_text(page2_content)})
                    pages_titles.append(row['titleB'])
                    soup = BeautifulSoup(urlopen(page2.fullurl))
                    links = [(el.get('lang'), el.get('title')) for el in soup.select('li.interlanguage-link > a')]
                    for lang, title in links:
                        if lang in self.wiki_languages:
                            page_title = title.split(u' – ')[0]
                            wiki1 = wikipediaapi.Wikipedia(language=lang,extract_format=wikipediaapi.ExtractFormat.WIKI)
                            page = wiki1.page(page_title)
                            try:
                                p_content = page.text
                            except:
                                time.sleep(15)
                                print("Try again after sleeping for 15 seconds")
                                p_content = page.text
                            if len(p_content)>1000:
                                pages.append({"title": page2.title,"lang": lang, "content": p_content})
                                time.sleep(2)
        self.save_data_in_file(pages, dataset_name)
        return pages

In [None]:
# Extract the multilingual Wikipedia articles for WikiSRS-similarity dataset
wikisrs_sim = LoadingWikiPages(dataset_name="WikiSRS_similarity")

Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds


In [13]:
# Extract the multilingual Wikipedia articles for WikiSRS-relatedness dataset
wikisrs_rel = LoadingWikiPages(dataset_name="WikiSRS_relatedness")

Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds


In [7]:
# Extract the multilingual Wikipedia articles for wikipediaSimilarity353 dataset
wiki_sim353 = LoadingWikiPages(dataset_name="wikipediaSimilarity353")

In [4]:
wikisrs_sim = pd.read_csv("data/multilingual_WikiSRS_similarity.csv")
wikisrs_rel = pd.read_csv("data/multilingual_WikiSRS_relatedness.csv")
wiki_sim353 = pd.read_csv("data/multilingual_wikipediaSimilarity353.csv")

In [7]:
wiki_sim353

Unnamed: 0.1,Unnamed: 0,title,lang,content
0,0,Love,en,Love encompasses a range of strong and positiv...
1,1,Love,ar,الحب هو مجموعة متنوعة من المشاعر الإيجابيَّة و...
2,2,Love,es,El amor es un concepto universal relativo a la...
3,3,Love,fr,L'amour désigne un sentiment intense d'affecti...
4,4,Love,pt,Amor (do latim amore) é uma emoção ou sentimen...
...,...,...,...,...
2069,2069,Office,en,An office is a space where an organizations em...
2070,2070,Office,ar,المَكْتَب (الجمع: مَكَاتِب) هو بشكل عام عبارة ...
2071,2071,Office,es,Una oficina es un salón destinado al trabajo. ...
2072,2072,Office,fr,"En immobilier, un bureau est un espace où s’ac..."


In [6]:
wikisrs_rel

Unnamed: 0.1,Unnamed: 0,title,lang,content
0,0,Moscow,en,"Moscow MOS-koh, US chiefly MOS-kow Russian М..."
1,1,Moscow,ar,موسكو (بالروسية: Москва) (وتنطق: ماسْكڤَا) هي ...
2,2,Moscow,es,"Moscú (en ruso, Москва́, pronunciado /mɐˈskva/..."
3,3,Moscow,fr,"Moscou (en russe : Москва, Moskva, [mɐˈskva] )..."
4,4,Moscow,pt,Moscou (português brasileiro) ou Moscovo (port...
...,...,...,...,...
3137,3137,Albania,ar,ألبانيا (بالألبانية: Shqipëri) أو رسمياً جمهور...
3138,3138,Albania,es,"Albania (en albanés, Shqipëri o Shqipëria), of..."
3139,3139,Albania,fr,"L'Albanie, en forme longue la république d'Alb..."
3140,3140,Albania,pt,"A Albânia (em albanês: Shqipëri/Shqipëria), o..."


In [8]:
wikisrs_sim

Unnamed: 0.1,Unnamed: 0,title,lang,content
0,0,Ferrari,en,Ferrari S.p.A. Italian [ferˈraːri] is an Ital...
1,1,Ferrari,ar,فيراري (بالإنجليزية: Ferrari)‏ هي شركة إيطالية...
2,2,Ferrari,es,Ferrari es una compañía de automóviles deporti...
3,3,Ferrari,fr,Ferrari S.p.A. est un constructeur automobile ...
4,4,Ferrari,pt,Ferrari é uma fabricante italiana de carros es...
...,...,...,...,...
3150,3150,Miley Cyrus,ar,مايلي ري سايرس (بالإنجليزية: Miley Ray Cyrus؛...
3151,3151,Miley Cyrus,es,Miley Ray Cyrus (nacida como Destiny Hope Cyru...
3152,3152,Miley Cyrus,fr,"Destiny Hope Cyrus dite Miley Cyrus, née le 23..."
3153,3153,Miley Cyrus,pt,"Miley Ray Cyrus (Nashville, 23 de novembro de ..."


In [11]:
frames = [wiki_sim353, wikisrs_rel, wikisrs_sim]
docsim3 = pd.concat(frames)
docsim3 = docsim3.drop('Unnamed: 0', axis=1)

In [12]:
docsim3

Unnamed: 0,title,lang,content
0,Love,en,Love encompasses a range of strong and positiv...
1,Love,ar,الحب هو مجموعة متنوعة من المشاعر الإيجابيَّة و...
2,Love,es,El amor es un concepto universal relativo a la...
3,Love,fr,L'amour désigne un sentiment intense d'affecti...
4,Love,pt,Amor (do latim amore) é uma emoção ou sentimen...
...,...,...,...
3150,Miley Cyrus,ar,مايلي ري سايرس (بالإنجليزية: Miley Ray Cyrus؛...
3151,Miley Cyrus,es,Miley Ray Cyrus (nacida como Destiny Hope Cyru...
3152,Miley Cyrus,fr,"Destiny Hope Cyrus dite Miley Cyrus, née le 23..."
3153,Miley Cyrus,pt,"Miley Ray Cyrus (Nashville, 23 de novembro de ..."
