### Extracting Wikipages in different languages

This notebook extracts the Wikipedia pages, in 6 different languages (English, Arabic, Spanish, French, Portuguese and Russian), for each wiki title in the wikipediaSimilarity353, WikiSRS_relatedness, and WikiSRS_similarity datasets, and then saves the output pages for each dataset in a file.

In [None]:
!pip3 install wikipedia-api 

In [4]:
import torch
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('punkt')
import pickle
import io
import os
import numpy
import wikipediaapi
import re
import time
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /home/azza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
class LoadingWikiPages:
    '''
    Collects the Wikipedia page of every entity title of a dataset, in English
    and in the other configured languages, and saves the result as a CSV file.

    Creating an instance runs the whole pipeline: the dataset file is read,
    the pages are downloaded and the output file is written. The collected
    records are kept in ``self.wiki_pages``.
    '''

    # A page is kept only when its raw text is longer than this many characters.
    MIN_PAGE_LENGTH = 1000
    # Seconds to wait before the single retry of a failed request.
    RETRY_DELAY_SECONDS = 15
    # Seconds to pause after a non-English page has been stored.
    REQUEST_DELAY_SECONDS = 2
    # Tab-separated datasets whose columns are renamed to the common schema.
    _WIKISRS_DATASETS = ("WikiSRS_relatedness", "WikiSRS_similarity")
    # Characters deleted by clean_text: newline, both quote characters and ! * ( ) ; :
    _REMOVED_CHARS = str.maketrans("", "", "\n!*();:'\"")

    def __init__(self, dataset_name = None):
        '''
        Input : name of the dataset, it must be wikipediaSimilarity353,
                WikiSRS_relatedness or WikiSRS_similarity.
        Raises: ValueError (from read_data_file) for any other name,
                including the default None.
        '''
        self.headers = requests.utils.default_headers()
        self.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        self.wikipedia = wikipediaapi.Wikipedia(language='en',extract_format=wikipediaapi.ExtractFormat.WIKI, headers=self.headers)
        self.dataset = self.read_data_file(dataset_name)
        self.wiki_languages = ['en','ar','es','fr','ru','pt']
        self.wiki_pages = self.get_wikipedia_pages(dataset_name)

    def read_data_file(self, dataset_name):
        '''
        This method reads the dataset file, and also renames some columns to be consistent with the other datasets.
        Input : name of the dataset, it could be wikipediaSimilarity353, or WikiSRS_relatedness or WikiSRS_similarity
        Output: a dataframe for the corresponding dataset name
        Raises: ValueError when the dataset name is not one of the three above
        '''

        if dataset_name == "wikipediaSimilarity353":
            data = pd.read_csv("data/wikipediaSimilarity353.csv")
            # no wikipedia page for 'Production, costs, and pricing'
            data['titleA'] = data['titleA'].replace(['Production, costs, and pricing'],'Production')
            return data

        if dataset_name in self._WIKISRS_DATASETS:
            data = pd.read_csv("data/"+dataset_name+".csv", sep='\t')
            data = data.drop(['RawScores', 'StdDev'], axis = 1)
            return data.rename(columns = {'Term1':'termA', 'String1':'titleA',
                                          'Term2':'termB', 'String2':'titleB',
                                          'Mean' :'relatedness'})

        # Previously an unknown name fell through to `return data` and failed
        # with an UnboundLocalError that did not say what was wrong.
        raise ValueError(
            "Unknown dataset name %r; expected 'wikipediaSimilarity353', "
            "'WikiSRS_relatedness' or 'WikiSRS_similarity'" % (dataset_name,))

    def clean_text(self, text):
        '''
        This method does a light text cleaning: it deletes newlines, single and
        double quotes, and the characters ! * ( ) ; :
        Everything else, hyphens included, is left untouched.
        '''

        return text.translate(self._REMOVED_CHARS)

    def save_data_in_file(self, dataset, dataset_name):
        '''
        This method saves the list of page records as data/multilingual_<dataset_name>.csv
        '''

        data = pd.DataFrame(dataset)
        data.to_csv('data/multilingual_'+dataset_name+'.csv')

    def _fetch_with_retry(self, fetch):
        '''
        Calls ``fetch()`` and returns its result; after a failure it waits
        RETRY_DELAY_SECONDS and tries exactly once more. A second failure is
        raised to the caller.
        '''

        try:
            return fetch()
        except Exception:
            # `except Exception` (not a bare except) so that Ctrl-C still stops the run.
            time.sleep(self.RETRY_DELAY_SECONDS)
            print(f"Try again after sleeping for {self.RETRY_DELAY_SECONDS} seconds")
            return fetch()

    def _collect_pages(self, title):
        '''
        Returns the records for one dataset title: the English page first, then
        one record per configured language that has a long enough page.
        The list is empty when the English page is too short.
        '''

        english_page = self.wikipedia.page(title)
        english_text = self._fetch_with_retry(lambda: english_page.text)
        if len(english_text) <= self.MIN_PAGE_LENGTH:
            return []

        records = [{"title": english_page.title, "lang": "en", "content": self.clean_text(english_text)}]

        # The library's language links replace scraping the rendered HTML page,
        # and the linked pages are fetched through the already configured client.
        langlinks = self._fetch_with_retry(lambda: english_page.langlinks)
        for lang in self.wiki_languages:
            translated_page = langlinks.get(lang)
            if translated_page is None:
                continue
            translated_text = self._fetch_with_retry(lambda: translated_page.text)
            if len(translated_text) > self.MIN_PAGE_LENGTH:
                # Every record carries the English title. Only the English text
                # goes through clean_text; translated text is stored as returned.
                # NOTE(review): confirm that leaving translations uncleaned is intended.
                records.append({"title": english_page.title, "lang": lang, "content": translated_text})
                time.sleep(self.REQUEST_DELAY_SECONDS)
        return records

    def get_wikipedia_pages(self, dataset_name):
        '''
        This method extracts the english wikipedia page for each entity title in the dataset,
        and gets the page in the other languages.
        Input : dataset name (used to name the output file).
        Output : a list of {"title", "lang", "content"} records for all entities in the dataset.
                 The list is also written to disk through save_data_in_file.
        '''

        pages = []
        seen_titles = set()
        for title_a, title_b in zip(self.dataset['titleA'], self.dataset['titleB']):
            for title in (title_a, title_b):
                if title in seen_titles:
                    continue
                # Recorded even when the page turns out to be too short, so the
                # same title is never downloaded twice.
                seen_titles.add(title)
                pages.extend(self._collect_pages(title))
        self.save_data_in_file(pages, dataset_name)
        return pages

In [6]:
# Run the pipeline for the WikiSRS_similarity dataset. Constructing the object
# reads data/WikiSRS_similarity.csv, downloads the Wikipedia pages and writes
# data/multilingual_WikiSRS_similarity.csv; the records stay in `.wiki_pages`.
wikisrs_sim = LoadingWikiPages(dataset_name="WikiSRS_similarity")

Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds


In [13]:
# Same pipeline for the WikiSRS_relatedness dataset: reads
# data/WikiSRS_relatedness.csv and writes data/multilingual_WikiSRS_relatedness.csv.
wikisrs_rel = LoadingWikiPages(dataset_name="WikiSRS_relatedness")

Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds
Try again after sleeping for 15 seconds


In [7]:
# Same pipeline for the wikipediaSimilarity353 dataset: reads
# data/wikipediaSimilarity353.csv and writes data/multilingual_wikipediaSimilarity353.csv.
wiki_sim353 = LoadingWikiPages(dataset_name="wikipediaSimilarity353")