In [33]:
import wikipediaapi
import requests
import pandas as pd
import random

# Step 1: Crawl a real-world dataset

## Data Source:
The data was sourced from Wikipedia using the `requests` library and the `wikipediaapi`. 

## Variables of Interest:
I mainly focused on 

In [34]:
class WikiDataCrawl:
    def __init__(self):
        self.__user_agent = "Wikipedia_Data_Analytics (nv22224@bristol.ac.uk)"
        self.__titles = []
        self.__wiki_data = pd.DataFrame(columns=['Title','Sumary', 'Sumary Length','Sumary Word Count','Sections Length',
                                                 'Text Length', 'Word Count',
                                                 'Language Count', 'Pageviews'])
        
    def get_wikipedia_page_info(self, title: str):
        # Create a Wikipedia API object
        wiki_wiki = wikipediaapi.Wikipedia(self.__user_agent, 'en')

        # Obtain a page object
        page = wiki_wiki.page(title)

        if not page.exists():
            return None

        # Obtain the length and word count of the page
        page_length = len(page.text)
        page_word_count = len(page.text.split())

        # Construct an API URL to retrieve Wikipedia page information
        api_url = f"https://en.wikipedia.org/w/api.php?action=query&format=json&titles={title}&prop=info|pageviews"
        response = requests.get(api_url)
        data = response.json()

        # Obtain the language count of the page 
        langlinks = page.langlinks
        language_count = len(langlinks)

        try:
            # Obtain the page views
            page_info = data["query"]["pages"]
            page_id = list(page_info.keys())[0]
            pageviews = page_info[page_id]["pageviews"]
            
        except KeyError:
            return None


        # Create a dictionary for storing page information.
        page_data = {
            "Title": page.title,
            "Sumary": page.summary[:100],
            "Sumary Length": len(page.summary),
            "Sumary Word Count": len(page.summary.split()),
            "Sections Length": len(page.sections),
            "Text Length": page_length,
            "Word Count": page_word_count,
            "Language Count": language_count + 1, # English page itself should be included
            "Pageviews": pageviews
        }
        return page_data
    
    def get_random_wikipedia_titles(self, title_nums=150):
        api_url = f"https://en.wikipedia.org/w/api.php?action=query&format=json&list=random&rnnamespace=0&rnlimit={title_nums}"
        response = requests.get(api_url)
        data = response.json()

        # Obtain the title list
        self.__titles = [page['title'] for page in data['query']['random']]

        return self.__titles 
    
    def __call__(self, title_nums):
        
        title_list = self.get_random_wikipedia_titles(title_nums) 
        
        for title in title_list:
            page_data = self.get_wikipedia_page_info(title)
            if page_data is not None:
                self.__wiki_data.loc[len(self.__wiki_data)] = page_data
        
        return self.__wiki_data

In [35]:
wiki_data_crawl = WikiDataCrawl()
wiki_data = wiki_data_crawl(600)
wiki_data.to_csv('wiki_data.csv', index=False)

# Step 2: Perform data preparation & cleaning

In [36]:
wiki_data = pd.read_csv("wiki_data.csv")

## Check whether the dataset has any missing value

In [37]:
display(wiki_data.head(10))
print("===========================================================================================================")
print(f"wiki_data's shape is {wiki_data.shape}")
print("===========================================================================================================")
display(wiki_data.info())

Unnamed: 0,Title,Sumary,Sumary Length,Sumary Word Count,Sections Length,Text Length,Word Count,Language Count,Pageviews
0,Frederick August Baumbach,Frederick August Baumbach (1753 – 30 November ...,1725,267,0,1725,267,4,"{'2023-10-26': None, '2023-10-27': 1, '2023-10..."
1,The Rat Trap,The Rat Trap (1918) is a four-act drama by Noë...,397,73,8,4439,749,1,"{'2023-10-26': 4, '2023-10-27': 6, '2023-10-28..."
2,Find Another Way,"""Find Another Way"" is a song by German musicia...",246,42,6,1332,224,3,"{'2023-10-26': 3, '2023-10-27': 1, '2023-10-28..."
3,Pteriomorphia,The Pteriomorphia comprise a subclass of saltw...,719,105,4,3869,518,28,"{'2023-10-26': 22, '2023-10-27': 35, '2023-10-..."
4,Danielle Bunten Berry,"Danielle Bunten Berry (February 19, 1949 – Jul...",548,96,11,11283,1913,11,"{'2023-10-26': 58, '2023-10-27': 43, '2023-10-..."
5,City Lights,City Lights is a 1931 American silent romantic...,1805,292,8,27890,4750,53,"{'2023-10-26': 428, '2023-10-27': 506, '2023-1..."
6,Eliza A. Dupuy,"Eliza Ann Dupuy (c. 1814 – December 29, 1880) ...",895,149,5,6433,1037,1,"{'2023-10-26': 0, '2023-10-27': 1, '2023-10-28..."
7,El-Gendi Fortress,The El-Gendi Fortress (Arabic: قلعة الجندي) is...,458,67,3,680,100,6,"{'2023-10-26': 3, '2023-10-27': 20, '2023-10-2..."
8,Bądzsław,Bądzsław [ˈbɔ̃tswaf] is a masculine Old Polish...,196,34,0,196,34,2,"{'2023-10-26': None, '2023-10-27': 0, '2023-10..."
9,Acceptance discography,"The discography of Acceptance, an American alt...",135,20,2,199,29,1,"{'2023-10-26': 4, '2023-10-27': 2, '2023-10-28..."


wiki_data's shape is (499, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              499 non-null    object
 1   Sumary             499 non-null    object
 2   Sumary Length      499 non-null    int64 
 3   Sumary Word Count  499 non-null    int64 
 4   Sections Length    499 non-null    int64 
 5   Text Length        499 non-null    int64 
 6   Word Count         499 non-null    int64 
 7   Language Count     499 non-null    int64 
 8   Pageviews          499 non-null    object
dtypes: int64(6), object(3)
memory usage: 35.2+ KB


None

In [38]:
wiki_data["Pageviews"] = wiki_data["Pageviews"].apply(lambda x: eval(x))

## Create new features

In [39]:
def dict_average(dictionary: dict):
    # Check if dictionary is null
    if dictionary is None:
        return None  

    # Filter null value
    valid_values = [value for value in dictionary.values() if value is not None]
    
    # Check if all values are null
    if valid_values is None:
        return None  

    # Calculate the average
    average = round((sum(valid_values) / len(valid_values)))
    return average

def dict_max(dictionary: dict):
    
    value_set = set()
    
    for value in dictionary.values():
        if value is not None:
            value_set.add(value)
            
    if value_set is not None:
        return max(value_set)
    else:
        return None

wiki_data["Average Pageviews"] = wiki_data["Pageviews"].apply(dict_average)
wiki_data["Max Pageviews"] = wiki_data["Pageviews"].apply(dict_max)

In [40]:
wiki_data.drop(columns=["Pageviews"],axis=1,inplace=True)

In [41]:
wiki_data.head()

Unnamed: 0,Title,Sumary,Sumary Length,Sumary Word Count,Sections Length,Text Length,Word Count,Language Count,Average Pageviews,Max Pageviews
0,Frederick August Baumbach,Frederick August Baumbach (1753 – 30 November ...,1725,267,0,1725,267,4,1,4
1,The Rat Trap,The Rat Trap (1918) is a four-act drama by Noë...,397,73,8,4439,749,1,7,19
2,Find Another Way,"""Find Another Way"" is a song by German musicia...",246,42,6,1332,224,3,3,11
3,Pteriomorphia,The Pteriomorphia comprise a subclass of saltw...,719,105,4,3869,518,28,172,4317
4,Danielle Bunten Berry,"Danielle Bunten Berry (February 19, 1949 – Jul...",548,96,11,11283,1913,11,45,96


## Step 3: Perform exploratory analysis

In [4]:
# page view and time
# page view and language count