In [2]:
import ast
import random

import pandas as pd
import requests
import wikipediaapi

# Step 1: Crawl a real-world dataset

## Data Source:
The data was sourced from Wikipedia using the `requests` library and the `wikipediaapi` package.

## Variables of Interest:
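I mainly focused on the following per-article variables: summary length, number of sections, text length, word count, number of language editions, and daily pageviews.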

In [2]:
class WikiDataCrawl:
    """Collect summary statistics for random English Wikipedia articles.

    Calling an instance with a number ``n`` samples ``n`` random
    main-namespace article titles, fetches per-page statistics for each one
    and appends the usable results to an internal ``pandas.DataFrame``,
    which is returned.
    """

    # MediaWiki Action API endpoint used for the raw ``requests`` calls.
    _API_URL = "https://en.wikipedia.org/w/api.php"
    # Largest ``rnlimit`` sent in a single request; bigger samples are batched.
    _MAX_RANDOM_BATCH = 500
    # Seconds to wait for an HTTP response before requests raises a timeout.
    _TIMEOUT = 30
    # Column names are kept exactly as originally spelled ("Sumary") because
    # they end up as CSV headers that downstream cells read back.
    _COLUMNS = ['Title', 'Sumary', 'Sumary Length', 'Sections Length',
                'Text Length', 'Word Count',
                'Language Count', 'Pageviews']

    def __init__(self):
        self.__user_agent = "Wikipedia_Data_Analytics (nv22224@bristol.ac.uk)"
        self.__titles = []
        self.__wiki_data = pd.DataFrame(columns=self._COLUMNS)
        # One API client is enough for the whole crawl; building a new one
        # for every page is wasted work.
        self.__wiki = wikipediaapi.Wikipedia(self.__user_agent, 'en')

    def _api_query(self, params: dict) -> dict:
        """Run an ``action=query`` request and return the decoded JSON body.

        Parameters are passed through ``requests``' ``params`` argument so
        that values such as titles containing ``&``, ``+`` or ``#`` are
        URL-encoded instead of corrupting the query string.

        Raises:
            requests.RequestException: on connection problems, timeouts or a
                non-success HTTP status.
        """
        response = requests.get(
            self._API_URL,
            params={"action": "query", "format": "json", **params},
            headers={"User-Agent": self.__user_agent},
            timeout=self._TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def get_wikipedia_page_info(self, title: str):
        """Return a dict of statistics for ``title``.

        Args:
            title: Title of the article to look up.

        Returns:
            A dict keyed by the DataFrame column names, or ``None`` when the
            page does not exist or the API response carries no pageview data.
        """
        page = self.__wiki.page(title)

        if not page.exists():
            return None

        # Fetch the text once; it is used for both length and word count.
        text = page.text

        data = self._api_query({"titles": title, "prop": "info|pageviews"})

        try:
            pages = data["query"]["pages"]
            # The response is keyed by page id; a single title was requested,
            # so the first entry is the one we want.
            page_id = next(iter(pages))
            pageviews = pages[page_id]["pageviews"]
        except (KeyError, StopIteration):
            return None

        return {
            "Title": page.title,
            "Sumary": page.summary[:100],
            "Sumary Length": len(page.summary),
            "Sections Length": len(page.sections),
            "Text Length": len(text),
            "Word Count": len(text.split()),
            # langlinks lists the other languages; +1 counts the English page.
            "Language Count": len(page.langlinks) + 1,
            "Pageviews": pageviews,
        }

    def get_random_wikipedia_titles(self, title_nums=150):
        """Return ``title_nums`` distinct random main-namespace article titles.

        Requests are issued in batches of at most ``_MAX_RANDOM_BATCH`` so
        that asking for more titles than one API call returns still yields
        the requested amount. A non-positive ``title_nums`` yields ``[]``.

        Args:
            title_nums: Number of titles wanted.

        Returns:
            The list of titles, which is also remembered on the instance.
        """
        titles = []
        seen = set()

        while len(titles) < title_nums:
            batch_size = min(title_nums - len(titles), self._MAX_RANDOM_BATCH)
            data = self._api_query(
                {"list": "random", "rnnamespace": 0, "rnlimit": batch_size}
            )

            added = 0
            for page in data['query']['random']:
                candidate = page['title']
                if candidate not in seen:
                    seen.add(candidate)
                    titles.append(candidate)
                    added += 1

            # Stop rather than loop forever if the API yields nothing new.
            if added == 0:
                break

        self.__titles = titles
        return self.__titles

    def __call__(self, title_nums):
        """Crawl ``title_nums`` random articles and return the accumulated data.

        Pages for which no statistics could be obtained are skipped. Results
        of repeated calls accumulate in the same DataFrame.
        """
        title_list = self.get_random_wikipedia_titles(title_nums)

        # Collect records first and build the frame once, instead of growing
        # the DataFrame one row at a time.
        records = []
        for title in title_list:
            page_data = self.get_wikipedia_page_info(title)
            if page_data is not None:
                records.append(page_data)

        if records:
            new_rows = pd.DataFrame(records, columns=self._COLUMNS)
            if self.__wiki_data.empty:
                self.__wiki_data = new_rows
            else:
                self.__wiki_data = pd.concat(
                    [self.__wiki_data, new_rows], ignore_index=True
                )

        return self.__wiki_data

In [3]:
# Crawl random articles (requesting 600) and persist the result to CSV, which
# the data-preparation step below reloads. This cell performs live network
# requests, so its result differs from run to run.
wiki_data_crawl = WikiDataCrawl()
wiki_data = wiki_data_crawl(600)
wiki_data.to_csv('wiki_data.csv', index=False)

# Step 2: Perform data preparation & cleaning

In [3]:
# Reload the crawled data from disk. After the CSV round-trip the "Pageviews"
# column holds strings, which a later cell parses back into dictionaries.
wiki_data = pd.read_csv("wiki_data.csv")

## Check whether the dataset has any missing values

In [5]:
# Preview the first rows, then report the overall shape.
display(wiki_data.head(10))
print("===========================================================================================================")
print(f"wiki_data's shape is {wiki_data.shape}")
print("===========================================================================================================")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
wiki_data.info()

Unnamed: 0,Title,Sumary,Sumary Length,Sections Length,Text Length,Word Count,Language Count,Pageviews
0,"Válber (footballer, born 1981)",Válber Mendes Ferreira or simply Válber (born ...,115,2,383,54,5,"{'2023-10-24': 0, '2023-10-25': 0, '2023-10-26..."
1,Agogo Presbyterian College of Education,Agogo Presbyterian College of Education is a t...,492,1,2538,383,1,"{'2023-10-24': 3, '2023-10-25': 2, '2023-10-26..."
2,Macara pasaleuca,Macara pasaleuca is a moth of the family Megal...,142,0,142,25,3,"{'2023-10-24': None, '2023-10-25': 0, '2023-10..."
3,Majhariya,Majhariya is a village and Village Development...,230,2,303,49,7,"{'2023-10-24': None, '2023-10-25': 0, '2023-10..."
4,India national football team records and stati...,The following is a list of the India national ...,1861,7,3629,577,1,"{'2023-10-24': 60, '2023-10-25': 66, '2023-10-..."
5,Iakovos Kambanellis,Iakovos Kambanellis (Greek: Ιάκωβος Καμπανέλλη...,149,6,2822,451,13,"{'2023-10-24': 12, '2023-10-25': 12, '2023-10-..."
6,Nickelodeon Australian Kids' Choice Awards 2009,The 7th annual Australian Nickelodeon Kids' Ch...,796,5,2882,455,1,"{'2023-10-24': 0, '2023-10-25': 2, '2023-10-26..."
7,Paramalungia,Paramalungia is an extinct genus from a well-k...,289,0,289,47,1,"{'2023-10-24': None, '2023-10-25': None, '2023..."
8,"Mikazuki, Hyōgo","Mikazuki (三日月町, Mikazuki-chō) was a town locat...",359,1,413,70,4,"{'2023-10-24': None, '2023-10-25': 2, '2023-10..."
9,Sudoku graph,"In the mathematics of Sudoku, the Sudoku graph...",363,4,5601,566,4,"{'2023-10-24': 5, '2023-10-25': 7, '2023-10-26..."


wiki_data's shape is (498, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Title            498 non-null    object
 1   Sumary           498 non-null    object
 2   Sumary Length    498 non-null    int64 
 3   Sections Length  498 non-null    int64 
 4   Text Length      498 non-null    int64 
 5   Word Count       498 non-null    int64 
 6   Language Count   498 non-null    int64 
 7   Pageviews        498 non-null    object
dtypes: int64(5), object(3)
memory usage: 31.3+ KB


None

In [18]:
# Each pageview dictionary is stored as its string representation.
# ast.literal_eval only parses Python literals, so unlike eval() it cannot
# execute code embedded in the file. The isinstance guard keeps the cell
# idempotent: re-running it leaves already-parsed dictionaries untouched.
wiki_data["Pageviews"] = wiki_data["Pageviews"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

## Create new features

In [42]:
def dict_average(dictionary: dict):
    """Return the mean of the non-None values of ``dictionary``.

    Args:
        dictionary: Mapping whose values are numbers or ``None``; may itself
            be ``None``.

    Returns:
        The arithmetic mean of the values that are not ``None``, or ``None``
        when ``dictionary`` is ``None`` or has no non-None values.
    """
    if dictionary is None:
        return None

    # Entries whose value is None carry no measurement and are ignored.
    valid_values = [value for value in dictionary.values() if value is not None]

    # A list is never None, so test for emptiness; otherwise an all-None or
    # empty dictionary would divide by zero below.
    if not valid_values:
        return None

    return sum(valid_values) / len(valid_values)

def dict_max(dictionary: dict):
    """Return the largest non-None value of ``dictionary``.

    Args:
        dictionary: Mapping whose values are numbers or ``None``; may itself
            be ``None``.

    Returns:
        The maximum of the values that are not ``None``, or ``None`` when
        ``dictionary`` is ``None`` or has no non-None values.
    """
    if dictionary is None:
        return None

    # default=None covers the empty / all-None case, where a bare max() would
    # raise ValueError.
    return max(
        (value for value in dictionary.values() if value is not None),
        default=None,
    )

# Derive one summary column per reducer from the daily pageview dictionaries.
for feature_name, reducer in (
    ("Average Pageviews", dict_average),
    ("Max Pageviews", dict_max),
):
    wiki_data[feature_name] = wiki_data["Pageviews"].apply(reducer)

In [43]:
# Show the frame, now including the derived pageview columns.
wiki_data

Unnamed: 0,Title,Sumary,Sumary Length,Sections Length,Text Length,Word Count,Language Count,Pageviews,Average Pageviews,Max Pageviews
0,"Válber (footballer, born 1981)",Válber Mendes Ferreira or simply Válber (born ...,115,2,383,54,5,"{'2023-10-24': 0, '2023-10-25': 0, '2023-10-26...",0.222222,1
1,Agogo Presbyterian College of Education,Agogo Presbyterian College of Education is a t...,492,1,2538,383,1,"{'2023-10-24': 3, '2023-10-25': 2, '2023-10-26...",3.000000,18
2,Macara pasaleuca,Macara pasaleuca is a moth of the family Megal...,142,0,142,25,3,"{'2023-10-24': None, '2023-10-25': 0, '2023-10...",0.111111,2
3,Majhariya,Majhariya is a village and Village Development...,230,2,303,49,7,"{'2023-10-24': None, '2023-10-25': 0, '2023-10...",0.475000,3
4,India national football team records and stati...,The following is a list of the India national ...,1861,7,3629,577,1,"{'2023-10-24': 60, '2023-10-25': 66, '2023-10-...",99.683333,593
...,...,...,...,...,...,...,...,...,...,...
493,North Bengkulu Regency,North Bengkulu is a regency (Indonesian: kabup...,753,3,2073,328,12,"{'2023-10-24': 4, '2023-10-25': 6, '2023-10-26...",3.000000,11
494,Paghtasar Dpir,"Paghtasar Dpir, or Baghdasar Tbir (Armenian: Պ...",284,5,5547,814,11,"{'2023-10-24': 1, '2023-10-25': 0, '2023-10-26...",1.866667,10
495,Missile to the Moon,Missile to the Moon is a 1958 independently ma...,822,7,5204,861,6,"{'2023-10-24': 17, '2023-10-25': 29, '2023-10-...",34.933333,170
496,Hifumi,Hifumi (written: 一二三) is a unisex Japanese giv...,508,1,936,150,1,"{'2023-10-24': 15, '2023-10-25': 7, '2023-10-2...",8.983333,25


In [47]:
# Spot-check a random subset of rows. A fixed random_state makes the sample
# reproducible across kernel restarts, and min() avoids a ValueError when the
# frame holds fewer than 20 rows.
wiki_data.sample(min(20, len(wiki_data)), random_state=42)

Unnamed: 0,Title,Sumary,Sumary Length,Sections Length,Text Length,Word Count,Language Count,Pageviews,Average Pageviews,Max Pageviews
283,Chester Bourne,"Chester Bourne (born 1889, date of death unkno...",152,3,263,39,1,"{'2023-10-24': 0, '2023-10-25': None, '2023-10...",0.3,2
26,One Step Beyond...,One Step Beyond . . . is the debut studio albu...,692,9,5578,929,13,"{'2023-10-24': 151, '2023-10-25': 139, '2023-1...",202.2,401
100,BIOS Scientific Publishers,BIOS Scientific Publishers was an English publ...,297,0,297,43,1,"{'2023-10-24': 0, '2023-10-25': 2, '2023-10-26...",0.65,3
372,Abdul Bari Sarkar (politician),Abdul Bari Sarkar (Bengali: আব্দুল বারী সরদার)...,140,1,267,40,2,"{'2023-10-24': None, '2023-10-25': None, '2023...",0.212121,1
118,Keely Moy,"Keely Moy (born April 23, 1998) is a Swiss-Ame...",113,2,874,149,2,"{'2023-10-24': 2, '2023-10-25': 2, '2023-10-26...",2.258621,8
463,Leroy Kamau,Leroy Kamau (born 2 February 1999) is a Papua ...,267,0,267,45,2,"{'2023-10-24': 0, '2023-10-25': 1, '2023-10-26...",1.618182,8
122,Randy Jayne,"Edward Randolph ""Randy"" Jayne II (born 1944) i...",877,6,11091,1759,2,"{'2023-10-24': 1, '2023-10-25': 2, '2023-10-26...",1.518519,7
9,Sudoku graph,"In the mathematics of Sudoku, the Sudoku graph...",363,4,5601,566,4,"{'2023-10-24': 5, '2023-10-25': 7, '2023-10-26...",10.866667,22
261,John Silver (wrestler),"John Anthony Silver (born June 4, 1990) is an ...",436,5,17841,3036,3,"{'2023-10-24': 156, '2023-10-25': 142, '2023-1...",108.3,414
314,Elhuyar,Elhuyar is a surname. Notable people with the ...,314,0,314,48,2,"{'2023-10-24': 0, '2023-10-25': None, '2023-10...",0.475,3


In [27]:
# Inspect the raw daily-pageview dictionary of a single article. The name
# `value` is not defined by any cell, so the bare reference only worked with
# leftover kernel state and fails after Restart & Run All.
# NOTE(review): the first row is assumed to be the record inspected here — confirm.
wiki_data["Pageviews"].iloc[0]

{'2023-10-24': 0,
 '2023-10-25': 0,
 '2023-10-26': 0,
 '2023-10-27': None,
 '2023-10-28': 0,
 '2023-10-29': 0,
 '2023-10-30': 1,
 '2023-10-31': None,
 '2023-11-01': 0,
 '2023-11-02': None,
 '2023-11-03': 0,
 '2023-11-04': 1,
 '2023-11-05': None,
 '2023-11-06': 0,
 '2023-11-07': 0,
 '2023-11-08': 0,
 '2023-11-09': 0,
 '2023-11-10': None,
 '2023-11-11': 1,
 '2023-11-12': 0,
 '2023-11-13': None,
 '2023-11-14': 0,
 '2023-11-15': 0,
 '2023-11-16': 0,
 '2023-11-17': 1,
 '2023-11-18': 1,
 '2023-11-19': 0,
 '2023-11-20': 1,
 '2023-11-21': None,
 '2023-11-22': 0,
 '2023-11-23': 0,
 '2023-11-24': 0,
 '2023-11-25': 0,
 '2023-11-26': 0,
 '2023-11-27': None,
 '2023-11-28': None,
 '2023-11-29': 0,
 '2023-11-30': 0,
 '2023-12-01': None,
 '2023-12-02': None,
 '2023-12-03': 1,
 '2023-12-04': 0,
 '2023-12-05': 1,
 '2023-12-06': 0,
 '2023-12-07': None,
 '2023-12-08': 0,
 '2023-12-09': None,
 '2023-12-10': 0,
 '2023-12-11': 0,
 '2023-12-12': 0,
 '2023-12-13': 0,
 '2023-12-14': 0,
 '2023-12-15': 0,
 '2023-

In [21]:
# Check the type of one daily pageview count. The name `test` is not defined
# by any cell, so read the dictionary from the frame instead of relying on
# leftover kernel state.
# NOTE(review): the first row is assumed to be the record inspected here — confirm.
type(wiki_data["Pageviews"].iloc[0]['2023-10-24'])

int

In [4]:
# TODO: explore how page views change over time
# TODO: explore the relationship between page views and language count