In [11]:
import pandas as pd
import requests
import time

# Load only pageid column from your dataset
df = pd.read_csv(r"C:\Users\nafis\Downloads\wikipedia_articles_cleaned.csv", usecols=['pageid'])

print(df.head())




     pageid
0  18955875
1    682482
2     24544
3     32927
4    325329


In [13]:
# Function to fetch titles for page IDs
def get_titles_from_ids(page_ids):
    titles = {}
    for i in range(0, len(page_ids), 50):  # API allows up to 50 IDs per request
        batch = page_ids[i:i+50]
        ids_str = "|".join(str(pid) for pid in batch)
        url = f"https://en.wikipedia.org/w/api.php?action=query&pageids={ids_str}&format=json"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            for page in data['query']['pages'].values():
                titles[page['pageid']] = page['title']
        else:
            print(f"Error: {response.status_code} for IDs {ids_str}")
        time.sleep(0.2)  # Avoid API rate limits
    return titles

In [15]:
# Get titles for all page IDs
page_ids = df['pageid'].tolist()
titles_dict = get_titles_from_ids(page_ids)

# Create a new DataFrame with pageid and title only
titles_df = pd.DataFrame(list(titles_dict.items()), columns=['pageid', 'title'])

# Preview the result
print(titles_df.head())

# Save to CSV (optional)
titles_df.to_csv(r"C:\Users\nafis\Downloads\pageid_title.csv", index=False)

   pageid              title
0     655             Abacus
1    1271  Analytical engine
2    1372              Amber
3    2027       Andrew Wiles
4    5180          Chemistry


In [21]:
import pandas as pd
import requests
import time
from urllib.parse import quote
# Load your pageid-title file
titles_df = pd.read_csv(r"C:\Users\nafis\Downloads\pageid_title.csv")

# Function to get total pageviews for a title
def get_pageviews(title):
    try:
        # URL-encode the title for API
        title_url = quote(title.replace(" ", "_"))
        url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/user/{title_url}/monthly/20230101/20231231"
        headers = {
            "User-Agent": "WikipediaQualityBot/1.0 (your_email@example.com)"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            total_views = sum(item['views'] for item in data['items'])
            return total_views
        else:
            print(f"Failed for {title}: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error for {title}: {e}")
        return None



In [23]:
# Limit to first 100 articles for testing
titles_sample = titles_df.head(100)

print("⏳ Collecting pageviews for first 100 articles...")
titles_sample['pageviews'] = titles_sample['title'].apply(get_pageviews)

# Save test result to a new CSV
titles_sample.to_csv(r"C:\Users\nafis\Downloads\pageid_title_sample.csv", index=False)
print("✅ Pageviews for first 100 articles saved to pageid_title_sample.csv")


⏳ Collecting pageviews for first 100 articles...
Failed for Amy Roth McDuffie: 404
Failed for AN/FSQ-7 Combat Direction Central: 404
✅ Pageviews for first 100 articles saved to pageid_title_sample.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titles_sample['pageviews'] = titles_sample['title'].apply(get_pageviews)


In [35]:
import pandas as pd
import requests
import time
from urllib.parse import quote

# Load the sample CSV with first 100 articles
titles_sample = pd.read_csv(r"C:\Users\nafis\Downloads\pageid_title_sample.csv")


In [37]:
# Function to get number of edits
def get_edit_count(title):
    try:
        title_url = quote(title.replace(" ", "_"))
        url = f"https://en.wikipedia.org/w/api.php?action=query&titles={title_url}&prop=revisions&rvprop=timestamp&rvlimit=500&format=json"
        headers = {
            "User-Agent": "WikipediaQualityBot/1.0 (your_email@example.com)"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            pages = data['query']['pages']
            page = next(iter(pages.values()))
            revisions = page.get('revisions', [])
            return len(revisions)  # Number of revisions found (max 500)
        else:
            print(f"Failed for {title}: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error for {title}: {e}")
        return None

In [39]:
# Function to get number of unique editors
def get_num_editors(title):
    try:
        title_url = quote(title.replace(" ", "_"))
        url = (f"https://en.wikipedia.org/w/api.php?action=query"
               f"&titles={title_url}&prop=revisions&rvprop=user"
               f"&rvlimit=500&format=json")
        headers = {
            "User-Agent": "WikipediaQualityBot/1.0 (your_email@example.com)"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            pages = data['query']['pages']
            page = next(iter(pages.values()))
            revisions = page.get('revisions', [])
            editors = {rev['user'] for rev in revisions}  # Unique editors
            return len(editors)
        else:
            print(f"Failed for {title}: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error for {title}: {e}")
        return None


In [43]:
# Function to get 3 most recent edit timestamps
def get_last_3_edits(title):
    try:
        title_url = quote(title.replace(" ", "_"))
        url = (f"https://en.wikipedia.org/w/api.php?action=query"
               f"&titles={title_url}&prop=revisions&rvprop=timestamp"
               f"&rvlimit=3&rvdir=older&format=json")
        headers = {
            "User-Agent": "WikipediaQualityBot/1.0 (your_email@example.com)"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            pages = data['query']['pages']
            page = next(iter(pages.values()))
            revisions = page.get('revisions', [])
            timestamps = [rev['timestamp'] for rev in revisions]
            return timestamps  # List of up to 3 timestamps
        else:
            print(f"Failed for {title}: {response.status_code}")
            return [None, None, None]
    except Exception as e:
        print(f"Error for {title}: {e}")
        return [None, None, None]


In [45]:
# 🏃‍♀️ Apply all functions to first 100 articles
print("⏳ Collecting edit counts...")
titles_sample['edit_count'] = titles_sample['title'].apply(get_edit_count)

print("⏳ Collecting last 3 edit timestamps...")
titles_sample[['edit_1', 'edit_2', 'edit_3']] = pd.DataFrame(
    titles_sample['title'].apply(get_last_3_edits).tolist(),
    index=titles_sample.index
)

print("⏳ Collecting number of unique editors...")
titles_sample['num_editors'] = titles_sample['title'].apply(get_num_editors)

# ✅ Save all results at once
output_path = r"C:\Users\nafis\Downloads\pageid_title_sample.csv"
titles_sample.to_csv(output_path, index=False)
print(f"✅ All features saved to {output_path}")

⏳ Collecting edit counts...
⏳ Collecting last 3 edit timestamps...
⏳ Collecting number of unique editors...
✅ All features saved to C:\Users\nafis\Downloads\pageid_title_sample.csv
