<a href="https://colab.research.google.com/github/Chiarmaka/Web-Scraping/blob/main/SportsData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook imports. The explicit %pip
# magic installs into the environment of the running kernel instead of
# relying on IPython's automagic to interpret a bare `pip` line.
%pip install requests beautifulsoup4 pandas




In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


Use web scraping (Requests to download the page, BeautifulSoup to parse it) to pull the data directly from the World Athletics web page.

In [None]:

# URL of the page: World Athletics 2023 senior outdoor top list for the
# women's 100 metres, filtered to Nigeria (region=ngr), first page only (page=1).
# NOTE(review): only this single event/page is fetched here, while the later
# "Part A"/"Part B" sections talk about 10 events combined - confirm whether
# the other events and pages still need to be scraped.
url = "https://worldathletics.org/records/toplists/sprints/100-metres/outdoor/women/senior/2023?regionType=countries&region=ngr&timing=electronic&windReading=regular&page=1&bestResultsOnly=false"

# Send request to the page. requests applies no timeout by default, so pass one
# (in seconds) to stop the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the first table on the page (this may need adjustment based on the page structure)
    table = soup.find('table')

    if table is None:
        # soup.find returns None when nothing matches; report that clearly
        # instead of failing below with an AttributeError on None.
        print("Failed to retrieve the data: no table found on the page.")
    else:
        # Extract headers. The raw header text (including surrounding
        # whitespace) is kept here; the column names are normalised later,
        # after the CSV has been read back.
        headers = [header.get_text() for header in table.find_all('th')]

        # Extract rows
        rows = []
        for row in table.find_all('tr')[1:]:  # Skip the header row
            cells = row.find_all('td')
            if not cells:
                # Rows without <td> cells carry no data and would otherwise
                # end up as all-empty rows in the DataFrame.
                continue
            row_data = [cell.get_text(strip=True) for cell in cells]
            rows.append(row_data)

        # Create a DataFrame
        df = pd.DataFrame(rows, columns=headers)

        # Save the DataFrame to a CSV file
        df.to_csv('SportsData.csv', index=False)
        print("Data has been saved to SportsData.csv.")
else:
    print("Failed to retrieve the data.")


Data has been saved to SportsData.csv.


In [None]:
# Reload the scraped table from the CSV file on disk, then preview its first
# five rows.
SportsData = pd.read_csv(filepath_or_buffer="SportsData.csv")
SportsData.head(n=5)

Unnamed: 0,\n Rank\n,\n Mark\n,\n WIND\n,\n Competitor\n,\n DOB\n,\n Nat\n,\n Pos\n,Unnamed: 7,\n Venue\n,\n Date\n,\n Results Score\n
0,1,11.01,0.1,Rosemary CHUKWUMA,05 DEC 2001,NGR,1sf2,,"Mike A. Myers Stadium, Austin, TX (USA)",08 JUN 2023,1198
1,2,11.09,1.0,Rosemary CHUKWUMA,05 DEC 2001,NGR,2h3,,"Hornet Stadium - Sac St., Sacramento, CA (USA)",27 MAY 2023,1180
2,3,11.1,1.6,Tobi AMUSAN,23 APR 1997,NGR,3f2,,"Percy Beard Track, Gainesville, FL (USA)",15 APR 2023,1178
3,4,11.17,0.9,Favour OFILI,31 DEC 2002,NGR,5,,"LSU Bernie Moore Stadium, Baton Rouge, LA (USA)",13 MAY 2023,1163
4,5,11.19,1.4,Rosemary CHUKWUMA,05 DEC 2001,NGR,1pr2,,"Hornet Stadium - Sac St., Sacramento, CA (USA)",25 MAY 2023,1159


In [None]:
# Inspect the raw column labels exactly as they were parsed from the CSV header.
SportsData.columns

Index(['\n                                        Rank\n                                    ',
       '\n                                        Mark\n                                    ',
       '\n                                            WIND\n                                        ',
       '\n                                        Competitor\n                                    ',
       '\n                                        DOB\n                                    ',
       '\n                                        Nat\n                                    ',
       '\n                                        Pos\n                                    ',
       'Unnamed: 7',
       '\n                                        Venue\n                                    ',
       '\n                                        Date\n                                    ',
       '\n                                            Results Score\n                                        '],

Clean the column names by stripping surrounding whitespace, removing newlines, replacing spaces with underscores, and converting to lowercase.

In [None]:

# Normalise the header labels in four steps: trim surrounding whitespace,
# remove newline characters, turn spaces into underscores, then lower-case.
SportsData.columns = (
    SportsData.columns
    .str.strip()
    .str.replace('\n', '')
    .str.replace(' ', '_')
    .str.lower()
)

# Show the resulting labels
print("Cleaned Column Headers:")
print(SportsData.columns)

# Preview the first rows so the renamed columns can be checked against the data
SportsData.head()


Cleaned Column Headers:
Index(['rank', 'mark', 'wind', 'competitor', 'dob', 'nat', 'pos', 'unnamed:_7',
       'venue', 'date', 'results_score'],
      dtype='object')


Unnamed: 0,rank,mark,wind,competitor,dob,nat,pos,unnamed:_7,venue,date,results_score
0,1,11.01,0.1,Rosemary CHUKWUMA,05 DEC 2001,NGR,1sf2,,"Mike A. Myers Stadium, Austin, TX (USA)",08 JUN 2023,1198
1,2,11.09,1.0,Rosemary CHUKWUMA,05 DEC 2001,NGR,2h3,,"Hornet Stadium - Sac St., Sacramento, CA (USA)",27 MAY 2023,1180
2,3,11.1,1.6,Tobi AMUSAN,23 APR 1997,NGR,3f2,,"Percy Beard Track, Gainesville, FL (USA)",15 APR 2023,1178
3,4,11.17,0.9,Favour OFILI,31 DEC 2002,NGR,5,,"LSU Bernie Moore Stadium, Baton Rouge, LA (USA)",13 MAY 2023,1163
4,5,11.19,1.4,Rosemary CHUKWUMA,05 DEC 2001,NGR,1pr2,,"Hornet Stadium - Sac St., Sacramento, CA (USA)",25 MAY 2023,1159


Part A - Create a table showing the Top 100 Nigerian Athlete Performances in 2023, from the highest to the lowest World Athletics 'Results Scores', across the following 10 Track & Field Events combined - women's & men's 100m, 200m, 400m, 100/110mH & 400mH

In [None]:

# Convert 'Results Score' column to numeric; values that cannot be parsed become NaN
SportsData['results_score'] = pd.to_numeric(SportsData['results_score'], errors='coerce')

# Filter for Nigerian athletes (nat = 'NGR') and year 2023.
# na=False makes rows with a missing date count as "no match" instead of
# putting NA values into the boolean mask.
is_ngr_2023 = (SportsData['nat'] == 'NGR') & SportsData['date'].str.contains('2023', na=False)

# Drop rows without a usable score (they cannot be ranked, and a NaN rank would
# make the integer cast below fail), then sort by 'Results Score' in descending
# order and keep the top 100. The explicit copy makes the result an independent
# frame before a new column is added to it.
top_100_performances = (
    SportsData[is_ngr_2023]
    .dropna(subset=['results_score'])
    .sort_values(by='results_score', ascending=False)
    .head(100)
    .copy()
)

# Rank by score, highest first; method='min' gives tied scores the same
# (lowest) rank and leaves a gap after them.
top_100_performances['combined_rankings'] = (
    top_100_performances['results_score'].rank(method='min', ascending=False).astype(int)
)

# Display the Top 100 Performances
top_100_performances[[ 'results_score','rank', 'combined_rankings', 'mark', 'wind', 'competitor', 'dob', 'nat', 'pos', 'venue', 'date']]


Unnamed: 0,results_score,rank,combined_rankings,mark,wind,competitor,dob,nat,pos,venue,date
0,1198,1,1,11.01,0.1,Rosemary CHUKWUMA,05 DEC 2001,NGR,1sf2,"Mike A. Myers Stadium, Austin, TX (USA)",08 JUN 2023
1,1180,2,2,11.09,1.0,Rosemary CHUKWUMA,05 DEC 2001,NGR,2h3,"Hornet Stadium - Sac St., Sacramento, CA (USA)",27 MAY 2023
2,1178,3,3,11.10,1.6,Tobi AMUSAN,23 APR 1997,NGR,3f2,"Percy Beard Track, Gainesville, FL (USA)",15 APR 2023
3,1163,4,4,11.17,0.9,Favour OFILI,31 DEC 2002,NGR,5,"LSU Bernie Moore Stadium, Baton Rouge, LA (USA)",13 MAY 2023
4,1159,5,5,11.19,1.4,Rosemary CHUKWUMA,05 DEC 2001,NGR,1pr2,"Hornet Stadium - Sac St., Sacramento, CA (USA)",25 MAY 2023
...,...,...,...,...,...,...,...,...,...,...,...
93,1044,94,96,11.74,0.6,Blessing OGUNDIRAN,11 NOV 1999,NGR,1sf2,"Samuel Ogbemudia Stadium, Benin City (NGR)",15 JUN 2023
94,1044,94,96,11.74,0.0,Balikis YAKUBU,11 JUL 1996,NGR,3f1,"Stadio Comunale, Pergine Valsugana (ITA)",17 JUN 2023
96,1044,94,96,11.74,1.7,Praise OFOKU,15 JUN 2003,NGR,3sf3,"Yabatech Sport Complex, Lagos (NGR)",14 JUL 2023
98,1042,98,99,11.75,0.0,Osamuyi FAITH,12 DEC 2007,NGR,2h5,"Samuel Ogbemudia Stadium, Benin City (NGR)",05 JUL 2023


Part B - Create a table showing the Top 50 Nigerian Athlete Performers in 2023, from the highest to the lowest World Athletics 'Results Scores', across the following 10 Track & Field Events combined - women's & men's 100m, 200m, 400m, 100/110mH & 400mH

In [None]:
# Performances by Nigerian athletes (nat = 'NGR') dated 2023.
# na=False makes rows with a missing date count as "no match".
top_50_performers = SportsData[
    (SportsData['nat'] == 'NGR') & SportsData['date'].str.contains('2023', na=False)
]

# Make sure the score is numeric in this cell too, so it does not depend on an
# earlier cell having converted it, and drop rows without a usable score:
# a competitor whose scores are all NaN would otherwise make idxmax produce a
# label that the .loc lookup below cannot resolve.
top_50_performers = top_50_performers.assign(
    results_score=pd.to_numeric(top_50_performers['results_score'], errors='coerce')
).dropna(subset=['results_score'])

# Best performance for each athlete: keep the single row holding the maximum
# score per competitor name.
# NOTE(review): athletes are identified by name only; two different athletes
# sharing a name would be merged - confirm whether 'dob' should be part of the key.
top_50_performers = top_50_performers.loc[top_50_performers.groupby('competitor')['results_score'].idxmax()]

# Expose each athlete's best score under an explicit 'best_score' column
top_50_performers = top_50_performers.assign(
    best_score=top_50_performers['results_score']
)


# Sort by the best performance score and get the top 50
top_50_performers = top_50_performers.sort_values('best_score', ascending=False).head(50)

# Add Combined Rankings; method='min' gives tied scores the same (lowest) rank
top_50_performers['combined_rankings'] = top_50_performers['best_score'].rank(method='min', ascending=False).astype(int)

# Display the Top 50 Performers
top_50_performers[['competitor', 'best_score', 'combined_rankings', 'nat']]

Unnamed: 0,competitor,best_score,combined_rankings,nat
0,Rosemary CHUKWUMA,1198,1,NGR
2,Tobi AMUSAN,1178,2,NGR
3,Favour OFILI,1163,3,NGR
8,Faith OKWOSE,1144,4,NGR
11,Victory GODAH,1139,5,NGR
13,Justina Tiana EYAKPOBEYAN,1137,6,NGR
12,Success UMUKORO,1137,6,NGR
22,Tima GODBLESS,1113,8,NGR
20,Blessing OGUNDIRAN,1112,9,NGR
23,Victory OWHOVORIOLE,1104,10,NGR
