# Notebook used to scrap marathon data.

# 1. Libraries

In [1]:
import pandas as pd
from marathons import *
from marathons_scrapy.marathons_scrapy.spiders import *
from utils.scrap_utils import get_settings, run_spider, expand_splits

# 2. Marathons

In [2]:
YEAR_13: str = "2013"
YEAR_14: str = "2014"
YEAR_15: str = "2015"
YEAR_16: str = "2016"
YEAR_17: str = "2017"
YEAR_18: str = "2018"
YEAR_19: str = "2019"
YEAR_20: str = "2020"
YEAR_21: str = "2021"
YEAR_22: str = "2022"
YEAR_23: str = "2023"
SPLITS_KEYS = ["k_5", "k_10", "k_15", "k_20", "k_half", "k_25", "k_30", "k_35", "k_40", "k_finish"]

## London

In [3]:
LDN_NAME = "London"

# London URLs Templates.
# {0}: Year || {1}: Page Number || {2}: sex || {3}: Number of results per page.
LONDON_MARATHON_URL: str = "https://results.tcslondonmarathon.com/{0}/?page={1}&event=MAS&pid=search&sex={2}&num_results={3}"
# {0}: Year || {1}: runner id
LONDON_MARATHON_SPLIT_URL: str = "https://results.tcslondonmarathon.com/{0}/?content=detail&fpid=search&pid=search&idp={1}&lang=EN_CAP&event=MAS&&search_event=MAS"

LDN_NUM_RESULTS: str = "1000"

LDN_RES_FIELDS: list[str] = ["run_no", "age_cat", "gender", "half", "finish", "idp"]
LDN_SPLITS_FIELDS: list[str] = ["idp", "race_state", "last_split", "k_5", "k_10", "k_15", "k_20", "k_half", "k_25", "k_30", "k_35", "k_40", "k_finish"]


In [4]:
# Initialising london marathon object. 
london = LondonMarathon(url_template=LONDON_MARATHON_URL, split_url_template=LONDON_MARATHON_SPLIT_URL)

### 2014

In [6]:
ldn_data_path = f"Marathons_Data/Raw/London/London{YEAR_14}"

#### Result Pages

In [7]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ldn_pages_urls, ldn_res_settings = london.gen_res_scrap_info(YEAR_14, LDN_NUM_RESULTS, LDN_RES_FIELDS, 
                                                             ldn_data_path, show_settings=True)

Men Pages: 27 || Women Pages: 16
London 2014 total results pages: 43
Example URLs: 
 https://results.tcslondonmarathon.com/2014/?page=1&event=MAS&pid=search&sex=M&num_results=1000 
 https://results.tcslondonmarathon.com/2014/?page=1&event=MAS&pid=search&sex=W&num_results=1000
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2014/London2014_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'half', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
# The logs are shown as a visual guide to know when the crawler finished.
run_spider(london_spiders.LondonSpider1418, urls=ldn_pages_urls, settings=ldn_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 0bc4cb8abbc723a9
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (41677 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2014/London2014_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 12452,
 'downloader/request_count': 43,
 'downloader/request_method_count/GET': 43,
 'downloader/response_bytes': 2820454,
 'downloader/response_count': 43,
 'downloader/response_status_count/200': 43,
 'elapsed_time_seconds': 16.92034,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 7, 30, 106423),
 'httpcompression/response_bytes': 31540784,
 'httpcompression/response_count': 43,
 'item_scraped_count': 41677,
 'log_count/INFO': 12,
 'memusage/max': 171622400,
 'memusage/startup': 171606016,
 'response_received_count': 43,
 'scheduler/dequeued': 43,
 'scheduler/dequeued/memory': 43,
 'scheduler/enqueued': 43,
 'scheduler/enqueued/memory': 43,
 'start

In [11]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2014_res.csv")

In [12]:
df_ldn_res.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp
0,46087,18-39,M,01:52:04,04:47:43,9999990F5ECC830000140EC5
1,52210,18-39,M,01:44:26,03:44:49,9999990F5ECC83000014236E
2,5978,40-44,M,02:09:09,04:35:19,9999990F5ECC83000013C2DD
3,54639,40-44,M,01:35:42,03:25:42,9999990F5ECC830000143033
4,52269,18-39,M,,04:45:46,9999990F5ECC830000142398


In [13]:
df_ldn_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41677 entries, 0 to 41676
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   41677 non-null  int64 
 1   age_cat  41662 non-null  object
 2   gender   41677 non-null  object
 3   half     36068 non-null  object
 4   finish   35881 non-null  object
 5   idp      41677 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.9+ MB


#### Splits Pages

In [9]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2014_res.csv")
ldn_splits_urls, ldn_splits_settings = london.gen_splits_scrap_info(YEAR_14, df_ldn_res["idp"].to_list(), LDN_SPLITS_FIELDS, 
                                                                    ldn_data_path, show_settings=True)

London 2014 total splits pages: 41677
Example URLs: 
 https://results.tcslondonmarathon.com/2014/?content=detail&fpid=search&pid=search&idp=9999990F5ECC830000140EC5&lang=EN_CAP&event=MAS&&search_event=MAS 
 https://results.tcslondonmarathon.com/2014/?content=detail&fpid=search&pid=search&idp=9999990F5ECC83000013E5F4&lang=EN_CAP&event=MAS&&search_event=MAS
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2014/London2014_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(london_spiders.LondonSpider1418, urls=ldn_splits_urls, settings=ldn_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 1cdb801c566119c6


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9050 pages (at 9050 pages/min), scraped 9034 items (at 9034 items/min)
INFO: Crawled 17949 pages (at 8899 pages/min), scraped 17929 items (at 8895 items/min)
INFO: Crawled 27117 pages (at 9168 pages/min), scraped 27098 items (at 9169 items/min)
INFO: Crawled 36030 pages (at 8913 pages/min), scraped 36013 items (at 8915 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (41677 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2014/London2014_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 14378565,
 'downloader/request_count': 41677,
 'downloader/request_method_count/GET': 41677,
 'downloader/response_bytes': 241825795,
 'downloader/response_count': 41677,
 'downloader/response_status_count/200': 41677,
 'elapsed_time_seconds': 278.320045,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 12, 29

In [11]:
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2014_splits.csv")

In [12]:
df_ldn_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41677 entries, 0 to 41676
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         41677 non-null  object
 1   race_state  41677 non-null  object
 2   last_split  36289 non-null  object
 3   k_5         41677 non-null  object
 4   k_10        41677 non-null  object
 5   k_15        41677 non-null  object
 6   k_20        41677 non-null  object
 7   k_half      41677 non-null  object
 8   k_25        41677 non-null  object
 9   k_30        41677 non-null  object
 10  k_35        41677 non-null  object
 11  k_40        41677 non-null  object
 12  k_finish    41677 non-null  object
dtypes: object(13)
memory usage: 4.1+ MB


#### Full Dataset for London 2014.

In [14]:
df_ldn_res    = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2014_res.csv")
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2014_splits.csv")
df_ldn_full   = pd.merge(df_ldn_res, df_ldn_splits, on="idp")

In [15]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41677 entries, 0 to 41676
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      41677 non-null  int64 
 1   age_cat     41662 non-null  object
 2   gender      41677 non-null  object
 3   half        36068 non-null  object
 4   finish      35881 non-null  object
 5   idp         41677 non-null  object
 6   race_state  41677 non-null  object
 7   last_split  36289 non-null  object
 8   k_5         41677 non-null  object
 9   k_10        41677 non-null  object
 10  k_15        41677 non-null  object
 11  k_20        41677 non-null  object
 12  k_half      41677 non-null  object
 13  k_25        41677 non-null  object
 14  k_30        41677 non-null  object
 15  k_35        41677 non-null  object
 16  k_40        41677 non-null  object
 17  k_finish    41677 non-null  object
dtypes: int64(1), object(17)
memory usage: 5.7+ MB


In [16]:
df_ldn_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time, pace(min/km) , and speed(km/h)
df_ldn_full = expand_splits(df_ldn_full)
# Drop the splits Lists.
df_ldn_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [17]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41677 entries, 0 to 41676
Data columns (total 38 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          41677 non-null  int64 
 1   age_cat         41662 non-null  object
 2   gender          41677 non-null  object
 3   half            36068 non-null  object
 4   finish          35881 non-null  object
 5   idp             41677 non-null  object
 6   race_state      41677 non-null  object
 7   last_split      36289 non-null  object
 8   k_5_time        41677 non-null  object
 9   k_5_pace        41677 non-null  object
 10  k_5_speed       41677 non-null  object
 11  k_10_time       41677 non-null  object
 12  k_10_pace       41677 non-null  object
 13  k_10_speed      41677 non-null  object
 14  k_15_time       41677 non-null  object
 15  k_15_pace       41677 non-null  object
 16  k_15_speed      41677 non-null  object
 17  k_20_time       41677 non-null  object
 18  k_20_p

In [18]:
df_ldn_full.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp,race_state,last_split,k_5_time,k_5_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,151,18-39,W,01:29:19,02:58:22,9999990F5ECC83000013B29D,Finished,Finish time,00:20:59,04:12,...,14.05,02:28:34,04:17,14.05,02:49:27,04:11,14.37,02:58:22,04:04,14.77
1,152,18-39,W,01:42:29,03:35:33,9999990F5ECC83000013B29E,Finished,Finish time,00:23:20,04:40,...,11.55,02:54:59,05:25,11.08,03:23:33,05:43,10.50,03:35:33,05:29,10.97
2,154,40-44,W,,,9999990F5ECC83000013B29F,Not Started,,'-','-',...,-,'-','-',-,'-','-',-,'-','-',-
3,155,18-39,W,01:34:27,03:19:11,9999990F5ECC83000013B2A0,Finished,Finish time,00:21:31,04:19,...,12.28,02:42:12,04:59,12.06,03:07:53,05:09,11.68,03:19:11,05:09,11.65
4,157,18-39,W,01:29:54,03:21:24,9999990F5ECC83000013B2A1,Finished,Finish time,00:20:24,04:05,...,11.87,02:41:42,05:44,10.47,03:10:37,05:47,10.37,03:21:24,04:55,12.21


In [22]:
df_ldn_full.to_csv(ldn_data_path+f"/{LDN_NAME}2014_full.csv", index=False)

In [19]:
# Delete dataframe to avoid accidentally overwriting the CSV file in next sections.
# Use as seen fit.
del df_ldn_full, df_ldn_res, df_ldn_splits, ldn_data_path

### 2015

In [21]:
# Where the csv data will be stored.
ldn_data_path = f"Marathons_Data/Raw/London/London{YEAR_15}"

#### Result Pages

In [22]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ldn_pages_urls, ldn_res_settings = london.gen_res_scrap_info(YEAR_15, LDN_NUM_RESULTS, LDN_RES_FIELDS, 
                                                             ldn_data_path, show_settings=True)

Men Pages: 27 || Women Pages: 17
London 2015 total results pages: 44
Example URLs: 
 https://results.tcslondonmarathon.com/2015/?page=1&event=MAS&pid=search&sex=M&num_results=1000 
 https://results.tcslondonmarathon.com/2015/?page=1&event=MAS&pid=search&sex=W&num_results=1000
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2015/London2015_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'half', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(london_spiders.LondonSpider1418, urls=ldn_pages_urls, settings=ldn_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 50fd40e1f0950be5
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (43741 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2015/London2015_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 12742,
 'downloader/request_count': 44,
 'downloader/request_method_count/GET': 44,
 'downloader/response_bytes': 3212920,
 'downloader/response_count': 44,
 'downloader/response_status_count/200': 44,
 'elapsed_time_seconds': 14.457524,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 11, 18, 3, 4, 173619),
 'httpcompression/response_bytes': 33710498,
 'httpcompression/response_count': 44,
 'item_scraped_count': 43741,
 'log_count/INFO': 12,
 'memusage/max': 179437568,
 'memusage/startup': 179437568,
 'response_received_count': 44,
 'scheduler/dequeued': 44,
 'scheduler/dequeued/memory': 44,
 'scheduler/enqueued': 44,
 'scheduler/enqueued/memory': 44,
 'start

In [10]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2015_res.csv")

In [11]:
df_ldn_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43741 entries, 0 to 43740
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   43741 non-null  int64 
 1   age_cat  43715 non-null  object
 2   gender   43741 non-null  object
 3   half     37611 non-null  object
 4   finish   37599 non-null  object
 5   idp      43741 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.0+ MB


In [12]:
df_ldn_res.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp
0,38514,45-49,M,02:06:57,04:10:37,9999990F5ECC830000170F2D
1,50190,40-44,M,,,9999990F5ECC83000016C338
2,18920,18-39,M,01:45:49,03:40:50,9999990F5ECC8300001689EC
3,6749,18-39,M,02:20:10,05:32:20,9999990F5ECC83000016A429
4,31341,50-54,M,01:39:31,03:28:17,9999990F5ECC83000017168A


#### Splits Pages

In [24]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2015_res.csv")
ldn_splits_urls, ldn_splits_settings = london.gen_splits_scrap_info(YEAR_15, df_ldn_res["idp"].to_list(), LDN_SPLITS_FIELDS, 
                                                                    ldn_data_path, show_settings=True)

London 2015 total splits pages: 43741
Example URLs: 
 https://results.tcslondonmarathon.com/2015/?content=detail&fpid=search&pid=search&idp=9999990F5ECC830000170F2D&lang=EN_CAP&event=MAS&&search_event=MAS 
 https://results.tcslondonmarathon.com/2015/?content=detail&fpid=search&pid=search&idp=9999990F5ECC83000016EF40&lang=EN_CAP&event=MAS&&search_event=MAS
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2015/London2015_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(london_spiders.LondonSpider1418, urls=ldn_splits_urls, settings=ldn_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 92c49aed385964f4
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9831 pages (at 9831 pages/min), scraped 9822 items (at 9822 items/min)
INFO: Crawled 19742 pages (at 9911 pages/min), scraped 19732 items (at 9910 items/min)
INFO: Crawled 29649 pages (at 9907 pages/min), scraped 29638 items (at 9906 items/min)
INFO: Crawled 39492 pages (at 9843 pages/min), scraped 39481 items (at 9843 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (43741 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2015/London2015_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 15090645,
 'downloader/request_count': 43741,
 'downloader/request_method_count/GET': 43741,
 'downloader/response_bytes': 257338529,
 'downloader/response_count': 43741,
 'downloader/response_status_count/200': 43741,
 'elapsed_time_seconds': 265.300451,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 11, 18, 11, 25

In [15]:
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2015_splits.csv")

In [16]:
df_ldn_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43741 entries, 0 to 43740
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         43741 non-null  object
 1   race_state  43741 non-null  object
 2   last_split  37881 non-null  object
 3   k_5         43739 non-null  object
 4   k_10        43739 non-null  object
 5   k_15        43739 non-null  object
 6   k_20        43739 non-null  object
 7   k_half      43739 non-null  object
 8   k_25        43739 non-null  object
 9   k_30        43739 non-null  object
 10  k_35        43739 non-null  object
 11  k_40        43739 non-null  object
 12  k_finish    43739 non-null  object
dtypes: object(13)
memory usage: 4.3+ MB


#### Full Dataset for London 2015

In [25]:
df_ldn_res    = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2015_res.csv")
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2015_splits.csv")
df_ldn_full   = pd.merge(df_ldn_res, df_ldn_splits, on="idp")

In [26]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43741 entries, 0 to 43740
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      43741 non-null  int64 
 1   age_cat     43715 non-null  object
 2   gender      43741 non-null  object
 3   half        37611 non-null  object
 4   finish      37599 non-null  object
 5   idp         43741 non-null  object
 6   race_state  43741 non-null  object
 7   last_split  37881 non-null  object
 8   k_5         43739 non-null  object
 9   k_10        43739 non-null  object
 10  k_15        43739 non-null  object
 11  k_20        43739 non-null  object
 12  k_half      43739 non-null  object
 13  k_25        43739 non-null  object
 14  k_30        43739 non-null  object
 15  k_35        43739 non-null  object
 16  k_40        43739 non-null  object
 17  k_finish    43739 non-null  object
dtypes: int64(1), object(17)
memory usage: 6.0+ MB


In [27]:
df_ldn_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time, pace(min/km) , and speed(km/h)
df_ldn_full = expand_splits(df_ldn_full)
# Drop the splits Lists.
df_ldn_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [28]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43741 entries, 0 to 43740
Data columns (total 38 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          43741 non-null  int64 
 1   age_cat         43715 non-null  object
 2   gender          43741 non-null  object
 3   half            37611 non-null  object
 4   finish          37599 non-null  object
 5   idp             43741 non-null  object
 6   race_state      43741 non-null  object
 7   last_split      37881 non-null  object
 8   k_5_time        43739 non-null  object
 9   k_5_pace        43739 non-null  object
 10  k_5_speed       43739 non-null  object
 11  k_10_time       43739 non-null  object
 12  k_10_pace       43739 non-null  object
 13  k_10_speed      43739 non-null  object
 14  k_15_time       43739 non-null  object
 15  k_15_pace       43739 non-null  object
 16  k_15_speed      43739 non-null  object
 17  k_20_time       43739 non-null  object
 18  k_20_p

In [29]:
df_ldn_full.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp,race_state,last_split,k_5_time,k_5_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,151,40-44,W,,,9999990F5ECC8300001681EC,Not Started,,'-','-',...,-,'-','-',-,'-','-',-,'-','-',-
1,153,18-39,W,,,9999990F5ECC8300001681ED,Not Started,,'-','-',...,-,'-','-',-,'-','-',-,'-','-',-
2,155,18-39,W,01:33:35,03:27:26,9999990F5ECC8300001681EE,Finished,FINISH,00:20:27,04:06,...,11.70,02:44:58,05:15,11.44,03:15:12,06:03,9.92,03:27:26,05:35,10.77
3,156,40-44,W,01:27:46,02:58:46,9999990F5ECC83000016942A,Finished,FINISH,00:20:39,04:08,...,13.98,02:27:28,04:21,13.82,02:49:07,04:20,13.86,02:58:46,04:24,13.65
4,157,18-39,W,01:33:04,03:09:58,9999990F5ECC8300001681EF,Finished,FINISH,00:22:07,04:26,...,13.45,02:35:26,04:35,13.13,02:59:08,04:45,12.66,03:09:58,04:57,12.16


In [22]:
df_ldn_full.to_csv(ldn_data_path+f"/{LDN_NAME}2015_full.csv", index=False)

In [30]:
# Delete dataframe to avoid accidentally overwriting the CSV file in next sections.
# Use as seen fit.
del df_ldn_full, df_ldn_res, df_ldn_splits, ldn_data_path

### 2016

In [31]:
# Where the csv data will be stored.
ldn_data_path = f"Marathons_Data/Raw/London/London{YEAR_16}"

#### Result Pages

In [32]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ldn_pages_urls, ldn_res_settings = london.gen_res_scrap_info(YEAR_16, LDN_NUM_RESULTS, LDN_RES_FIELDS, 
                                                             ldn_data_path, show_settings=True)


Men Pages: 28 || Women Pages: 18
London 2016 total results pages: 46
Example URLs: 
 https://results.tcslondonmarathon.com/2016/?page=1&event=MAS&pid=search&sex=M&num_results=1000 
 https://results.tcslondonmarathon.com/2016/?page=1&event=MAS&pid=search&sex=W&num_results=1000
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2016/London2016_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'half', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
# The logs are shown as a visual guide to know when the crawler finished.
run_spider(london_spiders.LondonSpider1418, urls=ldn_pages_urls, settings=ldn_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: a4f80d24b35800ee


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (45202 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2016/London2016_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 13322,
 'downloader/request_count': 46,
 'downloader/request_method_count/GET': 46,
 'downloader/response_bytes': 3330566,
 'downloader/response_count': 46,
 'downloader/response_status_count/200': 46,
 'elapsed_time_seconds': 12.43062,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 11, 18, 28, 16, 464385),
 'httpcompression/response_bytes': 34853044,
 'httpcompression/response_count': 46,
 'item_scraped_count': 45202,
 'log_count/INFO': 12,
 'memusage/max': 395362304,
 'memusage/startup': 395362304,
 'response_received_count': 46,
 'scheduler/dequeued': 46,
 'scheduler/dequeued/memory': 46,
 'scheduler/enqueued': 46,
 'scheduler/enqueued/memory': 46,
 'star

In [28]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2016_res.csv")

In [29]:
df_ldn_res.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp
0,14805,45-49,M,02:00:30,04:15:34,9999990F5ECC83000019C9C9
1,29306,45-49,M,01:35:33,03:24:38,9999990F5ECC8300001A5E02
2,11130,50-54,M,02:49:23,06:36:46,9999990F5ECC83000019DFDF
3,40722,45-49,M,01:44:33,03:36:30,9999990F5ECC8300001A27C3
4,32653,18-39,M,02:04:25,04:29:51,9999990F5ECC8300001A2CC6


In [30]:
df_ldn_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45202 entries, 0 to 45201
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   45202 non-null  int64 
 1   age_cat  45201 non-null  object
 2   gender   45202 non-null  object
 3   half     39040 non-null  object
 4   finish   38939 non-null  object
 5   idp      45202 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.1+ MB


#### Splits Pages

In [33]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2016_res.csv")
ldn_splits_urls, ldn_splits_settings = london.gen_splits_scrap_info(YEAR_16, df_ldn_res["idp"].to_list(), LDN_SPLITS_FIELDS, 
                                                                    ldn_data_path, show_settings=True)

London 2016 total splits pages: 45202
Example URLs: 
 https://results.tcslondonmarathon.com/2016/?content=detail&fpid=search&pid=search&idp=9999990F5ECC83000019C9C9&lang=EN_CAP&event=MAS&&search_event=MAS 
 https://results.tcslondonmarathon.com/2016/?content=detail&fpid=search&pid=search&idp=9999990F5ECC8300001A3209&lang=EN_CAP&event=MAS&&search_event=MAS
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2016/London2016_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(london_spiders.LondonSpider1418, urls=ldn_splits_urls, settings=ldn_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 846fd591a36d8b6e
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9274 pages (at 9274 pages/min), scraped 9253 items (at 9253 items/min)
INFO: Crawled 18441 pages (at 9167 pages/min), scraped 18426 items (at 9173 items/min)
INFO: Crawled 27623 pages (at 9182 pages/min), scraped 27609 items (at 9183 items/min)
INFO: Crawled 36992 pages (at 9369 pages/min), scraped 36977 items (at 9368 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (45202 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2016/London2016_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 15594690,
 'downloader/request_count': 45202,
 'downloader/request_method_count/GET': 45202,
 'downloader/response_bytes': 265357508,
 'downloader/response_count': 45202,
 'downloader/response_status_count/200': 45202,
 'elapsed_time_seconds': 294.592761,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 11, 18, 35, 39

In [33]:
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2016_splits.csv")

In [34]:
df_ldn_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45202 entries, 0 to 45201
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         45202 non-null  object
 1   race_state  45202 non-null  object
 2   last_split  39217 non-null  object
 3   k_5         45202 non-null  object
 4   k_10        45202 non-null  object
 5   k_15        45202 non-null  object
 6   k_20        45202 non-null  object
 7   k_half      45202 non-null  object
 8   k_25        45202 non-null  object
 9   k_30        45202 non-null  object
 10  k_35        45202 non-null  object
 11  k_40        45202 non-null  object
 12  k_finish    45202 non-null  object
dtypes: object(13)
memory usage: 4.5+ MB


#### Full Dataset for London 2016

In [34]:
df_ldn_res    = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2016_res.csv")
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2016_splits.csv")
df_ldn_full   = pd.merge(df_ldn_res, df_ldn_splits, on="idp")

In [35]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45202 entries, 0 to 45201
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      45202 non-null  int64 
 1   age_cat     45201 non-null  object
 2   gender      45202 non-null  object
 3   half        39040 non-null  object
 4   finish      38939 non-null  object
 5   idp         45202 non-null  object
 6   race_state  45202 non-null  object
 7   last_split  39217 non-null  object
 8   k_5         45202 non-null  object
 9   k_10        45202 non-null  object
 10  k_15        45202 non-null  object
 11  k_20        45202 non-null  object
 12  k_half      45202 non-null  object
 13  k_25        45202 non-null  object
 14  k_30        45202 non-null  object
 15  k_35        45202 non-null  object
 16  k_40        45202 non-null  object
 17  k_finish    45202 non-null  object
dtypes: int64(1), object(17)
memory usage: 6.2+ MB


In [36]:
df_ldn_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time, pace(min/km) , and speed(km/h)
df_ldn_full = expand_splits(df_ldn_full)
# Drop the splits Lists.
df_ldn_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [37]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45202 entries, 0 to 45201
Data columns (total 38 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          45202 non-null  int64 
 1   age_cat         45201 non-null  object
 2   gender          45202 non-null  object
 3   half            39040 non-null  object
 4   finish          38939 non-null  object
 5   idp             45202 non-null  object
 6   race_state      45202 non-null  object
 7   last_split      39217 non-null  object
 8   k_5_time        45202 non-null  object
 9   k_5_pace        45202 non-null  object
 10  k_5_speed       45202 non-null  object
 11  k_10_time       45202 non-null  object
 12  k_10_pace       45202 non-null  object
 13  k_10_speed      45202 non-null  object
 14  k_15_time       45202 non-null  object
 15  k_15_pace       45202 non-null  object
 16  k_15_speed      45202 non-null  object
 17  k_20_time       45202 non-null  object
 18  k_20_p

In [38]:
df_ldn_full.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp,race_state,last_split,k_5_time,k_5_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,151,18-39,W,01:26:22,03:04:15,9999990F5ECC83000019C3CC,Finished,FINISH,00:20:19,04:04,...,13.82,02:27:30,04:35,13.13,02:52:18,04:58,12.10,03:04:15,05:27,11.02
1,152,18-39,W,01:36:40,03:41:57,9999990F5ECC83000019C68C,Finished,FINISH,00:20:58,04:12,...,9.90,03:04:27,07:18,8.22,03:30:48,05:17,11.39,03:41:57,05:05,11.81
2,154,50-54,W,,,9999990F5ECC83000019C68D,Not Started,,'-','-',...,-,'-','-',-,'-','-',-,'-','-',-
3,155,45-49,W,,,9999990F5ECC83000019C3CD,Not Started,,'-','-',...,-,'-','-',-,'-','-',-,'-','-',-
4,156,18-39,W,01:56:54,03:57:05,9999990F5ECC83000019C68E,Finished,FINISH,00:27:22,05:29,...,10.24,03:17:11,05:53,10.20,03:46:13,05:49,10.33,03:57:05,04:58,12.12


In [40]:
df_ldn_full.to_csv(ldn_data_path+f"/{LDN_NAME}2016_full.csv", index=False)

In [39]:
# Delete dataframe to avoid accidentally overwriting the CSV file in next sections. 
# Use as seen fit.
del df_ldn_full, df_ldn_res, df_ldn_splits, ldn_data_path

### 2017

In [40]:
# Where the csv data will be stored.
ldn_data_path = f"Marathons_Data/Raw/London/London{YEAR_17}"

#### Result Pages

In [41]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ldn_pages_urls, ldn_res_settings = london.gen_res_scrap_info(YEAR_17, LDN_NUM_RESULTS, LDN_RES_FIELDS, 
                                                             ldn_data_path, show_settings=True)

Men Pages: 28 || Women Pages: 18
London 2017 total results pages: 46
Example URLs: 
 https://results.tcslondonmarathon.com/2017/?page=1&event=MAS&pid=search&sex=M&num_results=1000 
 https://results.tcslondonmarathon.com/2017/?page=1&event=MAS&pid=search&sex=W&num_results=1000
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2017/London2017_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'half', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
# The logs are shown as a visual guide to know when the crawler finished.
run_spider(london_spiders.LondonSpider1418, urls=ldn_pages_urls, settings=ldn_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: e8d72071c2d6b9a9
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (45155 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2017/London2017_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 13322,
 'downloader/request_count': 46,
 'downloader/request_method_count/GET': 46,
 'downloader/response_bytes': 3316239,
 'downloader/response_count': 46,
 'downloader/response_status_count/200': 46,
 'elapsed_time_seconds': 19.678748,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 12, 21, 7, 50, 235647),
 'httpcompression/response_bytes': 34759300,
 'httpcompression/response_count': 46,
 'item_scraped_count': 45155,
 'log_count/INFO': 12,
 'memusage/max': 159891456,
 'memusage/startup': 159891456,
 'response_received_count': 46,
 'scheduler/dequeued': 46,
 'scheduler/dequeued/memory': 46,
 'scheduler/enqueued': 46,
 'scheduler/enqueued/memory': 46,
 'star

In [10]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2017_res.csv")

In [11]:
df_ldn_res.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp
0,61775,50-54,M,01:37:43,03:25:32,9999990F5ECC850000252BEC
1,56388,45-49,M,01:58:47,04:07:39,9999990F5ECC850000251006
2,32459,18-39,M,01:28:06,02:59:49,9999990F5ECC85000024B196
3,11413,45-49,M,02:29:25,05:00:38,9999990F5ECC8500002499CF
4,22739,60-64,M,02:25:12,05:53:52,9999990F5ECC85000024C1FD


In [12]:
df_ldn_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45155 entries, 0 to 45154
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   45155 non-null  int64 
 1   age_cat  45154 non-null  object
 2   gender   45155 non-null  object
 3   half     39469 non-null  object
 4   finish   39282 non-null  object
 5   idp      45155 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.1+ MB


#### Splits Pages

In [42]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2017_res.csv")
ldn_splits_urls, ldn_splits_settings = london.gen_splits_scrap_info(YEAR_17, df_ldn_res["idp"].to_list(), LDN_SPLITS_FIELDS, 
                                                                    ldn_data_path, show_settings=True)

London 2017 total splits pages: 45155
Example URLs: 
 https://results.tcslondonmarathon.com/2017/?content=detail&fpid=search&pid=search&idp=9999990F5ECC850000252BEC&lang=EN_CAP&event=MAS&&search_event=MAS 
 https://results.tcslondonmarathon.com/2017/?content=detail&fpid=search&pid=search&idp=9999990F5ECC85000024FC69&lang=EN_CAP&event=MAS&&search_event=MAS
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2017/London2017_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(london_spiders.LondonSpider1418, urls=ldn_splits_urls, settings=ldn_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 53de8f99e34bb350


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9650 pages (at 9650 pages/min), scraped 9630 items (at 9630 items/min)
INFO: Crawled 19156 pages (at 9506 pages/min), scraped 19140 items (at 9510 items/min)
INFO: Crawled 28827 pages (at 9671 pages/min), scraped 28810 items (at 9670 items/min)
INFO: Crawled 38521 pages (at 9694 pages/min), scraped 38501 items (at 9691 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (45155 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2017/London2017_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 15578475,
 'downloader/request_count': 45155,
 'downloader/request_method_count/GET': 45155,
 'downloader/response_bytes': 272735451,
 'downloader/response_count': 45155,
 'downloader/response_status_count/200': 45155,
 'elapsed_time_seconds': 281.874263,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 12, 21, 13, 35

In [15]:
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2017_splits.csv")

In [16]:
df_ldn_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45155 entries, 0 to 45154
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         45155 non-null  object
 1   race_state  45155 non-null  object
 2   last_split  39692 non-null  object
 3   k_5         45155 non-null  object
 4   k_10        45155 non-null  object
 5   k_15        45155 non-null  object
 6   k_20        45155 non-null  object
 7   k_half      45155 non-null  object
 8   k_25        45155 non-null  object
 9   k_30        45155 non-null  object
 10  k_35        45155 non-null  object
 11  k_40        45155 non-null  object
 12  k_finish    45155 non-null  object
dtypes: object(13)
memory usage: 4.5+ MB


#### Full Dataset for London 2017

In [43]:
df_ldn_res    = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2017_res.csv")
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2017_splits.csv")
df_ldn_full   = pd.merge(df_ldn_res, df_ldn_splits, on="idp")

In [44]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45155 entries, 0 to 45154
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      45155 non-null  int64 
 1   age_cat     45154 non-null  object
 2   gender      45155 non-null  object
 3   half        39469 non-null  object
 4   finish      39282 non-null  object
 5   idp         45155 non-null  object
 6   race_state  45155 non-null  object
 7   last_split  39692 non-null  object
 8   k_5         45155 non-null  object
 9   k_10        45155 non-null  object
 10  k_15        45155 non-null  object
 11  k_20        45155 non-null  object
 12  k_half      45155 non-null  object
 13  k_25        45155 non-null  object
 14  k_30        45155 non-null  object
 15  k_35        45155 non-null  object
 16  k_40        45155 non-null  object
 17  k_finish    45155 non-null  object
dtypes: int64(1), object(17)
memory usage: 6.2+ MB


In [45]:
df_ldn_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time, pace(min/km) , and speed(km/h)
df_ldn_full = expand_splits(df_ldn_full)
# Drop the splits Lists.
df_ldn_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [46]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45155 entries, 0 to 45154
Data columns (total 38 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          45155 non-null  int64 
 1   age_cat         45154 non-null  object
 2   gender          45155 non-null  object
 3   half            39469 non-null  object
 4   finish          39282 non-null  object
 5   idp             45155 non-null  object
 6   race_state      45155 non-null  object
 7   last_split      39692 non-null  object
 8   k_5_time        45155 non-null  object
 9   k_5_pace        45155 non-null  object
 10  k_5_speed       45155 non-null  object
 11  k_10_time       45155 non-null  object
 12  k_10_pace       45155 non-null  object
 13  k_10_speed      45155 non-null  object
 14  k_15_time       45155 non-null  object
 15  k_15_pace       45155 non-null  object
 16  k_15_speed      45155 non-null  object
 17  k_20_time       45155 non-null  object
 18  k_20_p

In [47]:
df_ldn_full.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp,race_state,last_split,k_5_time,k_5_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,401,18-39,W,01:29:48,03:37:39,9999990F5ECC850000247AC9,Finished,Finish,00:20:09,04:02,...,11.93,02:39:22,05:13,11.51,03:24:13,08:59,6.69,03:37:39,06:08,9.80
1,402,18-39,W,01:35:52,03:16:16,9999990F5ECC850000247ACA,Finished,Finish,00:22:35,04:31,...,12.69,02:41:34,04:47,12.54,03:05:32,04:48,12.52,03:16:16,04:54,12.27
2,403,18-39,W,01:34:22,03:31:39,9999990F5ECC850000247ACB,Finished,Finish,00:21:20,04:16,...,10.21,02:51:51,05:50,10.30,03:18:43,05:23,11.17,03:31:39,05:54,10.18
3,404,18-39,W,01:46:03,03:29:27,9999990F5ECC850000247ACC,Finished,Finish,00:25:09,05:02,...,12.02,02:55:36,04:59,12.05,03:19:37,04:49,12.49,03:29:27,04:29,13.39
4,405,40-44,W,,,9999990F5ECC850000247ACD,Not Started,,'-','-',...,-,'-','-',-,'-','-',-,'-','-',-


In [22]:
df_ldn_full.to_csv(ldn_data_path+f"/{LDN_NAME}2017_full.csv", index=False)

In [48]:
# Delete dataframe to avoid accidentally overwriting the CSV file in next sections. 
# Use as seen fit.
del df_ldn_full, df_ldn_res, df_ldn_splits, ldn_data_path

### 2018

In [49]:
# Where the csv data will be stored.
ldn_data_path = f"Marathons_Data/Raw/London/London{YEAR_18}"

#### Result Pages

In [50]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ldn_pages_urls, ldn_res_settings = london.gen_res_scrap_info(YEAR_18, LDN_NUM_RESULTS, LDN_RES_FIELDS, 
                                                             ldn_data_path, show_settings=True)

Men Pages: 29 || Women Pages: 20
London 2018 total results pages: 49
Example URLs: 
 https://results.tcslondonmarathon.com/2018/?page=1&event=MAS&pid=search&sex=M&num_results=1000 
 https://results.tcslondonmarathon.com/2018/?page=1&event=MAS&pid=search&sex=W&num_results=1000
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2018/London2018_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'half', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
# The logs are shown as a visual guide to know when the crawler finished.
run_spider(london_spiders.LondonSpider1418, urls=ldn_pages_urls, settings=ldn_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: e913bb6ca0335b2c


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (47667 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2018/London2018_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 14192,
 'downloader/request_count': 49,
 'downloader/request_method_count/GET': 49,
 'downloader/response_bytes': 3436846,
 'downloader/response_count': 49,
 'downloader/response_status_count/200': 49,
 'elapsed_time_seconds': 22.011757,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 12, 21, 19, 9, 980745),
 'httpcompression/response_bytes': 36594270,
 'httpcompression/response_count': 49,
 'item_scraped_count': 47667,
 'log_count/INFO': 12,
 'memusage/max': 405831680,
 'memusage/startup': 405831680,
 'response_received_count': 49,
 'scheduler/dequeued': 49,
 'scheduler/dequeued/memory': 49,
 'scheduler/enqueued': 49,
 'scheduler/enqueued/memory': 49,
 'star

In [27]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2018_res.csv")

In [28]:
df_ldn_res.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp
0,32929,45-49,M,01:34:41,03:18:32,9999990F5ECC8E00002903A9
1,18787,18-39,M,,,9999990F5ECC8E0000291433
2,17454,45-49,M,01:44:17,04:44:50,9999990F5ECC8E000029197A
3,35750,18-39,M,02:31:31,05:34:25,9999990F5ECC8E0000293E8C
4,35751,18-39,M,02:31:31,05:34:25,9999990F5ECC8E0000293E8D


In [29]:
df_ldn_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47667 entries, 0 to 47666
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   47667 non-null  int64 
 1   age_cat  47652 non-null  object
 2   gender   47667 non-null  object
 3   half     40447 non-null  object
 4   finish   40096 non-null  object
 5   idp      47667 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.2+ MB


#### Splits Pages

In [51]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2018_res.csv")
ldn_splits_urls, ldn_splits_settings = london.gen_splits_scrap_info(YEAR_18, df_ldn_res["idp"].to_list(), LDN_SPLITS_FIELDS, 
                                                                    ldn_data_path, show_settings=True)

London 2018 total splits pages: 47667
Example URLs: 
 https://results.tcslondonmarathon.com/2018/?content=detail&fpid=search&pid=search&idp=9999990F5ECC8E00002903A9&lang=EN_CAP&event=MAS&&search_event=MAS 
 https://results.tcslondonmarathon.com/2018/?content=detail&fpid=search&pid=search&idp=9999990F5ECC8E000028FAFA&lang=EN_CAP&event=MAS&&search_event=MAS
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2018/London2018_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [31]:
run_spider(london_spiders.LondonSpider1418, urls=ldn_splits_urls, settings=ldn_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 96ff49209477e2b3


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9672 pages (at 9672 pages/min), scraped 9656 items (at 9656 items/min)
INFO: Crawled 19683 pages (at 10011 pages/min), scraped 19664 items (at 10008 items/min)
INFO: Crawled 29781 pages (at 10098 pages/min), scraped 29761 items (at 10097 items/min)
INFO: Crawled 39788 pages (at 10007 pages/min), scraped 39765 items (at 10004 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (47667 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2018/London2018_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 16445115,
 'downloader/request_count': 47667,
 'downloader/request_method_count/GET': 47667,
 'downloader/response_bytes': 289217363,
 'downloader/response_count': 47667,
 'downloader/response_status_count/200': 47667,
 'elapsed_time_seconds': 288.338963,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 12, 21, 

In [32]:
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2018_splits.csv")

In [33]:
df_ldn_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47667 entries, 0 to 47666
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         47667 non-null  object
 1   race_state  47667 non-null  object
 2   last_split  40773 non-null  object
 3   k_5         47667 non-null  object
 4   k_10        47667 non-null  object
 5   k_15        47667 non-null  object
 6   k_20        47667 non-null  object
 7   k_half      47667 non-null  object
 8   k_25        47667 non-null  object
 9   k_30        47667 non-null  object
 10  k_35        47667 non-null  object
 11  k_40        47667 non-null  object
 12  k_finish    47667 non-null  object
dtypes: object(13)
memory usage: 4.7+ MB


#### Full Dataset for London 2018

In [52]:
df_ldn_res    = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2018_res.csv")
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2018_splits.csv")
df_ldn_full   = pd.merge(df_ldn_res, df_ldn_splits, on="idp")

In [53]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47667 entries, 0 to 47666
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      47667 non-null  int64 
 1   age_cat     47652 non-null  object
 2   gender      47667 non-null  object
 3   half        40447 non-null  object
 4   finish      40096 non-null  object
 5   idp         47667 non-null  object
 6   race_state  47667 non-null  object
 7   last_split  40773 non-null  object
 8   k_5         47667 non-null  object
 9   k_10        47667 non-null  object
 10  k_15        47667 non-null  object
 11  k_20        47667 non-null  object
 12  k_half      47667 non-null  object
 13  k_25        47667 non-null  object
 14  k_30        47667 non-null  object
 15  k_35        47667 non-null  object
 16  k_40        47667 non-null  object
 17  k_finish    47667 non-null  object
dtypes: int64(1), object(17)
memory usage: 6.5+ MB


In [54]:
df_ldn_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time, pace(min/km) , and speed(km/h)
df_ldn_full = expand_splits(df_ldn_full)
# Drop the splits Lists.
df_ldn_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [55]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47667 entries, 0 to 47666
Data columns (total 38 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          47667 non-null  int64 
 1   age_cat         47652 non-null  object
 2   gender          47667 non-null  object
 3   half            40447 non-null  object
 4   finish          40096 non-null  object
 5   idp             47667 non-null  object
 6   race_state      47667 non-null  object
 7   last_split      40773 non-null  object
 8   k_5_time        47667 non-null  object
 9   k_5_pace        47667 non-null  object
 10  k_5_speed       47667 non-null  object
 11  k_10_time       47667 non-null  object
 12  k_10_pace       47667 non-null  object
 13  k_10_speed      47667 non-null  object
 14  k_15_time       47667 non-null  object
 15  k_15_pace       47667 non-null  object
 16  k_15_speed      47667 non-null  object
 17  k_20_time       47667 non-null  object
 18  k_20_p

In [56]:
df_ldn_full.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp,race_state,last_split,k_5_time,k_5_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,261,70-74,W,02:08:22,04:44:49,9999990F5ECC8E000029A362,Finished,Finish,00:29:05,05:49,...,8.41,03:49:36,07:38,7.87,04:28:08,07:43,7.79,04:44:49,07:37,7.89
1,401,18-39,W,,,9999990F5ECC8E0000290ECD,Not Started,,'-','-',...,-,'-','-',-,'-','-',-,'-','-',-
2,402,18-39,W,01:33:13,03:15:10,9999990F5ECC8E0000290ECE,Finished,Finish,00:21:02,04:13,...,12.62,02:39:50,04:57,12.13,03:04:32,04:57,12.15,03:15:10,04:51,12.39
3,403,45-49,W,,,9999990F5ECC8E0000290ECF,Not Started,,'-','-',...,-,'-','-',-,'-','-',-,'-','-',-
4,404,40-44,W,01:40:39,03:27:43,9999990F5ECC8E0000290ED0,Finished,Finish,00:23:41,04:45,...,12.18,02:50:22,05:11,11.58,03:15:57,05:07,11.73,03:27:43,05:22,11.19


In [39]:
df_ldn_full.to_csv(ldn_data_path+f"/{LDN_NAME}2018_full.csv", index=False)

In [57]:
# Delete dataframe to avoid accidentally overwriting the CSV file in next sections. 
# Use as seen fit.
del df_ldn_full, df_ldn_res, df_ldn_splits, ldn_data_path

### 2019

In [13]:
# Where the csv data will be stored.
ldn_data_path = f"Marathons_Data/Raw/London/London{YEAR_19}"

#### Results Pages

In [14]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ldn_pages_urls, ldn_res_settings = london.gen_res_scrap_info(YEAR_19, LDN_NUM_RESULTS, LDN_RES_FIELDS, 
                                                             ldn_data_path, show_settings=True)

Men Pages: 29 || Women Pages: 21
London 2019 total results pages: 50
Example URLs: 
 https://results.tcslondonmarathon.com/2019/?page=1&event=MAS&pid=search&sex=M&num_results=1000 
 https://results.tcslondonmarathon.com/2019/?page=1&event=MAS&pid=search&sex=W&num_results=1000
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2019/London2019_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'half', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [15]:
# The logs are shown as a visual guide to know when the crawler finished.
run_spider(london_spiders.LondonSpider1923, urls=ldn_pages_urls, settings=ldn_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: ce057d784f4cf88c


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (49318 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2019/London2019_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 14482,
 'downloader/request_count': 50,
 'downloader/request_method_count/GET': 50,
 'downloader/response_bytes': 4717673,
 'downloader/response_count': 50,
 'downloader/response_status_count/200': 50,
 'elapsed_time_seconds': 24.909375,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 16, 21, 337006),
 'httpcompression/response_bytes': 101990400,
 'httpcompression/response_count': 50,
 'item_scraped_count': 49318,
 'log_count/INFO': 12,
 'memusage/max': 337575936,
 'memusage/startup': 337575936,
 'response_received_count': 50,
 'scheduler/dequeued': 50,
 'scheduler/dequeued/memory': 50,
 'scheduler/enqueued': 50,
 'scheduler/enqueued/memory': 50,
 'st

In [16]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2019_res.csv")

In [12]:
df_ldn_res.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp
0,29714,18-39,M,01:13:27,02:27:32,9999990F5ECC9700002C4389
1,37402,18-39,M,01:44:00,04:16:02,9999990F5ECC9700002C65FE
2,20195,40-44,M,,,9999990F5ECC9700002C3AB4
3,15271,18-39,M,02:05:19,04:12:49,9999990F5ECC9700002C2B5A
4,16509,50-54,M,01:59:03,04:49:38,9999990F5ECC9700002C2DED


In [13]:
df_ldn_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49318 entries, 0 to 49317
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   49318 non-null  int64 
 1   age_cat  49318 non-null  object
 2   gender   49318 non-null  object
 3   half     42597 non-null  object
 4   finish   42562 non-null  object
 5   idp      49318 non-null  object
dtypes: int64(1), object(5)
memory usage: 2.3+ MB


#### Splits Pages

In [17]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2019_res.csv")
ldn_splits_urls, ldn_splits_settings = london.gen_splits_scrap_info(YEAR_19, df_ldn_res["idp"].to_list(), LDN_SPLITS_FIELDS, 
                                                                    ldn_data_path, show_settings=True)

London 2019 total splits pages: 49318
Example URLs: 
 https://results.tcslondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002C36BD&lang=EN_CAP&event=MAS&&search_event=MAS 
 https://results.tcslondonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=9999990F5ECC9700002CCC11&lang=EN_CAP&event=MAS&&search_event=MAS
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2019/London2019_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(london_spiders.LondonSpider1923, urls=ldn_splits_urls, settings=ldn_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: c2d312e82a07ae55


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (100 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2019/London2019_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 34500,
 'downloader/request_count': 100,
 'downloader/request_method_count/GET': 100,
 'downloader/response_bytes': 813736,
 'downloader/response_count': 100,
 'downloader/response_status_count/200': 100,
 'elapsed_time_seconds': 1.007771,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 16, 48, 456290),
 'httpcompression/response_bytes': 2904065,
 'httpcompression/response_count': 100,
 'item_scraped_count': 100,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 100,
 'scheduler/dequeued': 100,
 'scheduler/dequeued/memory': 100,
 'scheduler/enqueued': 100,
 'scheduler/enqueued/memory': 100,

In [16]:
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2019_splits.csv")

In [17]:
df_ldn_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49318 entries, 0 to 49317
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         49318 non-null  object
 1   race_state  49318 non-null  object
 2   last_split  42840 non-null  object
 3   k_5         49318 non-null  object
 4   k_10        49318 non-null  object
 5   k_15        49318 non-null  object
 6   k_20        49318 non-null  object
 7   k_half      49318 non-null  object
 8   k_25        49318 non-null  object
 9   k_30        49318 non-null  object
 10  k_35        49318 non-null  object
 11  k_40        49318 non-null  object
 12  k_finish    49318 non-null  object
dtypes: object(13)
memory usage: 4.9+ MB


#### Full Dataset for London 2019

In [63]:
df_ldn_res    = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2019_res.csv")
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2019_splits.csv")
df_ldn_full   = pd.merge(df_ldn_res, df_ldn_splits, on="idp")

In [64]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49318 entries, 0 to 49317
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      49318 non-null  int64 
 1   age_cat     49318 non-null  object
 2   gender      49318 non-null  object
 3   half        42597 non-null  object
 4   finish      42562 non-null  object
 5   idp         49318 non-null  object
 6   race_state  49318 non-null  object
 7   last_split  42840 non-null  object
 8   k_5         49318 non-null  object
 9   k_10        49318 non-null  object
 10  k_15        49318 non-null  object
 11  k_20        49318 non-null  object
 12  k_half      49318 non-null  object
 13  k_25        49318 non-null  object
 14  k_30        49318 non-null  object
 15  k_35        49318 non-null  object
 16  k_40        49318 non-null  object
 17  k_finish    49318 non-null  object
dtypes: int64(1), object(17)
memory usage: 6.8+ MB


In [65]:
df_ldn_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time, pace(min/km) , and speed(km/h)
df_ldn_full = expand_splits(df_ldn_full)
# Drop the splits Lists.
df_ldn_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [66]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49318 entries, 0 to 49317
Data columns (total 38 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          49318 non-null  int64 
 1   age_cat         49318 non-null  object
 2   gender          49318 non-null  object
 3   half            42597 non-null  object
 4   finish          42562 non-null  object
 5   idp             49318 non-null  object
 6   race_state      49318 non-null  object
 7   last_split      42840 non-null  object
 8   k_5_time        49318 non-null  object
 9   k_5_pace        49318 non-null  object
 10  k_5_speed       49318 non-null  object
 11  k_10_time       49318 non-null  object
 12  k_10_pace       49318 non-null  object
 13  k_10_speed      49318 non-null  object
 14  k_15_time       49318 non-null  object
 15  k_15_pace       49318 non-null  object
 16  k_15_speed      49318 non-null  object
 17  k_20_time       49318 non-null  object
 18  k_20_p

In [67]:
df_ldn_full.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp,race_state,last_split,k_5_time,k_5_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,401,18-39,W,01:35:22,03:16:27,9999990F5ECC9700002C2E1F,Finished,Finish,00:22:23,04:29,...,12.37,02:42:05,04:55,12.24,03:06:03,04:48,12.52,03:16:27,04:45,12.66
1,403,18-39,W,01:29:44,03:07:05,9999990F5ECC9700002C2E20,Finished,Finish,00:21:11,04:15,...,13.68,02:31:12,04:36,13.06,02:56:10,05:00,12.02,03:07:05,04:59,12.06
2,405,18-39,W,01:29:50,03:11:18,9999990F5ECC9700002C2E21,Finished,Finish,00:20:15,04:03,...,12.69,02:35:38,04:54,12.27,03:00:06,04:54,12.26,03:11:18,05:07,11.76
3,406,50-54,W,,,9999990F5ECC9700002C2E22,Not Started,,-,-,...,-,-,-,-,-,-,-,-,-,-
4,407,50-54,W,,,9999990F5ECC9700002C2E23,Not Started,,-,-,...,-,-,-,-,-,-,-,-,-,-


In [23]:
df_ldn_full.to_csv(ldn_data_path+f"/{LDN_NAME}2019_full.csv", index=False)

In [68]:
# Delete dataframe to avoid accidentally overwriting the CSV file in next sections. 
# Use as seen fit.
del df_ldn_full, df_ldn_res, df_ldn_splits, ldn_data_path

### 2021

In [69]:
# Where the csv data will be stored.
ldn_data_path = f"Marathons_Data/Raw/London/London{YEAR_21}"

#### Results Pages

In [70]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ldn_pages_urls, ldn_res_settings = london.gen_res_scrap_info(YEAR_21, LDN_NUM_RESULTS, LDN_RES_FIELDS, 
                                                             ldn_data_path, show_settings=True)

Men Pages: 25 || Women Pages: 17
London 2021 total results pages: 42
Example URLs: 
 https://results.tcslondonmarathon.com/2021/?page=1&event=MAS&pid=search&sex=M&num_results=1000 
 https://results.tcslondonmarathon.com/2021/?page=1&event=MAS&pid=search&sex=W&num_results=1000
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2021/London2021_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'half', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(london_spiders.LondonSpider1923, urls=ldn_pages_urls, settings=ldn_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 63146b3b6a40634c
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 34 pages (at 34 pages/min), scraped 33878 items (at 33878 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (41594 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2021/London2021_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 12162,
 'downloader/request_count': 42,
 'downloader/request_method_count/GET': 42,
 'downloader/response_bytes': 4816142,
 'downloader/response_count': 42,
 'downloader/response_status_count/200': 42,
 'elapsed_time_seconds': 65.663932,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 13, 12, 24, 46, 264781),
 'httpcompression/response_bytes': 93854390,
 'httpcompression/response_count': 42,
 'item_scraped_count': 41594,
 'log_count/INFO': 13,
 'memusage/max': 625573888,
 'memusage/startup': 237027328,
 'response_received_count': 42,
 'scheduler/dequeued': 42,
 'scheduler/deque

In [10]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2021_res.csv")

In [11]:
df_ldn_res.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp
0,40546.0,18-39,M,01:44:43,03:59:33,T8C2O3HQ31F33D
1,23235.0,45-49,M,02:32:05,06:22:20,T8C2O3HQ31E95A
2,52134.0,50-54,M,,,T8C2O3HQ31E2DA
3,24700.0,40-44,M,01:23:26,,T8C2O3HQ323DAC
4,7204.0,55-59,M,02:39:20,06:18:08,T8C2O3HQ326EB8


In [12]:
df_ldn_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41594 entries, 0 to 41593
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   run_no   41568 non-null  float64
 1   age_cat  41589 non-null  object 
 2   gender   41594 non-null  object 
 3   half     36034 non-null  object 
 4   finish   35891 non-null  object 
 5   idp      41594 non-null  object 
dtypes: float64(1), object(5)
memory usage: 1.9+ MB


#### Splits Pages

In [71]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2021_res.csv")
ldn_splits_urls, ldn_splits_settings = london.gen_splits_scrap_info(YEAR_21, df_ldn_res["idp"].to_list(), LDN_SPLITS_FIELDS, 
                                                                    ldn_data_path, show_settings=True)

London 2021 total splits pages: 41594
Example URLs: 
 https://results.tcslondonmarathon.com/2021/?content=detail&fpid=search&pid=search&idp=T8C2O3HQ31F33D&lang=EN_CAP&event=MAS&&search_event=MAS 
 https://results.tcslondonmarathon.com/2021/?content=detail&fpid=search&pid=search&idp=T8C2O3HQ320EB9&lang=EN_CAP&event=MAS&&search_event=MAS
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2021/London2021_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(london_spiders.LondonSpider1923, urls=ldn_splits_urls, settings=ldn_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: c8e2e7043a64149e
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 6622 pages (at 6622 pages/min), scraped 6607 items (at 6607 items/min)
INFO: Crawled 13390 pages (at 6768 pages/min), scraped 13374 items (at 6767 items/min)
INFO: Crawled 20069 pages (at 6679 pages/min), scraped 20062 items (at 6688 items/min)
INFO: Crawled 26549 pages (at 6480 pages/min), scraped 26541 items (at 6479 items/min)
INFO: Crawled 33368 pages (at 6819 pages/min), scraped 33356 items (at 6815 items/min)
INFO: Crawled 40315 pages (at 6947 pages/min), scraped 40315 items (at 6959 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (41594 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2021/London2021_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 13933990,
 'downloader/request_count': 41594,
 'downloader/request_method_count/GET': 41594,
 'downloader/response_bytes': 341643725,
 'downloader/response_count': 41594,
 'downloader/response_status_count/200': 41594

In [15]:
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2021_splits.csv")

In [16]:
df_ldn_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41594 entries, 0 to 41593
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         41594 non-null  object
 1   race_state  41594 non-null  object
 2   last_split  36149 non-null  object
 3   k_5         41594 non-null  object
 4   k_10        41594 non-null  object
 5   k_15        41594 non-null  object
 6   k_20        41594 non-null  object
 7   k_half      41594 non-null  object
 8   k_25        41594 non-null  object
 9   k_30        41594 non-null  object
 10  k_35        41594 non-null  object
 11  k_40        41594 non-null  object
 12  k_finish    41594 non-null  object
dtypes: object(13)
memory usage: 4.1+ MB


#### Full Dataset for London 2021

In [72]:
df_ldn_res    = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2021_res.csv")
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2021_splits.csv")
df_ldn_full   = pd.merge(df_ldn_res, df_ldn_splits, on="idp")

In [73]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41594 entries, 0 to 41593
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   run_no      41568 non-null  float64
 1   age_cat     41589 non-null  object 
 2   gender      41594 non-null  object 
 3   half        36034 non-null  object 
 4   finish      35891 non-null  object 
 5   idp         41594 non-null  object 
 6   race_state  41594 non-null  object 
 7   last_split  36149 non-null  object 
 8   k_5         41594 non-null  object 
 9   k_10        41594 non-null  object 
 10  k_15        41594 non-null  object 
 11  k_20        41594 non-null  object 
 12  k_half      41594 non-null  object 
 13  k_25        41594 non-null  object 
 14  k_30        41594 non-null  object 
 15  k_35        41594 non-null  object 
 16  k_40        41594 non-null  object 
 17  k_finish    41594 non-null  object 
dtypes: float64(1), object(17)
memory usage: 5.7+ MB


In [74]:
df_ldn_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time, pace(min/km) , and speed(km/h)
df_ldn_full = expand_splits(df_ldn_full)
# Drop the splits Lists.
df_ldn_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [75]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41594 entries, 0 to 41593
Data columns (total 38 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   run_no          41568 non-null  float64
 1   age_cat         41589 non-null  object 
 2   gender          41594 non-null  object 
 3   half            36034 non-null  object 
 4   finish          35891 non-null  object 
 5   idp             41594 non-null  object 
 6   race_state      41594 non-null  object 
 7   last_split      36149 non-null  object 
 8   k_5_time        41594 non-null  object 
 9   k_5_pace        41594 non-null  object 
 10  k_5_speed       41594 non-null  object 
 11  k_10_time       41594 non-null  object 
 12  k_10_pace       41594 non-null  object 
 13  k_10_speed      41594 non-null  object 
 14  k_15_time       41594 non-null  object 
 15  k_15_pace       41594 non-null  object 
 16  k_15_speed      41594 non-null  object 
 17  k_20_time       41594 non-null 

In [76]:
df_ldn_full.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp,race_state,last_split,k_5_time,k_5_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,186.0,18-39,W,01:34:26,03:13:25,T8C2O3HQ324725,Finished,Finish,00:21:42,04:21,...,13.09,02:38:46,04:46,12.60,03:02:53,04:50,12.44,03:13:25,04:48,12.50
1,188.0,18-39,W,,,T8C2O3HQ324721,Not Started,,-,-,...,-,-,-,-,-,-,-,-,-,-
2,190.0,18-39,W,,,T8C2O3HQ32471D,Not Started,,-,-,...,-,-,-,-,-,-,-,-,-,-
3,191.0,40-44,W,01:33:55,03:15:41,T8C2O3HQ324714,Finished,Finish,00:21:52,04:23,...,12.71,02:39:59,04:55,12.24,03:04:51,04:59,12.06,03:15:41,04:57,12.16
4,192.0,18-39,W,,,T8C2O3HQ324712,Not Started,,-,-,...,-,-,-,-,-,-,-,-,-,-


In [23]:
df_ldn_full.to_csv(ldn_data_path+f"/{LDN_NAME}2021_full.csv", index=False)

In [77]:
# Delete dataframe to avoid accidentally overwriting the CSV file in next sections. 
# Use as seen fit.
del df_ldn_full, df_ldn_res, df_ldn_splits, ldn_data_path

### 2022

In [78]:
# Where the csv data will be stored.
ldn_data_path = f"Marathons_Data/Raw/London/London{YEAR_22}"

#### Results Pages

In [79]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ldn_pages_urls, ldn_res_settings = london.gen_res_scrap_info(YEAR_22, LDN_NUM_RESULTS, LDN_RES_FIELDS, 
                                                             ldn_data_path, show_settings=True)

Men Pages: 28 || Women Pages: 20
London 2022 total results pages: 48
Example URLs: 
 https://results.tcslondonmarathon.com/2022/?page=1&event=MAS&pid=search&sex=M&num_results=1000 
 https://results.tcslondonmarathon.com/2022/?page=1&event=MAS&pid=search&sex=W&num_results=1000
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2022/London2022_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'half', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [27]:
# The logs are shown as a visual guide to know when the crawler finished.
run_spider(london_spiders.LondonSpider1923, urls=ldn_pages_urls, settings=ldn_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: b0bfc6e940979abf


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (46993 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2022/London2022_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 13902,
 'downloader/request_count': 48,
 'downloader/request_method_count/GET': 48,
 'downloader/response_bytes': 5176075,
 'downloader/response_count': 48,
 'downloader/response_status_count/200': 48,
 'elapsed_time_seconds': 30.162948,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 13, 12, 44, 8, 485788),
 'httpcompression/response_bytes': 105042990,
 'httpcompression/response_count': 48,
 'item_scraped_count': 46993,
 'log_count/INFO': 12,
 'memusage/max': 625573888,
 'memusage/startup': 625573888,
 'response_received_count': 48,
 'scheduler/dequeued': 48,
 'scheduler/dequeued/memory': 48,
 'scheduler/enqueued': 48,
 'scheduler/enqueued/memory': 48,
 'sta

In [28]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2022_res.csv")

In [31]:
df_ldn_res.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp
0,43252.0,60-64,M,01:56:13,03:56:13,T8C2O3HQ39DA47
1,31230.0,55-59,M,01:55:19,04:09:15,T8C2O3HQ3A38CA
2,3707.0,50-54,M,02:01:23,04:46:02,T8C2O3HQ3A1855
3,48414.0,50-54,M,02:29:51,05:46:00,T8C2O3HQ3A531B
4,35597.0,18-39,M,01:57:26,04:13:11,T8C2O3HQ3A50DA


In [32]:
df_ldn_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46993 entries, 0 to 46992
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   run_no   46984 non-null  float64
 1   age_cat  46987 non-null  object 
 2   gender   46993 non-null  object 
 3   half     40708 non-null  object 
 4   finish   40618 non-null  object 
 5   idp      46993 non-null  object 
dtypes: float64(1), object(5)
memory usage: 2.2+ MB


#### Splits Pages

In [80]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2022_res.csv")
ldn_splits_urls, ldn_splits_settings = london.gen_splits_scrap_info(YEAR_22, df_ldn_res["idp"].to_list(), LDN_SPLITS_FIELDS, 
                                                                    ldn_data_path, show_settings=True)

London 2022 total splits pages: 46993
Example URLs: 
 https://results.tcslondonmarathon.com/2022/?content=detail&fpid=search&pid=search&idp=T8C2O3HQ39DA47&lang=EN_CAP&event=MAS&&search_event=MAS 
 https://results.tcslondonmarathon.com/2022/?content=detail&fpid=search&pid=search&idp=T8C2O3HQ3A72B0&lang=EN_CAP&event=MAS&&search_event=MAS
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2022/London2022_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
# run_spider(london_spiders.LondonSpider1418, urls=ldn_splits_urls, settings=ldn_splits_settings, splits=True)
run_spider(london_spiders.LondonSpider1923, urls=ldn_splits_urls, settings=ldn_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: ef941ad197b3c74f


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 7059 pages (at 7059 pages/min), scraped 7045 items (at 7045 items/min)
INFO: Crawled 13993 pages (at 6934 pages/min), scraped 13980 items (at 6935 items/min)
INFO: Crawled 21120 pages (at 7127 pages/min), scraped 21097 items (at 7117 items/min)
INFO: Crawled 28155 pages (at 7035 pages/min), scraped 28141 items (at 7044 items/min)
INFO: Crawled 35135 pages (at 6980 pages/min), scraped 35118 items (at 6977 items/min)
INFO: Crawled 41932 pages (at 6797 pages/min), scraped 41910 items (at 6792 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (46993 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2022/London2022_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 15742655,
 'downloader/request_count': 46993,
 'downloader/request_method_count/GET': 46993,
 'downloader/response_bytes': 382880986,
 'downloader/response_count': 46993,
 'downloader/response_status_count/200': 46993

In [35]:
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2022_splits.csv")

In [36]:
df_ldn_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46993 entries, 0 to 46992
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         46993 non-null  object
 1   race_state  46993 non-null  object
 2   last_split  40858 non-null  object
 3   k_5         46993 non-null  object
 4   k_10        46993 non-null  object
 5   k_15        46993 non-null  object
 6   k_20        46993 non-null  object
 7   k_half      46993 non-null  object
 8   k_25        46993 non-null  object
 9   k_30        46993 non-null  object
 10  k_35        46993 non-null  object
 11  k_40        46993 non-null  object
 12  k_finish    46993 non-null  object
dtypes: object(13)
memory usage: 4.7+ MB


#### Full Dataset for London 2022

In [81]:
df_ldn_res    = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2022_res.csv")
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2022_splits.csv")
df_ldn_full   = pd.merge(df_ldn_res, df_ldn_splits, on="idp")

In [82]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46993 entries, 0 to 46992
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   run_no      46984 non-null  float64
 1   age_cat     46987 non-null  object 
 2   gender      46993 non-null  object 
 3   half        40708 non-null  object 
 4   finish      40618 non-null  object 
 5   idp         46993 non-null  object 
 6   race_state  46993 non-null  object 
 7   last_split  40858 non-null  object 
 8   k_5         46993 non-null  object 
 9   k_10        46993 non-null  object 
 10  k_15        46993 non-null  object 
 11  k_20        46993 non-null  object 
 12  k_half      46993 non-null  object 
 13  k_25        46993 non-null  object 
 14  k_30        46993 non-null  object 
 15  k_35        46993 non-null  object 
 16  k_40        46993 non-null  object 
 17  k_finish    46993 non-null  object 
dtypes: float64(1), object(17)
memory usage: 6.5+ MB


In [83]:
df_ldn_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time, pace(min/km) , and speed(km/h)
df_ldn_full = expand_splits(df_ldn_full)
# Drop the splits Lists.
df_ldn_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [84]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46993 entries, 0 to 46992
Data columns (total 38 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   run_no          46984 non-null  float64
 1   age_cat         46987 non-null  object 
 2   gender          46993 non-null  object 
 3   half            40708 non-null  object 
 4   finish          40618 non-null  object 
 5   idp             46993 non-null  object 
 6   race_state      46993 non-null  object 
 7   last_split      40858 non-null  object 
 8   k_5_time        46993 non-null  object 
 9   k_5_pace        46993 non-null  object 
 10  k_5_speed       46993 non-null  object 
 11  k_10_time       46993 non-null  object 
 12  k_10_pace       46993 non-null  object 
 13  k_10_speed      46993 non-null  object 
 14  k_15_time       46993 non-null  object 
 15  k_15_pace       46993 non-null  object 
 16  k_15_speed      46993 non-null  object 
 17  k_20_time       46993 non-null 

In [85]:
df_ldn_full.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp,race_state,last_split,k_5_time,k_5_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,300.0,45-49,W,01:18:08,02:41:40,T8C2O3HQ39D7EC,Finished,Finish,00:18:17,03:40,...,15.83,02:11:07,03:56,15.29,02:31:53,04:10,14.45,02:41:40,04:28,13.46
1,301.0,40-44,W,01:21:11,02:43:28,T8C2O3HQ39B628,Finished,Finish,00:19:06,03:50,...,15.46,02:15:18,03:57,15.24,02:34:59,03:57,15.24,02:43:28,03:52,15.52
2,302.0,40-44,W,01:31:22,03:30:33,T8C2O3HQ3A6B46,Finished,Finish,00:20:36,04:08,...,10.45,02:52:25,06:04,9.91,03:20:35,05:38,10.65,03:30:33,04:33,13.21
3,304.0,45-49,W,01:23:53,02:49:24,T8C2O3HQ3A644C,Finished,Finish,00:19:43,03:57,...,14.93,02:20:03,04:07,14.60,02:40:43,04:08,14.52,02:49:24,03:58,15.17
4,305.0,45-49,W,01:28:40,,T8C2O3HQ3A6694,Started,30K,00:20:50,04:10,...,13.11,-,-,-,-,-,-,-,-,-


In [42]:
df_ldn_full.to_csv(ldn_data_path+f"/{LDN_NAME}2022_full.csv", index=False)

In [86]:
# Delete dataframe to avoid accidentally overwriting the CSV file in next sections. 
# Use as seen fit.
del df_ldn_full, df_ldn_res, df_ldn_splits, ldn_data_path

### 2023

In [87]:
# Where the csv data will be stored.
ldn_data_path = f"Marathons_Data/Raw/London/London{YEAR_23}"

#### Results Pages

In [88]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ldn_pages_urls, ldn_res_settings = london.gen_res_scrap_info(YEAR_23, LDN_NUM_RESULTS, LDN_RES_FIELDS, 
                                                             ldn_data_path, show_settings=True)

Men Pages: 32 || Women Pages: 22
London 2023 total results pages: 54
Example URLs: 
 https://results.tcslondonmarathon.com/2023/?page=1&event=MAS&pid=search&sex=M&num_results=1000 
 https://results.tcslondonmarathon.com/2023/?page=1&event=MAS&pid=search&sex=W&num_results=1000
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2023/London2023_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'half', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
# The logs are shown as a visual guide to know when the crawler finished.
run_spider(london_spiders.LondonSpider1923, urls=ldn_pages_urls, settings=ldn_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 177068123d240423


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (53077 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2023/London2023_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 15642,
 'downloader/request_count': 54,
 'downloader/request_method_count/GET': 54,
 'downloader/response_bytes': 5845274,
 'downloader/response_count': 54,
 'downloader/response_status_count/200': 54,
 'elapsed_time_seconds': 55.078509,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 13, 13, 34, 7, 727013),
 'httpcompression/response_bytes': 119217148,
 'httpcompression/response_count': 54,
 'item_scraped_count': 53077,
 'log_count/INFO': 12,
 'memusage/max': 1269514240,
 'memusage/startup': 1269514240,
 'response_received_count': 54,
 'scheduler/dequeued': 54,
 'scheduler/dequeued/memory': 54,
 'scheduler/enqueued': 54,
 'scheduler/enqueued/memory': 54,
 's

In [47]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2023_res.csv")

In [49]:
df_ldn_res

Unnamed: 0,run_no,age_cat,gender,half,finish,idp
0,42293.0,18-39,M,01:27:25,02:53:54,T8C2O3HQ40137E
1,42319.0,18-39,M,01:27:26,02:53:54,T8C2O3HQ3FC6B1
2,22818.0,45-49,M,02:30:31,05:00:25,T8C2O3HQ3F9292
3,13166.0,18-39,M,01:51:41,03:51:43,T8C2O3HQ3FF251
4,12349.0,18-39,M,01:49:50,03:58:19,T8C2O3HQ3FD463
...,...,...,...,...,...,...
53072,13243.0,18-39,W,02:02:27,04:10:16,T8C2O3HQ3FD35C
53073,22194.0,45-49,W,,,T8C2O3HQ3F37D3
53074,38943.0,40-44,W,02:14:18,04:31:59,T8C2O3HQ40218D
53075,19656.0,18-39,W,02:01:02,04:12:39,T8C2O3HQ3F5CB3


In [50]:
df_ldn_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53077 entries, 0 to 53076
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   run_no   53072 non-null  float64
 1   age_cat  53076 non-null  object 
 2   gender   53077 non-null  object 
 3   half     48653 non-null  object 
 4   finish   48670 non-null  object 
 5   idp      53077 non-null  object 
dtypes: float64(1), object(5)
memory usage: 2.4+ MB


#### Splits Pages

In [89]:
df_ldn_res = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2023_res.csv")
ldn_splits_urls, ldn_splits_settings = london.gen_splits_scrap_info(YEAR_23, df_ldn_res["idp"].to_list(), LDN_SPLITS_FIELDS, 
                                                                    ldn_data_path, show_settings=True)

London 2023 total splits pages: 53077
Example URLs: 
 https://results.tcslondonmarathon.com/2023/?content=detail&fpid=search&pid=search&idp=T8C2O3HQ40137E&lang=EN_CAP&event=MAS&&search_event=MAS 
 https://results.tcslondonmarathon.com/2023/?content=detail&fpid=search&pid=search&idp=T8C2O3HQ3F6682&lang=EN_CAP&event=MAS&&search_event=MAS
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2023/London2023_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(london_spiders.LondonSpider1923, urls=ldn_splits_urls, settings=ldn_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 76972c3294e0306d
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 7112 pages (at 7112 pages/min), scraped 7105 items (at 7105 items/min)
INFO: Crawled 14289 pages (at 7177 pages/min), scraped 14275 items (at 7170 items/min)
INFO: Crawled 21172 pages (at 6883 pages/min), scraped 21157 items (at 6882 items/min)
INFO: Crawled 28333 pages (at 7161 pages/min), scraped 28318 items (at 7161 items/min)
INFO: Crawled 35778 pages (at 7445 pages/min), scraped 35767 items (at 7449 items/min)
INFO: Crawled 43551 pages (at 7773 pages/min), scraped 43538 items (at 7771 items/min)
INFO: Crawled 51500 pages (at 7949 pages/min), scraped 51485 items (at 7947 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (53077 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/London/London2023/London2023_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 17780795,
 'downloader/request_count': 53077,
 'downloader/request_method_count/GET': 53077,
 'downloader/response_bytes': 4370382

In [54]:
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2023_splits.csv")

In [55]:
df_ldn_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53077 entries, 0 to 53076
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         53077 non-null  object
 1   race_state  53077 non-null  object
 2   last_split  49122 non-null  object
 3   k_5         53075 non-null  object
 4   k_10        53075 non-null  object
 5   k_15        53075 non-null  object
 6   k_20        53075 non-null  object
 7   k_half      53075 non-null  object
 8   k_25        53075 non-null  object
 9   k_30        53075 non-null  object
 10  k_35        53075 non-null  object
 11  k_40        53075 non-null  object
 12  k_finish    53075 non-null  object
dtypes: object(13)
memory usage: 5.3+ MB


#### Full Dataset for London 2023

In [90]:
df_ldn_res    = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2023_res.csv")
df_ldn_splits = pd.read_csv(ldn_data_path+f"/{LDN_NAME}2023_splits.csv")
df_ldn_full   = pd.merge(df_ldn_res, df_ldn_splits, on="idp")

In [91]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53077 entries, 0 to 53076
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   run_no      53072 non-null  float64
 1   age_cat     53076 non-null  object 
 2   gender      53077 non-null  object 
 3   half        48653 non-null  object 
 4   finish      48670 non-null  object 
 5   idp         53077 non-null  object 
 6   race_state  53077 non-null  object 
 7   last_split  49122 non-null  object 
 8   k_5         53075 non-null  object 
 9   k_10        53075 non-null  object 
 10  k_15        53075 non-null  object 
 11  k_20        53075 non-null  object 
 12  k_half      53075 non-null  object 
 13  k_25        53075 non-null  object 
 14  k_30        53075 non-null  object 
 15  k_35        53075 non-null  object 
 16  k_40        53075 non-null  object 
 17  k_finish    53075 non-null  object 
dtypes: float64(1), object(17)
memory usage: 7.3+ MB


In [92]:
df_ldn_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time, pace(min/km) , and speed(km/h)
df_ldn_full = expand_splits(df_ldn_full)
# Drop the splits Lists.
df_ldn_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [93]:
df_ldn_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53077 entries, 0 to 53076
Data columns (total 38 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   run_no          53072 non-null  float64
 1   age_cat         53076 non-null  object 
 2   gender          53077 non-null  object 
 3   half            48653 non-null  object 
 4   finish          48670 non-null  object 
 5   idp             53077 non-null  object 
 6   race_state      53077 non-null  object 
 7   last_split      49122 non-null  object 
 8   k_5_time        53075 non-null  object 
 9   k_5_pace        53075 non-null  object 
 10  k_5_speed       53075 non-null  object 
 11  k_10_time       53075 non-null  object 
 12  k_10_pace       53075 non-null  object 
 13  k_10_speed      53075 non-null  object 
 14  k_15_time       53075 non-null  object 
 15  k_15_pace       53075 non-null  object 
 16  k_15_speed      53075 non-null  object 
 17  k_20_time       53075 non-null 

In [94]:
df_ldn_full.head()

Unnamed: 0,run_no,age_cat,gender,half,finish,idp,race_state,last_split,k_5_time,k_5_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,321.0,18-39,M,02:47:24,06:27:20,T8C2O3HQ3FA1C0,Finished,Finish,00:33:23,06:41,...,5.40,05:19:41,11:33,5.20,06:08:58,09:52,6.09,06:27:20,08:23,7.17
1,1001.0,18-39,M,01:15:02,02:32:13,T8C2O3HQ3FC6E5,Finished,Finish,00:17:40,03:32,...,16.90,02:04:44,03:37,16.62,02:23:30,03:46,15.99,02:32:13,03:59,15.11
2,1002.0,18-39,M,01:17:43,,T8C2O3HQ3F30E8,Started,25K,00:18:09,03:38,...,-,-,-,-,-,-,-,-,-,-
3,1003.0,18-39,W,01:24:29,02:55:06,T8C2O3HQ3FCCCB,Finished,Finish,00:19:35,03:55,...,14.25,02:23:05,04:19,13.91,02:45:39,04:31,13.29,02:55:06,04:19,13.94
4,1004.0,18-39,M,01:14:17,02:39:36,T8C2O3HQ3F68E1,Finished,Finish,00:17:01,03:25,...,14.93,02:09:40,04:08,14.53,02:30:43,04:13,14.25,02:39:36,04:03,14.83


In [61]:
df_ldn_full.to_csv(ldn_data_path+f"/{LDN_NAME}2023_full.csv", index=False)

In [95]:
# Delete dataframe to avoid accidentally overwriting the CSV file in next sections. 
# Use as seen fit.
del df_ldn_full, df_ldn_res, df_ldn_splits, ldn_data_path

## Hamburg

In [3]:
HAM_NAME = "Hamburg"

# Hamburg URLs Templates.
# {0}: Year || {1}: Page Number || {2}: sex || {3}: Number of results per page.
HAMBURG_MARATHON_URL: str = "https://hamburg.r.mikatiming.com/{0}/?pid=search&page={1}&sex={2}&num_results={3}&event=HML&event_main_group=custom.meeting.marathon"
# {0}: Year || {1}: runner id
HAMBURG_MARATHON_SPLIT_URL: str = "https://hamburg.r.mikatiming.com/{0}/?content=detail&fpid=search&pid=search&idp={1}&lang=EN&event=HML&event_main_group=custom.meeting.marathon"


HAM_NUM_RESULTS: str = "500"

HAM_RES_FIELDS: list[str] = ["run_no", "age_cat", "gender", "finish", "idp"]
HAM_SPLITS_FIELDS: list[str] = ["idp", "k_5", "k_10", "k_15", "k_20", "k_half", "k_25", "k_30", "k_35", "k_40", "k_finish"]

In [4]:
# Initialising london marathon object. 
hamburg = HamburgMarathon(url_template=HAMBURG_MARATHON_URL, split_url_template=HAMBURG_MARATHON_SPLIT_URL)

### 2013

In [21]:
# Where the csv data will be stored.
ham_data_path = f"Marathons_Data/Raw/Hamburg/Hamburg{YEAR_13}"

#### Results Pages

In [22]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ham_pages_urls, ham_res_settings = hamburg.gen_res_scrap_info(YEAR_13, HAM_NUM_RESULTS, HAM_RES_FIELDS, 
                                                              ham_data_path, show_settings=True)

Men Pages: 24 || Women Pages: 7
Hamburg 2013 total results pages: 31
Example URLs: 
 https://hamburg.r.mikatiming.com/2013/?pid=search&page=1&sex=M&num_results=500&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2013/?pid=search&page=1&sex=W&num_results=500&event=HML&event_main_group=custom.meeting.marathon
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2013/Hamburg2013_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1317, urls=ham_pages_urls, settings=ham_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 8d744df1e923c178


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (15135 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2013/Hamburg2013_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 10059,
 'downloader/request_count': 31,
 'downloader/request_method_count/GET': 31,
 'downloader/response_bytes': 1366948,
 'downloader/response_count': 31,
 'downloader/response_status_count/200': 31,
 'elapsed_time_seconds': 3.941797,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 18, 38, 376564),
 'httpcompression/response_bytes': 17376117,
 'httpcompression/response_count': 31,
 'item_scraped_count': 15135,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 31,
 'scheduler/dequeued': 31,
 'scheduler/dequeued/memory': 31,
 'scheduler/enqueued': 31,
 'scheduler/enqueued/memory': 31,
 's

In [13]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2013_res.csv")

In [16]:
df_ham_res.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp
0,34,,M,,000017074FFEDA000021B7B6
1,35,,M,,000017074FFEDA000021B7B9
2,4393,M/W 45 1964-1968,M,04:21:09,999999074FFED800001FF520
3,10327,M/W 50 1959-1963,M,,999999074FFED600001D8A74
4,1226,M/W 45 1964-1968,M,02:50:11,999999074FFED8000020A0B0


In [17]:
df_ham_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15135 entries, 0 to 15134
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   15135 non-null  object
 1   age_cat  15133 non-null  object
 2   gender   15135 non-null  object
 3   finish   11460 non-null  object
 4   idp      15135 non-null  object
dtypes: object(5)
memory usage: 591.3+ KB


#### Splits Pages

In [24]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2013_res.csv")
ham_splits_urls, ham_splits_settings = hamburg.gen_splits_scrap_info(YEAR_13, df_ham_res["idp"].to_list(), HAM_SPLITS_FIELDS, 
                                                                     ham_data_path, show_settings=True)

Hamburg 2013 total splits pages: 15135
Example URLs: 
 https://hamburg.r.mikatiming.com/2013/?content=detail&fpid=search&pid=search&idp=999999074FFED800001EF38C&lang=EN&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2013/?content=detail&fpid=search&pid=search&idp=999999074FFED800001F4580&lang=EN&event=HML&event_main_group=custom.meeting.marathon
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2013/Hamburg2013_splits.csv': {'format': 'csv', 'fields': ['idp', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1317, urls=ham_splits_urls, settings=ham_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: f52731dc641d36b7


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (100 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2013/Hamburg2013_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 35900,
 'downloader/request_count': 100,
 'downloader/request_method_count/GET': 100,
 'downloader/response_bytes': 565307,
 'downloader/response_count': 100,
 'downloader/response_status_count/200': 100,
 'elapsed_time_seconds': 0.917563,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 18, 51, 518814),
 'httpcompression/response_bytes': 1803933,
 'httpcompression/response_count': 100,
 'item_scraped_count': 100,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 100,
 'scheduler/dequeued': 100,
 'scheduler/dequeued/memory': 100,
 'scheduler/enqueued': 100,
 'scheduler/enqueued/memory': 1

In [21]:
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2013_splits.csv")

In [22]:
df_ham_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15135 entries, 0 to 15134
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idp       15135 non-null  object
 1   k_5       15125 non-null  object
 2   k_10      15125 non-null  object
 3   k_15      15125 non-null  object
 4   k_20      15125 non-null  object
 5   k_half    15125 non-null  object
 6   k_25      15125 non-null  object
 7   k_30      15125 non-null  object
 8   k_35      15125 non-null  object
 9   k_40      15125 non-null  object
 10  k_finish  15125 non-null  object
dtypes: object(11)
memory usage: 1.3+ MB


#### Full Raw Dataset for Hamburg 2013

In [8]:
df_ham_res    = pd.read_csv(ham_data_path+f"/{HAM_NAME}2013_res.csv")
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2013_splits.csv")
df_ham_full   = pd.merge(df_ham_res, df_ham_splits, on="idp")

In [9]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15135 entries, 0 to 15134
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   run_no    15135 non-null  object
 1   age_cat   15133 non-null  object
 2   gender    15135 non-null  object
 3   finish    11460 non-null  object
 4   idp       15135 non-null  object
 5   k_5       15125 non-null  object
 6   k_10      15125 non-null  object
 7   k_15      15125 non-null  object
 8   k_20      15125 non-null  object
 9   k_half    15125 non-null  object
 10  k_25      15125 non-null  object
 11  k_30      15125 non-null  object
 12  k_35      15125 non-null  object
 13  k_40      15125 non-null  object
 14  k_finish  15125 non-null  object
dtypes: object(15)
memory usage: 1.7+ MB


In [10]:
df_ham_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_ham_full = expand_splits(df_ham_full)
# Drop the splits Lists.
df_ham_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [11]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15135 entries, 0 to 15134
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          15135 non-null  object
 1   age_cat         15133 non-null  object
 2   gender          15135 non-null  object
 3   finish          11460 non-null  object
 4   idp             15135 non-null  object
 5   k_5_time        15125 non-null  object
 6   k_5_pace        15125 non-null  object
 7   k_5_speed       15125 non-null  object
 8   k_10_time       15125 non-null  object
 9   k_10_pace       15125 non-null  object
 10  k_10_speed      15125 non-null  object
 11  k_15_time       15125 non-null  object
 12  k_15_pace       15125 non-null  object
 13  k_15_speed      15125 non-null  object
 14  k_20_time       15125 non-null  object
 15  k_20_pace       15125 non-null  object
 16  k_20_speed      15125 non-null  object
 17  k_half_time     15125 non-null  object
 18  k_half

In [12]:
df_ham_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,k_5_time,k_5_pace,k_5_speed,k_10_time,k_10_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,MH/WH 1984-1995,M,02:05:30,000017074FFEDA000021B430,00:14:57,03:00,20.07,00:29:51,02:59,...,20.11,01:44:16,02:55,20.67,01:58:59,02:57,20.39,02:05:30,02:59,20.21
1,10,M/W 35 1974-1978,M,02:16:33,000017074FFEDA000021B48D,00:15:43,03:09,19.09,00:31:31,03:10,...,18.35,01:52:35,03:17,18.35,02:09:16,03:21,17.98,02:16:33,03:20,18.08
2,100,M/W 30 1979-1983,M,,000017074FFEDA000021B56D,00:14:56,03:00,20.09,00:29:51,02:59,...,-,-,-,-,-,-,-,-,-,-
3,10000,M/W 50 1959-1963,M,04:21:34,999999074FFED600001DC00C,00:28:11,05:39,10.64,00:55:58,05:34,...,9.47,03:30:08,06:52,8.75,04:06:23,07:15,8.28,04:21:34,06:56,8.67
4,10001,M/W 30 1979-1983,M,04:05:08,999999074FFED800001F9B5C,00:29:15,05:51,10.26,00:58:18,05:49,...,10.77,03:19:23,05:56,10.14,03:51:17,06:23,9.40,04:05:08,06:19,9.51


In [28]:
df_ham_full.to_csv(ham_data_path+f"/{HAM_NAME}2013_full.csv", index=False)

In [13]:
# Delete dataframe to avoid accidentally overwriting the CSV file in next sections.
# Use as seen fit.
del df_ham_full, df_ham_res, df_ham_splits, ham_data_path

### 2014

In [14]:
ham_data_path = f"Marathons_Data/Raw/Hamburg/Hamburg{YEAR_14}"

#### Results Pages

In [15]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ham_pages_urls, ham_res_settings = hamburg.gen_res_scrap_info(YEAR_14, HAM_NUM_RESULTS, HAM_RES_FIELDS, 
                                                              ham_data_path, show_settings=True)

Men Pages: 26 || Women Pages: 8
Hamburg 2014 total results pages: 34
Example URLs: 
 https://hamburg.r.mikatiming.com/2014/?pid=search&page=1&sex=M&num_results=500&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2014/?pid=search&page=1&sex=W&num_results=500&event=HML&event_main_group=custom.meeting.marathon
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2014/Hamburg2014_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1317, urls=ham_pages_urls, settings=ham_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: d7e65d00aaa8e1e9


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (16695 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2014/Hamburg2014_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 10693,
 'downloader/request_count': 34,
 'downloader/request_method_count/GET': 34,
 'downloader/response_bytes': 1491463,
 'downloader/response_count': 34,
 'downloader/response_status_count/200': 34,
 'elapsed_time_seconds': 4.742699,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 14, 14, 51, 44, 528264),
 'httpcompression/response_bytes': 18978711,
 'httpcompression/response_count': 34,
 'item_scraped_count': 16695,
 'log_count/INFO': 12,
 'memusage/max': 283951104,
 'memusage/startup': 283951104,
 'response_received_count': 34,
 'scheduler/dequeued': 34,
 'scheduler/dequeued/memory': 34,
 'scheduler/enqueued': 34,
 'scheduler/enqueued/memory': 34,
 's

In [34]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2014_res.csv")

In [35]:
df_ham_res.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp
0,1002,MH/WH 1985-1994,M,02:23:48,999999074FFEDE0000234FE9
1,14608,M/W 35 1975-1979,M,03:35:52,999999074FFEDE00002365A9
2,7271,MH/WH 1985-1994,M,,999999074FFEDE000023499A
3,14842,M/W 50 1960-1964,M,03:52:59,000017074FFEE0000023B67B
4,14033,M/W 65 1945-1949,M,,999999074FFED9000022A2C0


In [36]:
df_ham_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16695 entries, 0 to 16694
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   16695 non-null  object
 1   age_cat  16692 non-null  object
 2   gender   16695 non-null  object
 3   finish   12879 non-null  object
 4   idp      16695 non-null  object
dtypes: object(5)
memory usage: 652.3+ KB


#### Splits Pages

In [16]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2014_res.csv")
ham_splits_urls, ham_splits_settings = hamburg.gen_splits_scrap_info(YEAR_14, df_ham_res["idp"].to_list(), HAM_SPLITS_FIELDS, 
                                                                     ham_data_path, show_settings=True)

Hamburg 2014 total splits pages: 16695
Example URLs: 
 https://hamburg.r.mikatiming.com/2014/?content=detail&fpid=search&pid=search&idp=999999074FFEDE0000234FE9&lang=EN&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2014/?content=detail&fpid=search&pid=search&idp=999999074FFEDC000022B75C&lang=EN&event=HML&event_main_group=custom.meeting.marathon
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2014/Hamburg2014_splits.csv': {'format': 'csv', 'fields': ['idp', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1317, urls=ham_splits_urls, settings=ham_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 6963fe51491ca20b


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9099 pages (at 9099 pages/min), scraped 9075 items (at 9075 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (16695 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2014/Hamburg2014_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 5993505,
 'downloader/request_count': 16695,
 'downloader/request_method_count/GET': 16695,
 'downloader/response_bytes': 94282336,
 'downloader/response_count': 16695,
 'downloader/response_status_count/200': 16695,
 'elapsed_time_seconds': 109.470714,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 14, 14, 54, 3, 578974),
 'httpcompression/response_bytes': 301075434,
 'httpcompression/response_count': 16695,
 'item_scraped_count': 16695,
 'log_count/INFO': 13,
 'memusage/max': 366166016,
 'memusage/startup': 366166016,
 'response_received_count': 16695,
 'scheduler/de

In [39]:
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2014_splits.csv")

In [40]:
df_ham_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16695 entries, 0 to 16694
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idp       16695 non-null  object
 1   k_5       16685 non-null  object
 2   k_10      16685 non-null  object
 3   k_15      16685 non-null  object
 4   k_20      16685 non-null  object
 5   k_half    16685 non-null  object
 6   k_25      16685 non-null  object
 7   k_30      16685 non-null  object
 8   k_35      16685 non-null  object
 9   k_40      16685 non-null  object
 10  k_finish  16685 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


#### Full Raw Dataset for Hamburg 2014

In [41]:
df_ham_res    = pd.read_csv(ham_data_path+f"/{HAM_NAME}2014_res.csv")
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2014_splits.csv")
df_ham_full   = pd.merge(df_ham_res, df_ham_splits, on="idp")

In [42]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16695 entries, 0 to 16694
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   run_no    16695 non-null  object
 1   age_cat   16692 non-null  object
 2   gender    16695 non-null  object
 3   finish    12879 non-null  object
 4   idp       16695 non-null  object
 5   k_5       16685 non-null  object
 6   k_10      16685 non-null  object
 7   k_15      16685 non-null  object
 8   k_20      16685 non-null  object
 9   k_half    16685 non-null  object
 10  k_25      16685 non-null  object
 11  k_30      16685 non-null  object
 12  k_35      16685 non-null  object
 13  k_40      16685 non-null  object
 14  k_finish  16685 non-null  object
dtypes: object(15)
memory usage: 1.9+ MB


In [43]:
df_ham_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_ham_full = expand_splits(df_ham_full)
# Drop the splits Lists.
df_ham_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [44]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16695 entries, 0 to 16694
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          16695 non-null  object
 1   age_cat         16692 non-null  object
 2   gender          16695 non-null  object
 3   finish          12879 non-null  object
 4   idp             16695 non-null  object
 5   k_5_time        16685 non-null  object
 6   k_5_pace        16685 non-null  object
 7   k_5_speed       16685 non-null  object
 8   k_10_time       16685 non-null  object
 9   k_10_pace       16685 non-null  object
 10  k_10_speed      16685 non-null  object
 11  k_15_time       16685 non-null  object
 12  k_15_pace       16685 non-null  object
 13  k_15_speed      16685 non-null  object
 14  k_20_time       16685 non-null  object
 15  k_20_pace       16685 non-null  object
 16  k_20_speed      16685 non-null  object
 17  k_half_time     16685 non-null  object
 18  k_half

In [45]:
df_ham_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,k_5_time,k_5_pace,k_5_speed,k_10_time,k_10_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,MH/WH 1985-1994,M,02:07:00,000017074FFEDE0000239E43,00:15:10,03:02,19.78,00:30:02,02:59,...,19.69,01:45:06,02:55,20.57,02:00:03,03:00,20.07,02:07:00,03:10,18.95
1,10,MH/WH 1985-1994,M,02:12:33,000017074FFEDE0000239E4C,00:15:11,03:03,19.76,00:30:06,02:59,...,19.72,01:45:31,03:00,20.02,02:03:08,03:32,17.03,02:12:33,04:18,13.99
2,10000,MH/WH 1985-1994,M,,999999074FFEDE0000234C33,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,10001,M/W 55 1955-1959,M,04:18:58,999999074FFEDE0000235DE7,00:27:12,05:27,11.03,00:55:15,05:37,...,9.80,03:23:01,06:48,8.84,04:00:34,07:31,7.99,04:18:58,08:23,7.16
4,10002,M/W 45 1965-1969,M,04:00:32,999999074FFED90000220A45,00:28:25,05:41,10.56,00:56:42,05:40,...,10.48,03:19:12,05:45,10.43,03:48:16,05:49,10.32,04:00:32,05:36,10.74


In [46]:
df_ham_full.to_csv(ham_data_path+f"/{HAM_NAME}2014_full.csv", index=False)

In [47]:
del df_ham_full, df_ham_res, df_ham_splits, ham_data_path

### 2015

In [20]:
ham_data_path = f"Marathons_Data/Raw/Hamburg/Hamburg{YEAR_15}"

#### Races Pages

In [21]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ham_pages_urls, ham_res_settings = hamburg.gen_res_scrap_info(YEAR_15, HAM_NUM_RESULTS, HAM_RES_FIELDS, 
                                                              ham_data_path, show_settings=True)

Men Pages: 30 || Women Pages: 9
Hamburg 2015 total results pages: 39
Example URLs: 
 https://hamburg.r.mikatiming.com/2015/?pid=search&page=1&sex=M&num_results=500&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2015/?pid=search&page=1&sex=W&num_results=500&event=HML&event_main_group=custom.meeting.marathon
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2015/Hamburg2015_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1317, urls=ham_pages_urls, settings=ham_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 3276f3c632a94a64


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (19205 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2015/Hamburg2015_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 12267,
 'downloader/request_count': 39,
 'downloader/request_method_count/GET': 39,
 'downloader/response_bytes': 1730731,
 'downloader/response_count': 39,
 'downloader/response_status_count/200': 39,
 'elapsed_time_seconds': 5.831912,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 14, 14, 56, 50, 226060),
 'httpcompression/response_bytes': 21856701,
 'httpcompression/response_count': 39,
 'item_scraped_count': 19205,
 'log_count/INFO': 12,
 'memusage/max': 366166016,
 'memusage/startup': 366166016,
 'response_received_count': 39,
 'scheduler/dequeued': 39,
 'scheduler/dequeued/memory': 39,
 'scheduler/enqueued': 39,
 'scheduler/enqueued/memory': 39,
 's

In [53]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2015_res.csv")

In [54]:
df_ham_res.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp
0,15974,M/W 65 1946-1950,M,,999999074FFEDF000023C250
1,10576,M/W 50 1961-1965,M,,999999074FFEE100002562BF
2,8042,M/W 50 1961-1965,M,03:56:00,999999074FFEE100002502B4
3,14749,M/W 35 1976-1980,M,04:17:50,999999074FFEE1000024DA7F
4,15685,M/W 55 1956-1960,M,04:34:33,999999074FFEE10000255C15


In [55]:
df_ham_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19205 entries, 0 to 19204
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   19205 non-null  object
 1   age_cat  19203 non-null  object
 2   gender   19205 non-null  object
 3   finish   14765 non-null  object
 4   idp      19205 non-null  object
dtypes: object(5)
memory usage: 750.3+ KB


#### Splits Pages

In [22]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2015_res.csv")
ham_splits_urls, ham_splits_settings = hamburg.gen_splits_scrap_info(YEAR_15, df_ham_res["idp"].to_list(), HAM_SPLITS_FIELDS, 
                                                                     ham_data_path, show_settings=True)

Hamburg 2015 total splits pages: 19205
Example URLs: 
 https://hamburg.r.mikatiming.com/2015/?content=detail&fpid=search&pid=search&idp=999999074FFEDF000023C250&lang=EN&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2015/?content=detail&fpid=search&pid=search&idp=999999074FFEE10000246911&lang=EN&event=HML&event_main_group=custom.meeting.marathon
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2015/Hamburg2015_splits.csv': {'format': 'csv', 'fields': ['idp', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1317, urls=ham_splits_urls, settings=ham_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: be7ecc115b11cdad
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 10028 pages (at 10028 pages/min), scraped 10007 items (at 10007 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (19205 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2015/Hamburg2015_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 6894595,
 'downloader/request_count': 19205,
 'downloader/request_method_count/GET': 19205,
 'downloader/response_bytes': 108236583,
 'downloader/response_count': 19205,
 'downloader/response_status_count/200': 19205,
 'elapsed_time_seconds': 114.87762,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 14, 14, 59, 8, 708776),
 'httpcompression/response_bytes': 346165556,
 'httpcompression/response_count': 19205,
 'item_scraped_count': 19205,
 'log_count/INFO': 13,
 'memusage/max': 413040640,
 'memusage/startup': 413040640,
 'response_received_count': 19205,
 'schedule

In [58]:
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2015_splits.csv")

In [59]:
df_ham_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19205 entries, 0 to 19204
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idp       19205 non-null  object
 1   k_5       19193 non-null  object
 2   k_10      19193 non-null  object
 3   k_15      19193 non-null  object
 4   k_20      19193 non-null  object
 5   k_half    19193 non-null  object
 6   k_25      19193 non-null  object
 7   k_30      19193 non-null  object
 8   k_35      19193 non-null  object
 9   k_40      19193 non-null  object
 10  k_finish  19193 non-null  object
dtypes: object(11)
memory usage: 1.6+ MB


#### Full Raw Dataset for Hamburg 2015

In [60]:
df_ham_res    = pd.read_csv(ham_data_path+f"/{HAM_NAME}2015_res.csv")
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2015_splits.csv")
df_ham_full   = pd.merge(df_ham_res, df_ham_splits, on="idp")

In [61]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19205 entries, 0 to 19204
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   run_no    19205 non-null  object
 1   age_cat   19203 non-null  object
 2   gender    19205 non-null  object
 3   finish    14765 non-null  object
 4   idp       19205 non-null  object
 5   k_5       19193 non-null  object
 6   k_10      19193 non-null  object
 7   k_15      19193 non-null  object
 8   k_20      19193 non-null  object
 9   k_half    19193 non-null  object
 10  k_25      19193 non-null  object
 11  k_30      19193 non-null  object
 12  k_35      19193 non-null  object
 13  k_40      19193 non-null  object
 14  k_finish  19193 non-null  object
dtypes: object(15)
memory usage: 2.2+ MB


In [62]:
df_ham_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_ham_full = expand_splits(df_ham_full)
# Drop the splits Lists.
df_ham_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [63]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19205 entries, 0 to 19204
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          19205 non-null  object
 1   age_cat         19203 non-null  object
 2   gender          19205 non-null  object
 3   finish          14765 non-null  object
 4   idp             19205 non-null  object
 5   k_5_time        19193 non-null  object
 6   k_5_pace        19193 non-null  object
 7   k_5_speed       19193 non-null  object
 8   k_10_time       19193 non-null  object
 9   k_10_pace       19193 non-null  object
 10  k_10_speed      19193 non-null  object
 11  k_15_time       19193 non-null  object
 12  k_15_pace       19193 non-null  object
 13  k_15_speed      19193 non-null  object
 14  k_20_time       19193 non-null  object
 15  k_20_pace       19193 non-null  object
 16  k_20_speed      19193 non-null  object
 17  k_half_time     19193 non-null  object
 18  k_half

In [64]:
df_ham_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,k_5_time,k_5_pace,k_5_speed,k_10_time,k_10_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M/W 30 1981-1985,M,,000000074FFEE4000025747D,00:15:07,03:02,19.85,00:30:07,03:00,...,-,-,-,-,-,-,-,-,-,-
1,10,M/W 20 1986-1995,M,02:09:34,000000074FFEE40000257486,00:15:08,03:02,19.82,00:30:08,03:00,...,19.87,01:45:51,03:02,19.85,02:01:52,03:13,18.73,02:09:34,03:31,17.10
2,10000,M/W 75 1936-1940,M,05:13:58,000017074FFEDF000023D30D,00:32:43,06:33,9.17,01:07:32,06:58,...,7.95,04:17:36,08:50,6.80,04:56:39,07:49,7.68,05:13:58,07:54,7.61
3,10001,M/W 40 1971-1975,M,03:39:18,999999074FFEE1000024D009,00:27:07,05:26,11.06,00:53:22,05:15,...,11.52,03:02:16,05:01,11.97,03:28:00,05:09,11.66,03:39:18,05:09,11.65
4,10002,M/W 45 1966-1970,M,03:47:28,999999074FFEE1000025533C,00:27:41,05:33,10.84,00:54:20,05:20,...,11.12,03:08:41,05:27,11.02,03:35:46,05:25,11.08,03:47:28,05:20,11.26


In [65]:
df_ham_full.to_csv(ham_data_path+f"/{HAM_NAME}2015_full.csv", index=False)

In [66]:
del df_ham_full, df_ham_res, df_ham_splits, ham_data_path

### 2016

In [23]:
ham_data_path = f"Marathons_Data/Raw/Hamburg/Hamburg{YEAR_16}"

#### Results Pages

In [14]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ham_pages_urls, ham_res_settings = hamburg.gen_res_scrap_info(YEAR_16, HAM_NUM_RESULTS, HAM_RES_FIELDS, 
                                                              ham_data_path, show_settings=True)

Men Pages: 25 || Women Pages: 8
Hamburg 2016 total results pages: 33
Example URLs: 
 https://hamburg.r.mikatiming.com/2016/?pid=search&page=1&sex=M&num_results=500&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2016/?pid=search&page=1&sex=W&num_results=500&event=HML&event_main_group=custom.meeting.marathon
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2016/Hamburg2016_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1317, urls=ham_pages_urls, settings=ham_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: cf247082f63fceb9


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (16011 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2016/Hamburg2016_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 10378,
 'downloader/request_count': 33,
 'downloader/request_method_count/GET': 33,
 'downloader/response_bytes': 1432483,
 'downloader/response_count': 33,
 'downloader/response_status_count/200': 33,
 'elapsed_time_seconds': 5.696712,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 14, 15, 0, 26, 249977),
 'httpcompression/response_bytes': 18395525,
 'httpcompression/response_count': 33,
 'item_scraped_count': 16011,
 'log_count/INFO': 12,
 'memusage/max': 413040640,
 'memusage/startup': 413040640,
 'response_received_count': 33,
 'scheduler/dequeued': 33,
 'scheduler/dequeued/memory': 33,
 'scheduler/enqueued': 33,
 'scheduler/enqueued/memory': 33,
 'st

In [71]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2016_res.csv")

In [72]:
df_ham_res.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp
0,12404,M/W 50 1962-1966,M,03:56:29,999999074FFEE7000025C772
1,14448,M/W 60 1952-1956,M,04:51:31,999999074FFEEF0000271C65
2,1334,M/W 50 1962-1966,M,02:59:25,999999074FFEEF00002719BE
3,16656,M/W 20 1987-1996,M,02:43:56,000017074FFEF200002772B9
4,16701,M/W 50 1962-1966,M,04:47:58,000017074FFEF20000277076


In [73]:
df_ham_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16011 entries, 0 to 16010
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   16011 non-null  object
 1   age_cat  16008 non-null  object
 2   gender   16011 non-null  object
 3   finish   12104 non-null  object
 4   idp      16011 non-null  object
dtypes: object(5)
memory usage: 625.6+ KB


#### Splits Pages

In [24]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2016_res.csv")
ham_splits_urls, ham_splits_settings = hamburg.gen_splits_scrap_info(YEAR_16, df_ham_res["idp"].to_list(), HAM_SPLITS_FIELDS, 
                                                                     ham_data_path, show_settings=True)

Hamburg 2016 total splits pages: 16011
Example URLs: 
 https://hamburg.r.mikatiming.com/2016/?content=detail&fpid=search&pid=search&idp=999999074FFEE7000025C772&lang=EN&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2016/?content=detail&fpid=search&pid=search&idp=999999074FFEE700002612CA&lang=EN&event=HML&event_main_group=custom.meeting.marathon
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2016/Hamburg2016_splits.csv': {'format': 'csv', 'fields': ['idp', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1317, urls=ham_splits_urls, settings=ham_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 285da57c11fcd601


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9443 pages (at 9443 pages/min), scraped 9415 items (at 9415 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (16011 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2016/Hamburg2016_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 5747949,
 'downloader/request_count': 16011,
 'downloader/request_method_count/GET': 16011,
 'downloader/response_bytes': 115565210,
 'downloader/response_count': 16011,
 'downloader/response_status_count/200': 16011,
 'elapsed_time_seconds': 100.933372,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 14, 15, 2, 23, 574613),
 'httpcompression/response_bytes': 403521387,
 'httpcompression/response_count': 16011,
 'item_scraped_count': 16011,
 'log_count/INFO': 13,
 'memusage/max': 444153856,
 'memusage/startup': 444022784,
 'response_received_count': 16011,
 'scheduler/d

In [76]:
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2016_splits.csv")

In [77]:
df_ham_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16011 entries, 0 to 16010
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idp       16011 non-null  object
 1   k_5       15986 non-null  object
 2   k_10      15986 non-null  object
 3   k_15      15986 non-null  object
 4   k_20      15986 non-null  object
 5   k_half    15986 non-null  object
 6   k_25      15986 non-null  object
 7   k_30      15986 non-null  object
 8   k_35      15986 non-null  object
 9   k_40      15986 non-null  object
 10  k_finish  15986 non-null  object
dtypes: object(11)
memory usage: 1.3+ MB


#### Full Raw Dataset for Hamburg 2016

In [78]:
df_ham_res    = pd.read_csv(ham_data_path+f"/{HAM_NAME}2016_res.csv")
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2016_splits.csv")
df_ham_full   = pd.merge(df_ham_res, df_ham_splits, on="idp")

In [79]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16011 entries, 0 to 16010
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   run_no    16011 non-null  object
 1   age_cat   16008 non-null  object
 2   gender    16011 non-null  object
 3   finish    12104 non-null  object
 4   idp       16011 non-null  object
 5   k_5       15986 non-null  object
 6   k_10      15986 non-null  object
 7   k_15      15986 non-null  object
 8   k_20      15986 non-null  object
 9   k_half    15986 non-null  object
 10  k_25      15986 non-null  object
 11  k_30      15986 non-null  object
 12  k_35      15986 non-null  object
 13  k_40      15986 non-null  object
 14  k_finish  15986 non-null  object
dtypes: object(15)
memory usage: 1.8+ MB


In [80]:
df_ham_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_ham_full = expand_splits(df_ham_full)
# Drop the splits Lists.
df_ham_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [81]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16011 entries, 0 to 16010
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          16011 non-null  object
 1   age_cat         16008 non-null  object
 2   gender          16011 non-null  object
 3   finish          12104 non-null  object
 4   idp             16011 non-null  object
 5   k_5_time        15986 non-null  object
 6   k_5_pace        15986 non-null  object
 7   k_5_speed       15986 non-null  object
 8   k_10_time       15986 non-null  object
 9   k_10_pace       15986 non-null  object
 10  k_10_speed      15986 non-null  object
 11  k_15_time       15986 non-null  object
 12  k_15_pace       15986 non-null  object
 13  k_15_speed      15986 non-null  object
 14  k_20_time       15986 non-null  object
 15  k_20_pace       15986 non-null  object
 16  k_20_speed      15986 non-null  object
 17  k_half_time     15986 non-null  object
 18  k_half

In [82]:
df_ham_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,k_5_time,k_5_pace,k_5_speed,k_10_time,k_10_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M/W 20 1987-1996,M,02:06:58,000017074FFEF20000275211,00:15:00,03:00,20.00,00:29:35,02:55,...,20.04,01:44:32,03:01,19.93,01:59:57,03:05,19.46,02:06:58,03:12,18.77
1,10,M/W 30 1982-1986,M,02:12:21,000017074FFEF2000027522D,00:15:02,03:01,19.96,00:29:35,02:55,...,19.63,01:45:57,03:14,18.58,02:03:39,03:33,16.95,02:12:21,03:58,15.14
2,10001,M/W 50 1962-1966,M,03:45:53,999999074FFEEE000026282D,00:23:54,04:47,12.55,00:47:19,04:41,...,10.96,02:57:13,05:55,10.15,03:30:57,06:45,8.89,03:45:53,06:49,8.82
3,10002,M/W 45 1967-1971,M,03:58:26,999999074FFEEE00002628D7,00:27:30,05:30,10.91,00:55:28,05:36,...,10.40,03:18:00,05:40,10.59,03:46:26,05:42,10.55,03:58:26,05:29,10.97
4,10003,M/W 40 1972-1976,M,,999999074FFEEE0000262A37,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [83]:
df_ham_full.to_csv(ham_data_path+f"/{HAM_NAME}2016_full.csv", index=False)

In [84]:
del df_ham_full, df_ham_res, df_ham_splits, ham_data_path

### 2017

In [25]:
ham_data_path = f"Marathons_Data/Raw/Hamburg/Hamburg{YEAR_17}"

#### Results Pages

In [26]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ham_pages_urls, ham_res_settings = hamburg.gen_res_scrap_info(YEAR_17, HAM_NUM_RESULTS, HAM_RES_FIELDS, 
                                                              ham_data_path, show_settings=True)

Men Pages: 24 || Women Pages: 8
Hamburg 2017 total results pages: 32
Example URLs: 
 https://hamburg.r.mikatiming.com/2017/?pid=search&page=1&sex=M&num_results=500&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2017/?pid=search&page=1&sex=W&num_results=500&event=HML&event_main_group=custom.meeting.marathon
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2017/Hamburg2017_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1317, urls=ham_pages_urls, settings=ham_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: a2654118d93d4f40


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (15638 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2017/Hamburg2017_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 10383,
 'downloader/request_count': 32,
 'downloader/request_method_count/GET': 32,
 'downloader/response_bytes': 1420738,
 'downloader/response_count': 32,
 'downloader/response_status_count/200': 32,
 'elapsed_time_seconds': 4.753375,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 14, 15, 22, 21, 977954),
 'httpcompression/response_bytes': 18117570,
 'httpcompression/response_count': 32,
 'item_scraped_count': 15638,
 'log_count/INFO': 12,
 'memusage/max': 444153856,
 'memusage/startup': 444153856,
 'response_received_count': 32,
 'scheduler/dequeued': 32,
 'scheduler/dequeued/memory': 32,
 'scheduler/enqueued': 32,
 'scheduler/enqueued/memory': 32,
 's

In [109]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2017_res.csv")

In [119]:
df_ham_res.tail()

Unnamed: 0,run_no,age_cat,gender,finish,idp
15633,F6019,M/W 45: 1968-1972,W,04:47:50,999999074FFF03000028E9CD
15634,F6123,M/W 55: 1958-1962,W,05:27:20,999999074FFF030000290719
15635,F6084,MH / WH 1988-1999 (Hauptklasse),W,05:22:31,999999074FFF030000295999
15636,F5066,M/W 35: 1978-1982,W,04:38:26,999999074FFEF7000027FF5D
15637,F5379,MH / WH 1988-1999 (Hauptklasse),W,,999999074FFEF70000278666


In [115]:
df_ham_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15638 entries, 0 to 15637
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   15457 non-null  object
 1   age_cat  15630 non-null  object
 2   gender   15638 non-null  object
 3   finish   11955 non-null  object
 4   idp      15638 non-null  object
dtypes: object(5)
memory usage: 611.0+ KB


#### Splits Pages

In [27]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2017_res.csv")
ham_splits_urls, ham_splits_settings = hamburg.gen_splits_scrap_info(YEAR_17, df_ham_res["idp"].to_list(), HAM_SPLITS_FIELDS, 
                                                                     ham_data_path, show_settings=True)

Hamburg 2017 total splits pages: 15638
Example URLs: 
 https://hamburg.r.mikatiming.com/2017/?content=detail&fpid=search&pid=search&idp=999999074FFEF70000278197&lang=EN&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2017/?content=detail&fpid=search&pid=search&idp=999999074FFEF70000278666&lang=EN&event=HML&event_main_group=custom.meeting.marathon
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2017/Hamburg2017_splits.csv': {'format': 'csv', 'fields': ['idp', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1317, urls=ham_splits_urls, settings=ham_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 4e2951f1ec3cb573


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9282 pages (at 9282 pages/min), scraped 9265 items (at 9265 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (15638 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2017/Hamburg2017_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 5614042,
 'downloader/request_count': 15638,
 'downloader/request_method_count/GET': 15638,
 'downloader/response_bytes': 116884481,
 'downloader/response_count': 15638,
 'downloader/response_status_count/200': 15638,
 'elapsed_time_seconds': 99.589103,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 14, 15, 29, 10, 846146),
 'httpcompression/response_bytes': 410100137,
 'httpcompression/response_count': 15638,
 'item_scraped_count': 15638,
 'log_count/INFO': 13,
 'memusage/max': 444153856,
 'memusage/startup': 444153856,
 'response_received_count': 15638,
 'scheduler/d

In [121]:
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2017_splits.csv")

In [122]:
df_ham_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15638 entries, 0 to 15637
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idp       15638 non-null  object
 1   k_5       15616 non-null  object
 2   k_10      15616 non-null  object
 3   k_15      15616 non-null  object
 4   k_20      15616 non-null  object
 5   k_half    15616 non-null  object
 6   k_25      15616 non-null  object
 7   k_30      15616 non-null  object
 8   k_35      15616 non-null  object
 9   k_40      15616 non-null  object
 10  k_finish  15616 non-null  object
dtypes: object(11)
memory usage: 1.3+ MB


#### Full Raw Dataset for Hamburg 2017

In [123]:
df_ham_res    = pd.read_csv(ham_data_path+f"/{HAM_NAME}2017_res.csv")
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2017_splits.csv")
df_ham_full   = pd.merge(df_ham_res, df_ham_splits, on="idp")

In [124]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15638 entries, 0 to 15637
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   run_no    15457 non-null  object
 1   age_cat   15630 non-null  object
 2   gender    15638 non-null  object
 3   finish    11955 non-null  object
 4   idp       15638 non-null  object
 5   k_5       15616 non-null  object
 6   k_10      15616 non-null  object
 7   k_15      15616 non-null  object
 8   k_20      15616 non-null  object
 9   k_half    15616 non-null  object
 10  k_25      15616 non-null  object
 11  k_30      15616 non-null  object
 12  k_35      15616 non-null  object
 13  k_40      15616 non-null  object
 14  k_finish  15616 non-null  object
dtypes: object(15)
memory usage: 1.8+ MB


In [125]:
df_ham_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_ham_full = expand_splits(df_ham_full)
# Drop the splits Lists.
df_ham_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [126]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15638 entries, 0 to 15637
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          15457 non-null  object
 1   age_cat         15630 non-null  object
 2   gender          15638 non-null  object
 3   finish          11955 non-null  object
 4   idp             15638 non-null  object
 5   k_5_time        15616 non-null  object
 6   k_5_pace        15616 non-null  object
 7   k_5_speed       15616 non-null  object
 8   k_10_time       15616 non-null  object
 9   k_10_pace       15616 non-null  object
 10  k_10_speed      15616 non-null  object
 11  k_15_time       15616 non-null  object
 12  k_15_pace       15616 non-null  object
 13  k_15_speed      15616 non-null  object
 14  k_20_time       15616 non-null  object
 15  k_20_pace       15616 non-null  object
 16  k_20_speed      15616 non-null  object
 17  k_half_time     15616 non-null  object
 18  k_half

In [127]:
df_ham_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,k_5_time,k_5_pace,k_5_speed,k_10_time,k_10_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,MH / WH 1988-1999 (Hauptklasse),M,02:07:31,00001715DBE17500002B8C7B,00:15:03,03:01,19.93,00:30:07,03:01,...,19.50,01:45:45,03:04,19.59,02:00:46,03:01,19.98,02:07:31,03:05,19.51
1,10,MH / WH 1988-1999 (Hauptklasse),M,,00001715DBE17500002B8C84,00:15:04,03:01,19.91,00:30:09,03:01,...,-,-,-,-,-,-,-,-,-,-
2,10001,M/W 45: 1968-1972,M,,999999074FFF020000282F09,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,10002,M/W 45: 1968-1972,M,04:46:46,999999074FFEF7000027E735,00:29:59,06:00,10.01,01:00:05,06:02,...,8.54,03:50:40,08:08,7.38,04:30:00,07:52,7.63,04:46:46,07:39,7.85
4,10003,M/W 55: 1958-1962,M,03:56:23,999999074FFF03000028B165,00:27:14,05:27,11.02,00:54:06,05:23,...,11.05,03:14:08,05:41,10.58,03:43:55,05:58,10.07,03:56:23,05:41,10.56


In [128]:
df_ham_full.to_csv(ham_data_path+f"/{HAM_NAME}2017_full.csv", index=False)

In [129]:
del df_ham_full, df_ham_res, df_ham_splits, ham_data_path

### 2018

In [28]:
ham_data_path = f"Marathons_Data/Raw/Hamburg/Hamburg{YEAR_18}"

#### Results Pages

In [29]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ham_pages_urls, ham_res_settings = hamburg.gen_res_scrap_info(YEAR_18, HAM_NUM_RESULTS, HAM_RES_FIELDS, 
                                                              ham_data_path, show_settings=True)

Men Pages: 22 || Women Pages: 7
Hamburg 2018 total results pages: 29
Example URLs: 
 https://hamburg.r.mikatiming.com/2018/?pid=search&page=1&sex=M&num_results=500&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2018/?pid=search&page=1&sex=W&num_results=500&event=HML&event_main_group=custom.meeting.marathon
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2018/Hamburg2018_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1823, urls=ham_pages_urls, settings=ham_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: fa7fa26e6b487757
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (14010 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2018/Hamburg2018_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 9409,
 'downloader/request_count': 29,
 'downloader/request_method_count/GET': 29,
 'downloader/response_bytes': 1705872,
 'downloader/response_count': 29,
 'downloader/response_status_count/200': 29,
 'elapsed_time_seconds': 14.155349,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 14, 15, 35, 38, 99133),
 'httpcompression/response_bytes': 31830117,
 'httpcompression/response_count': 29,
 'item_scraped_count': 14010,
 'log_count/INFO': 12,
 'memusage/max': 444153856,
 'memusage/startup': 444153856,
 'response_received_count': 29,
 'scheduler/dequeued': 29,
 'scheduler/dequeued/memory': 29,
 'scheduler/enqueued': 29,
 'scheduler/enqueued/memory': 29,
 'st

In [134]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2018_res.csv")

In [137]:
df_ham_res.tail()

Unnamed: 0,run_no,age_cat,gender,finish,idp
14005,F2187,MH / WH 1989-2000 (Hauptklasse),W,03:49:16,99999915DBE17800002C443D
14006,F4298,M / W 50 1964-1968,W,04:48:51,99999915DBE17C00002E1287
14007,F2021,M / W 40 1974-1978,W,03:50:34,99999915DBE17C00002E0C4D
14008,F5295,M / W 45 1969-1973,W,04:44:20,99999915DBE17C00002DFDD2
14009,F5163,M / W 30 1984-1988,W,04:40:05,99999915DBE17C00002E2BC0


In [136]:
df_ham_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14010 entries, 0 to 14009
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   13985 non-null  object
 1   age_cat  14008 non-null  object
 2   gender   14010 non-null  object
 3   finish   10057 non-null  object
 4   idp      14010 non-null  object
dtypes: object(5)
memory usage: 547.4+ KB


#### Splits Pages

In [30]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2018_res.csv")
ham_splits_urls, ham_splits_settings = hamburg.gen_splits_scrap_info(YEAR_18, df_ham_res["idp"].to_list(), HAM_SPLITS_FIELDS, 
                                                                     ham_data_path, show_settings=True)

Hamburg 2018 total splits pages: 14010
Example URLs: 
 https://hamburg.r.mikatiming.com/2018/?content=detail&fpid=search&pid=search&idp=99999915DBE17C00002DF57B&lang=EN&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2018/?content=detail&fpid=search&pid=search&idp=99999915DBE17C00002E2BC0&lang=EN&event=HML&event_main_group=custom.meeting.marathon
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2018/Hamburg2018_splits.csv': {'format': 'csv', 'fields': ['idp', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [139]:
run_spider(hamburg_spiders.Hamburg1823, urls=ham_splits_urls, settings=ham_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 28b32774a6d0451c


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 6935 pages (at 6935 pages/min), scraped 6922 items (at 6922 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (14010 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2018/Hamburg2018_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 5029590,
 'downloader/request_count': 14010,
 'downloader/request_method_count/GET': 14010,
 'downloader/response_bytes': 173315016,
 'downloader/response_count': 14010,
 'downloader/response_status_count/200': 14010,
 'elapsed_time_seconds': 115.309668,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 14, 15, 38, 2, 880906),
 'httpcompression/response_bytes': 788216370,
 'httpcompression/response_count': 14010,
 'item_scraped_count': 14010,
 'log_count/INFO': 13,
 'memusage/max': 444153856,
 'memusage/startup': 444153856,
 'response_received_count': 14010,
 'scheduler/d

In [140]:
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2018_splits.csv")

In [141]:
df_ham_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14010 entries, 0 to 14009
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idp       14010 non-null  object
 1   k_5       13986 non-null  object
 2   k_10      13986 non-null  object
 3   k_15      13986 non-null  object
 4   k_20      13986 non-null  object
 5   k_half    13986 non-null  object
 6   k_25      13986 non-null  object
 7   k_30      13986 non-null  object
 8   k_35      13986 non-null  object
 9   k_40      13986 non-null  object
 10  k_finish  13986 non-null  object
dtypes: object(11)
memory usage: 1.2+ MB


#### Full Raw Dataset for Hamburg 2018

In [142]:
df_ham_res    = pd.read_csv(ham_data_path+f"/{HAM_NAME}2018_res.csv")
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2018_splits.csv")
df_ham_full   = pd.merge(df_ham_res, df_ham_splits, on="idp")

In [144]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14010 entries, 0 to 14009
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   run_no    13985 non-null  object
 1   age_cat   14008 non-null  object
 2   gender    14010 non-null  object
 3   finish    10057 non-null  object
 4   idp       14010 non-null  object
 5   k_5       13986 non-null  object
 6   k_10      13986 non-null  object
 7   k_15      13986 non-null  object
 8   k_20      13986 non-null  object
 9   k_half    13986 non-null  object
 10  k_25      13986 non-null  object
 11  k_30      13986 non-null  object
 12  k_35      13986 non-null  object
 13  k_40      13986 non-null  object
 14  k_finish  13986 non-null  object
dtypes: object(15)
memory usage: 1.6+ MB


In [145]:
df_ham_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_ham_full = expand_splits(df_ham_full)
# Drop the splits Lists.
df_ham_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [146]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14010 entries, 0 to 14009
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          13985 non-null  object
 1   age_cat         14008 non-null  object
 2   gender          14010 non-null  object
 3   finish          10057 non-null  object
 4   idp             14010 non-null  object
 5   k_5_time        13986 non-null  object
 6   k_5_pace        13986 non-null  object
 7   k_5_speed       13986 non-null  object
 8   k_10_time       13986 non-null  object
 9   k_10_pace       13986 non-null  object
 10  k_10_speed      13986 non-null  object
 11  k_15_time       13986 non-null  object
 12  k_15_pace       13986 non-null  object
 13  k_15_speed      13986 non-null  object
 14  k_20_time       13986 non-null  object
 15  k_20_pace       13986 non-null  object
 16  k_20_speed      13986 non-null  object
 17  k_half_time     13986 non-null  object
 18  k_half

In [147]:
df_ham_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,k_5_time,k_5_pace,k_5_speed,k_10_time,k_10_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M / W 30 1984-1988,M,02:11:57,00001715DBE17E00002EC0DE,00:15:09,03:02,19.80,00:30:07,03:00,...,19.31,01:47:11,03:14,18.63,02:04:16,03:25,17.56,02:11:57,03:31,17.14
1,10,MH / WH 1989-2000 (Hauptklasse),M,,00001715DBE17E00002EC0E6,00:15:09,03:02,19.80,00:30:08,03:00,...,-,-,-,-,-,-,-,-,-,-
2,10001,M / W 30 1984-1988,M,,99999915DBE17C00002DFDE2,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,10002,M / W 40 1974-1978,M,04:08:06,99999915DBE17C00002E3561,00:29:56,06:00,10.02,00:59:12,05:52,...,10.38,03:22:48,05:53,10.20,03:54:30,06:21,9.46,04:08:06,06:12,9.68
4,10003,MH / WH 1989-2000 (Hauptklasse),M,,99999915DBE17C00002DF57F,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [148]:
df_ham_full.to_csv(ham_data_path+f"/{HAM_NAME}2018_full.csv", index=False)

In [149]:
del df_ham_full, df_ham_res, df_ham_splits, ham_data_path

### 2019

In [6]:
ham_data_path = f"Marathons_Data/Raw/Hamburg/Hamburg{YEAR_19}"

#### Results Pages

In [27]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ham_pages_urls, ham_res_settings = hamburg.gen_res_scrap_info(YEAR_19, HAM_NUM_RESULTS, HAM_RES_FIELDS, 
                                                              ham_data_path, show_settings=True)

Men Pages: 21 || Women Pages: 7
Hamburg 2019 total results pages: 28
Example URLs: 
 https://hamburg.r.mikatiming.com/2019/?pid=search&page=1&sex=M&num_results=500&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2019/?pid=search&page=1&sex=W&num_results=500&event=HML&event_main_group=custom.meeting.marathon
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2019/Hamburg2019_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1823, urls=ham_pages_urls, settings=ham_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: d3c27791a164dad5


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (13498 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2019/Hamburg2019_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 9084,
 'downloader/request_count': 28,
 'downloader/request_method_count/GET': 28,
 'downloader/response_bytes': 1622522,
 'downloader/response_count': 28,
 'downloader/response_status_count/200': 28,
 'elapsed_time_seconds': 11.211847,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 19, 44, 196145),
 'httpcompression/response_bytes': 30527892,
 'httpcompression/response_count': 28,
 'item_scraped_count': 13498,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 28,
 'scheduler/dequeued': 28,
 'scheduler/dequeued/memory': 28,
 'scheduler/enqueued': 28,
 'scheduler/enqueued/memory': 28,
 's

In [12]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2019_res.csv")

In [13]:
df_ham_res.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp
0,9600,M / W 45 1970-1974,M,04:26:50,99999915DBE17F00002E91EF
1,15467,M / W 30 1985-1989,M,04:32:17,99999915DBE187000030C6D5
2,39,M / W 35 1980-1984,M,02:19:14,00001715DBE18B000030C816
3,34,M / W 30 1985-1989,M,02:13:34,00001715DBE18B000030C811
4,2196,M / W 40 1975-1979,M,03:20:20,99999915DBE18600002FD0FD


In [14]:
df_ham_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13498 entries, 0 to 13497
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   13483 non-null  object
 1   age_cat  13483 non-null  object
 2   gender   13498 non-null  object
 3   finish   10143 non-null  object
 4   idp      13498 non-null  object
dtypes: object(5)
memory usage: 527.4+ KB


#### Splits Pages

In [7]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2019_res.csv")
ham_splits_urls, ham_splits_settings = hamburg.gen_splits_scrap_info(YEAR_19, df_ham_res["idp"].to_list(), HAM_SPLITS_FIELDS, 
                                                                     ham_data_path, show_settings=True)

Hamburg 2019 total splits pages: 13498
Example URLs: 
 https://hamburg.r.mikatiming.com/2019/?content=detail&fpid=search&pid=search&idp=99999915DBE17F00002E91EF&lang=EN&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2019/?content=detail&fpid=search&pid=search&idp=99999915DBE186000030A00D&lang=EN&event=HML&event_main_group=custom.meeting.marathon
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2019/Hamburg2019_splits.csv': {'format': 'csv', 'fields': ['idp', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1823, urls=ham_splits_urls, settings=ham_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 785b00269a0659d4
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 7540 pages (at 7540 pages/min), scraped 7521 items (at 7521 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (13498 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2019/Hamburg2019_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 4845782,
 'downloader/request_count': 13498,
 'downloader/request_method_count/GET': 13498,
 'downloader/response_bytes': 152790160,
 'downloader/response_count': 13498,
 'downloader/response_status_count/200': 13498,
 'elapsed_time_seconds': 107.319624,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 8, 18, 0, 40, 23, 812352),
 'httpcompression/response_bytes': 684869321,
 'httpcompression/response_count': 13498,
 'item_scraped_count': 13498,
 'log_count/INFO': 13,
 'memusage/max': 208076800,
 'memusage/startup': 208076800,
 'response_received_count': 13498,
 'scheduler/d

In [11]:
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2019_splits.csv")

In [12]:
df_ham_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13498 entries, 0 to 13497
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idp       13498 non-null  object
 1   k_5       13457 non-null  object
 2   k_10      13457 non-null  object
 3   k_15      13457 non-null  object
 4   k_20      13457 non-null  object
 5   k_half    13457 non-null  object
 6   k_25      13457 non-null  object
 7   k_30      13457 non-null  object
 8   k_35      13457 non-null  object
 9   k_40      13457 non-null  object
 10  k_finish  13457 non-null  object
dtypes: object(11)
memory usage: 1.1+ MB


#### Full Raw Dataset for Hamburg 2019

In [13]:
df_ham_res    = pd.read_csv(ham_data_path+f"/{HAM_NAME}2019_res.csv")
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2019_splits.csv")
df_ham_full   = pd.merge(df_ham_res, df_ham_splits, on="idp")

In [14]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13498 entries, 0 to 13497
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   run_no    13483 non-null  object
 1   age_cat   13483 non-null  object
 2   gender    13498 non-null  object
 3   finish    10143 non-null  object
 4   idp       13498 non-null  object
 5   k_5       13457 non-null  object
 6   k_10      13457 non-null  object
 7   k_15      13457 non-null  object
 8   k_20      13457 non-null  object
 9   k_half    13457 non-null  object
 10  k_25      13457 non-null  object
 11  k_30      13457 non-null  object
 12  k_35      13457 non-null  object
 13  k_40      13457 non-null  object
 14  k_finish  13457 non-null  object
dtypes: object(15)
memory usage: 1.5+ MB


In [15]:
df_ham_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_ham_full = expand_splits(df_ham_full)
# Drop the splits Lists.
df_ham_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [16]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13498 entries, 0 to 13497
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          13483 non-null  object
 1   age_cat         13483 non-null  object
 2   gender          13498 non-null  object
 3   finish          10143 non-null  object
 4   idp             13498 non-null  object
 5   k_5_time        13457 non-null  object
 6   k_5_pace        13457 non-null  object
 7   k_5_speed       13457 non-null  object
 8   k_10_time       13457 non-null  object
 9   k_10_pace       13457 non-null  object
 10  k_10_speed      13457 non-null  object
 11  k_15_time       13457 non-null  object
 12  k_15_pace       13457 non-null  object
 13  k_15_speed      13457 non-null  object
 14  k_20_time       13457 non-null  object
 15  k_20_pace       13457 non-null  object
 16  k_20_speed      13457 non-null  object
 17  k_half_time     13457 non-null  object
 18  k_half

In [17]:
df_ham_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,k_5_time,k_5_pace,k_5_speed,k_10_time,k_10_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M / W 35 1980-1984,M,,00001715DBE18B000030C7F1,-,-,-,'00:29:54',-,...,-,-,-,-,-,-,-,-,-,-
1,10,MH / WH 1990-2001 (Hauptklasse),M,02:09:59,00001715DBE18B000030C7FA,00:15:10,03:02,19.78,00:29:56,02:58,...,19.33,01:46:16,03:06,19.42,02:02:32,03:16,18.44,02:09:59,03:24,17.68
2,10001,M / W 45 1970-1974,M,,99999915DBE18600002FE047,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,10002,M / W 65 1950-1954,M,04:13:13,99999915DBE18600002FD981,00:29:16,05:52,10.25,00:58:09,05:47,...,10.14,03:26:37,06:09,9.77,03:58:57,06:28,9.28,04:13:13,06:30,9.23
4,10003,M / W 50 1965-1969,M,03:59:02,99999915DBE186000030B048,00:29:11,05:51,10.28,00:56:16,05:25,...,10.32,03:18:54,05:39,10.63,03:46:20,05:30,10.94,03:59:02,05:48,10.37


In [18]:
df_ham_full.to_csv(ham_data_path+f"/{HAM_NAME}2019_full.csv", index=False)

In [19]:
del df_ham_full, df_ham_res, df_ham_splits, ham_data_path

### 2022

In [20]:
ham_data_path = f"Marathons_Data/Raw/Hamburg/Hamburg{YEAR_22}"

#### Results Pages

In [21]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ham_pages_urls, ham_res_settings = hamburg.gen_res_scrap_info(YEAR_22, HAM_NUM_RESULTS, HAM_RES_FIELDS, 
                                                              ham_data_path, show_settings=True)

Men Pages: 17 || Women Pages: 5
Hamburg 2022 total results pages: 22
Example URLs: 
 https://hamburg.r.mikatiming.com/2022/?pid=search&page=1&sex=M&num_results=500&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2022/?pid=search&page=1&sex=W&num_results=500&event=HML&event_main_group=custom.meeting.marathon
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2022/Hamburg2022_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1823, urls=ham_pages_urls, settings=ham_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: e4504f0e0cd9939e
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (10416 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2022/Hamburg2022_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 7136,
 'downloader/request_count': 22,
 'downloader/request_method_count/GET': 22,
 'downloader/response_bytes': 1146480,
 'downloader/response_count': 22,
 'downloader/response_status_count/200': 22,
 'elapsed_time_seconds': 11.079294,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 8, 18, 0, 42, 22, 529896),
 'httpcompression/response_bytes': 20575301,
 'httpcompression/response_count': 22,
 'item_scraped_count': 10416,
 'log_count/INFO': 12,
 'memusage/max': 243810304,
 'memusage/startup': 243810304,
 'response_received_count': 22,
 'scheduler/dequeued': 22,
 'scheduler/dequeued/memory': 22,
 'scheduler/enqueued': 22,
 'scheduler/enqueued/memory': 22,
 'st

In [23]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2022_res.csv")

In [24]:
df_ham_res.tail()

Unnamed: 0,run_no,age_cat,gender,finish,idp
10411,F3394,MH/WH 1993-2004 (Hauptklasse),W,04:32:15,HCHSK2IQ3550F1
10412,F5263,M/W 55 1963-1967,W,05:12:22,HCHSK2IQ3550F3
10413,F4329,M/W 40 1978-1982,W,,99999915DBE18F000034E8DD
10414,F4376,M/W 40 1978-1982,W,04:30:22,99999915DBE18F000034405F
10415,F4097,M/W 55 1963-1967,W,04:42:56,HCHSK2IQ35442F


In [25]:
df_ham_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10416 entries, 0 to 10415
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   10415 non-null  object
 1   age_cat  10362 non-null  object
 2   gender   10416 non-null  object
 3   finish   6651 non-null   object
 4   idp      10416 non-null  object
dtypes: object(5)
memory usage: 407.0+ KB


#### Splits Pages

In [26]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2022_res.csv")
ham_splits_urls, ham_splits_settings = hamburg.gen_splits_scrap_info(YEAR_22, df_ham_res["idp"].to_list(), HAM_SPLITS_FIELDS, 
                                                                     ham_data_path, show_settings=True)

Hamburg 2022 total splits pages: 10416
Example URLs: 
 https://hamburg.r.mikatiming.com/2022/?content=detail&fpid=search&pid=search&idp=99999915DBE18F000034D99F&lang=EN&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2022/?content=detail&fpid=search&pid=search&idp=HCHSK2IQ35442F&lang=EN&event=HML&event_main_group=custom.meeting.marathon
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2022/Hamburg2022_splits.csv': {'format': 'csv', 'fields': ['idp', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1823, urls=ham_splits_urls, settings=ham_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 4ee3991b80cc175f
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 8062 pages (at 8062 pages/min), scraped 8047 items (at 8047 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (10416 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2022/Hamburg2022_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 3664934,
 'downloader/request_count': 10416,
 'downloader/request_method_count/GET': 10416,
 'downloader/response_bytes': 83984358,
 'downloader/response_count': 10416,
 'downloader/response_status_count/200': 10416,
 'elapsed_time_seconds': 78.240766,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 8, 18, 0, 44, 22, 622566),
 'httpcompression/response_bytes': 299744696,
 'httpcompression/response_count': 10416,
 'item_scraped_count': 10416,
 'log_count/INFO': 13,
 'memusage/max': 454606848,
 'memusage/startup': 454606848,
 'response_received_count': 10416,
 'scheduler/deq

In [28]:
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2022_splits.csv")

In [29]:
df_ham_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10416 entries, 0 to 10415
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idp       10416 non-null  object
 1   k_5       10397 non-null  object
 2   k_10      10397 non-null  object
 3   k_15      10397 non-null  object
 4   k_20      10397 non-null  object
 5   k_half    10397 non-null  object
 6   k_25      10397 non-null  object
 7   k_30      10397 non-null  object
 8   k_35      10397 non-null  object
 9   k_40      10397 non-null  object
 10  k_finish  10397 non-null  object
dtypes: object(11)
memory usage: 895.2+ KB


#### Full Raw Dataset for Hamburg 2022

In [30]:
df_ham_res    = pd.read_csv(ham_data_path+f"/{HAM_NAME}2022_res.csv")
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2022_splits.csv")
df_ham_full   = pd.merge(df_ham_res, df_ham_splits, on="idp")

In [31]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10416 entries, 0 to 10415
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   run_no    10415 non-null  object
 1   age_cat   10362 non-null  object
 2   gender    10416 non-null  object
 3   finish    6651 non-null   object
 4   idp       10416 non-null  object
 5   k_5       10397 non-null  object
 6   k_10      10397 non-null  object
 7   k_15      10397 non-null  object
 8   k_20      10397 non-null  object
 9   k_half    10397 non-null  object
 10  k_25      10397 non-null  object
 11  k_30      10397 non-null  object
 12  k_35      10397 non-null  object
 13  k_40      10397 non-null  object
 14  k_finish  10397 non-null  object
dtypes: object(15)
memory usage: 1.2+ MB


In [32]:
df_ham_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_ham_full = expand_splits(df_ham_full)
# Drop the splits Lists.
df_ham_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [33]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10416 entries, 0 to 10415
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          10415 non-null  object
 1   age_cat         10362 non-null  object
 2   gender          10416 non-null  object
 3   finish          6651 non-null   object
 4   idp             10416 non-null  object
 5   k_5_time        10397 non-null  object
 6   k_5_pace        10397 non-null  object
 7   k_5_speed       10397 non-null  object
 8   k_10_time       10397 non-null  object
 9   k_10_pace       10397 non-null  object
 10  k_10_speed      10397 non-null  object
 11  k_15_time       10397 non-null  object
 12  k_15_pace       10397 non-null  object
 13  k_15_speed      10397 non-null  object
 14  k_20_time       10397 non-null  object
 15  k_20_pace       10397 non-null  object
 16  k_20_speed      10397 non-null  object
 17  k_half_time     10397 non-null  object
 18  k_half

In [34]:
df_ham_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,k_5_time,k_5_pace,k_5_speed,k_10_time,k_10_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M/W 35 1983-1987,M,02:06:05,HCHSK2IQ36010B,00:14:50,02:58,20.22,00:29:30,02:56,...,20.22,01:43:39,02:55,20.64,01:59:00,03:05,19.54,02:06:05,03:14,18.59
1,10,,M,,HCHSK2IQ360114,00:14:52,02:59,20.18,00:29:33,02:57,...,19.21,-,-,-,-,-,-,-,-,-
2,10001,M/W 30 1988-1992,M,,99999915DBE18F0000347B7B,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,10002,MH/WH 1993-2004 (Hauptklasse),M,,99999915DBE18F000034417D,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
4,10003,M/W 55 1963-1967,M,,99999915DBE18F000034419B,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [35]:
df_ham_full.to_csv(ham_data_path+f"/{HAM_NAME}2022_full.csv", index=False)

In [36]:
del df_ham_full, df_ham_res, df_ham_splits, ham_data_path

### 2023

In [37]:
ham_data_path = f"Marathons_Data/Raw/Hamburg/Hamburg{YEAR_23}"

#### Results Pages

In [38]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
ham_pages_urls, ham_res_settings = hamburg.gen_res_scrap_info(YEAR_23, HAM_NUM_RESULTS, HAM_RES_FIELDS, 
                                                              ham_data_path, show_settings=True)

Men Pages: 18 || Women Pages: 6
Hamburg 2023 total results pages: 24
Example URLs: 
 https://hamburg.r.mikatiming.com/2023/?pid=search&page=1&sex=M&num_results=500&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2023/?pid=search&page=1&sex=W&num_results=500&event=HML&event_main_group=custom.meeting.marathon
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2023/Hamburg2023_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1823, urls=ham_pages_urls, settings=ham_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 724fee20e546fa48


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (11757 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2023/Hamburg2023_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 7785,
 'downloader/request_count': 24,
 'downloader/request_method_count/GET': 24,
 'downloader/response_bytes': 1234892,
 'downloader/response_count': 24,
 'downloader/response_status_count/200': 24,
 'elapsed_time_seconds': 3.520092,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 8, 18, 0, 45, 3, 537562),
 'httpcompression/response_bytes': 22702038,
 'httpcompression/response_count': 24,
 'item_scraped_count': 11757,
 'log_count/INFO': 12,
 'memusage/max': 454606848,
 'memusage/startup': 454606848,
 'response_received_count': 24,
 'scheduler/dequeued': 24,
 'scheduler/dequeued/memory': 24,
 'scheduler/enqueued': 24,
 'scheduler/enqueued/memory': 24,
 'star

In [40]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2023_res.csv")

In [41]:
df_ham_res.tail()

Unnamed: 0,run_no,age_cat,gender,finish,idp
11752,F2031,M/W 45 1974-1978,W,03:57:46,HCHSK2IQ36219A
11753,F5073,M/W 45 1974-1978,W,,HCHSK2IQ361122
11754,F3383,M/W 40 1979-1983,W,03:41:35,HCHSK2IQ3621A5
11755,F5209,M/W 40 1979-1983,W,,HCHSK2IQ3627C6
11756,F2393,M/W 50 1969-1973,W,,HCHSK2IQ3606BB


In [42]:
df_ham_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11757 entries, 0 to 11756
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   11757 non-null  object
 1   age_cat  11753 non-null  object
 2   gender   11757 non-null  object
 3   finish   8688 non-null   object
 4   idp      11757 non-null  object
dtypes: object(5)
memory usage: 459.4+ KB


#### Splits Pages

In [43]:
df_ham_res = pd.read_csv(ham_data_path+f"/{HAM_NAME}2023_res.csv")
ham_splits_urls, ham_splits_settings = hamburg.gen_splits_scrap_info(YEAR_23, df_ham_res["idp"].to_list(), HAM_SPLITS_FIELDS, 
                                                                     ham_data_path, show_settings=True)

Hamburg 2023 total splits pages: 11757
Example URLs: 
 https://hamburg.r.mikatiming.com/2023/?content=detail&fpid=search&pid=search&idp=HCHSK2IQ361E7A&lang=EN&event=HML&event_main_group=custom.meeting.marathon 
 https://hamburg.r.mikatiming.com/2023/?content=detail&fpid=search&pid=search&idp=HCHSK2IQ3606BB&lang=EN&event=HML&event_main_group=custom.meeting.marathon
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2023/Hamburg2023_splits.csv': {'format': 'csv', 'fields': ['idp', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(hamburg_spiders.Hamburg1823, urls=ham_splits_urls, settings=ham_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 1c6003cd45413067
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9068 pages (at 9068 pages/min), scraped 9057 items (at 9057 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (11757 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Hamburg/Hamburg2023/Hamburg2023_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 4103193,
 'downloader/request_count': 11757,
 'downloader/request_method_count/GET': 11757,
 'downloader/response_bytes': 96671670,
 'downloader/response_count': 11757,
 'downloader/response_status_count/200': 11757,
 'elapsed_time_seconds': 78.106209,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 8, 18, 0, 48, 50, 273710),
 'httpcompression/response_bytes': 341990587,
 'httpcompression/response_count': 11757,
 'item_scraped_count': 11757,
 'log_count/INFO': 13,
 'memusage/max': 466550784,
 'memusage/startup': 466550784,
 'response_received_count': 11757,
 'scheduler/deq

In [46]:
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2023_splits.csv")

In [47]:
df_ham_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11757 entries, 0 to 11756
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   idp       11757 non-null  object
 1   k_5       11703 non-null  object
 2   k_10      11703 non-null  object
 3   k_15      11703 non-null  object
 4   k_20      11703 non-null  object
 5   k_half    11703 non-null  object
 6   k_25      11703 non-null  object
 7   k_30      11703 non-null  object
 8   k_35      11703 non-null  object
 9   k_40      11703 non-null  object
 10  k_finish  11703 non-null  object
dtypes: object(11)
memory usage: 1010.5+ KB


#### Full Raw Dataset for Hamburg 2023

In [48]:
df_ham_res    = pd.read_csv(ham_data_path+f"/{HAM_NAME}2023_res.csv")
df_ham_splits = pd.read_csv(ham_data_path+f"/{HAM_NAME}2023_splits.csv")
df_ham_full   = pd.merge(df_ham_res, df_ham_splits, on="idp")

In [49]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11757 entries, 0 to 11756
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   run_no    11757 non-null  object
 1   age_cat   11753 non-null  object
 2   gender    11757 non-null  object
 3   finish    8688 non-null   object
 4   idp       11757 non-null  object
 5   k_5       11703 non-null  object
 6   k_10      11703 non-null  object
 7   k_15      11703 non-null  object
 8   k_20      11703 non-null  object
 9   k_half    11703 non-null  object
 10  k_25      11703 non-null  object
 11  k_30      11703 non-null  object
 12  k_35      11703 non-null  object
 13  k_40      11703 non-null  object
 14  k_finish  11703 non-null  object
dtypes: object(15)
memory usage: 1.3+ MB


In [50]:
df_ham_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_ham_full = expand_splits(df_ham_full)
# Drop the splits Lists.
df_ham_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [51]:
df_ham_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11757 entries, 0 to 11756
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          11757 non-null  object
 1   age_cat         11753 non-null  object
 2   gender          11757 non-null  object
 3   finish          8688 non-null   object
 4   idp             11757 non-null  object
 5   k_5_time        11703 non-null  object
 6   k_5_pace        11703 non-null  object
 7   k_5_speed       11703 non-null  object
 8   k_10_time       11703 non-null  object
 9   k_10_pace       11703 non-null  object
 10  k_10_speed      11703 non-null  object
 11  k_15_time       11703 non-null  object
 12  k_15_pace       11703 non-null  object
 13  k_15_speed      11703 non-null  object
 14  k_20_time       11703 non-null  object
 15  k_20_pace       11703 non-null  object
 16  k_20_speed      11703 non-null  object
 17  k_half_time     11703 non-null  object
 18  k_half

In [52]:
df_ham_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,k_5_time,k_5_pace,k_5_speed,k_10_time,k_10_pace,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M/W 30 1989-1993,M,02:08:08,84VSK2IQ371AD2,00:15:25,03:05,19.46,00:30:17,02:59,...,19.82,01:46:06,03:00,20.00,02:01:25,03:04,19.59,02:08:08,03:04,19.61
1,10,M/W 30 1989-1993,M,,84VSK2IQ371ADB,00:15:33,03:07,19.29,00:30:45,03:03,...,15.50,-,-,-,-,-,-,-,-,-
2,10001,M/W 30 1989-1993,M,04:01:13,HCHSK2IQ361120,00:25:42,05:09,11.67,00:50:26,04:57,...,9.21,03:13:19,06:25,9.37,03:48:06,06:58,8.62,04:01:13,05:59,10.04
3,10002,M/W 30 1989-1993,M,,HCHSK2IQ36FDF6,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
4,10003,M/W 55 1964-1968,M,04:07:10,HCHSK2IQ36C5B7,00:27:08,05:26,11.06,00:53:19,05:15,...,9.36,03:21:45,06:12,9.68,03:54:28,06:33,9.17,04:07:10,05:48,10.37


In [53]:
df_ham_full.to_csv(ham_data_path+f"/{HAM_NAME}2023_full.csv", index=False)

In [54]:
del df_ham_full, df_ham_res, df_ham_splits, ham_data_path

## Houston (speed and pace is in miles/h and min/mile respectively)

In [31]:
HOU_NAME = "Houston"

# Hamburg URLs Templates.
# {0}: Year || {1}: Page Number || {2}: sex || {3}: Number of results per page.
HOUSTON_MARATHON_URL: str = "http://results.houstonmarathon.com/{0}/?pid=search&page={1}&sex={2}&num_results={3}&search_event=MARA"
# {0}: Year || {1}: runner id
HOUSTON_MARATHON_SPLIT_URL: str = "http://results.houstonmarathon.com/{0}/?content=detail&fpid=search&pid=search&idp={1}&lang=EN_CAP&event=MARA"

HOU_NUM_RESULTS: str = "1000"

HOU_RES_FIELDS: list[str] = ["run_no", "age_cat", "gender", "finish", "idp"]
HOU_SPLITS_FIELDS: list[str] = ["idp", "race_state", "k_5", "k_10", "k_15", "k_half", "k_25", "k_30", "k_35", "k_40", "k_finish"]

In [32]:
# Initialising london marathon object. 
houston = HoustonMarathon(url_template=HOUSTON_MARATHON_URL, split_url_template=HOUSTON_MARATHON_SPLIT_URL)

### 2018

In [33]:
hou_data_path = f"Marathons_Data/Raw/Houston/Houston{YEAR_18}"

#### Results Pages

In [34]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
hou_pages_urls, hou_res_settings = houston.gen_res_scrap_info(YEAR_18, HOU_NUM_RESULTS, HOU_RES_FIELDS, 
                                                              hou_data_path, show_settings=True)

Men Pages: 5 || Women Pages: 3
Houston 2018 total results pages: 8
Example URLs: 
 http://results.houstonmarathon.com/2018/?pid=search&page=1&sex=M&num_results=1000&search_event=MARA 
 http://results.houstonmarathon.com/2018/?pid=search&page=1&sex=W&num_results=1000&search_event=MARA
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Houston/Houston2018/Houston2018_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(houston_spiders.Houston1819, hou_pages_urls, hou_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 312caf8e17241a96


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (7547 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Houston/Houston2018/Houston2018_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2360,
 'downloader/request_count': 8,
 'downloader/request_method_count/GET': 8,
 'downloader/response_bytes': 811402,
 'downloader/response_count': 8,
 'downloader/response_status_count/200': 8,
 'elapsed_time_seconds': 4.090946,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 22, 6, 970386),
 'httpcompression/response_bytes': 16933908,
 'httpcompression/response_count': 8,
 'item_scraped_count': 7547,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 8,
 'scheduler/dequeued': 8,
 'scheduler/dequeued/memory': 8,
 'scheduler/enqueued': 8,
 'scheduler/enqueued/memory': 8,
 'start_time': dat

In [17]:
df_hou_res = pd.read_csv(hou_data_path+f"/{HOU_NAME}2018_res.csv")

In [18]:
df_hou_res.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp
0,7916,50-54,M,05:46:51,99999912E1A2E700001A75FA
1,2686,50-54,M,03:38:29,99999912E1A2E700001A7D92
2,6997,45-49,M,04:00:53,99999912E1A2E700001AA604
3,2723,40-44,M,03:20:18,99999912E1A2E700001A8196
4,5077,30-34,M,05:46:47,99999912E1A2E700001A9F96


In [19]:
df_hou_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7547 entries, 0 to 7546
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   7547 non-null   int64 
 1   age_cat  7547 non-null   object
 2   gender   7547 non-null   object
 3   finish   7547 non-null   object
 4   idp      7547 non-null   object
dtypes: int64(1), object(4)
memory usage: 294.9+ KB


#### Splits Pages

In [36]:
df_hou_res = pd.read_csv(hou_data_path+f"/{HOU_NAME}2018_res.csv")
hou_splits_urls, hou_splits_settings = houston.gen_splits_scrap_info(YEAR_18, df_hou_res["idp"].to_list(), HOU_SPLITS_FIELDS, 
                                                                     hou_data_path, show_settings=True)

Houston 2018 total splits pages: 7547
Example URLs: 
 http://results.houstonmarathon.com/2018/?content=detail&fpid=search&pid=search&idp=99999912E1A2E700001A75FA&lang=EN_CAP&event=MARA 
 http://results.houstonmarathon.com/2018/?content=detail&fpid=search&pid=search&idp=99999912E1A2E700001AE75D&lang=EN_CAP&event=MARA
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Houston/Houston2018/Houston2018_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'k_5', 'k_10', 'k_15', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [37]:
run_spider(houston_spiders.Houston1819, urls=hou_splits_urls, settings=hou_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 13c077d550f1501e


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (7547 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Houston/Houston2018/Houston2018_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2460322,
 'downloader/request_count': 7547,
 'downloader/request_method_count/GET': 7547,
 'downloader/response_bytes': 57157539,
 'downloader/response_count': 7547,
 'downloader/response_status_count/200': 7547,
 'elapsed_time_seconds': 54.431283,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 23, 12, 520813),
 'httpcompression/response_bytes': 187907381,
 'httpcompression/response_count': 7547,
 'item_scraped_count': 7547,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 7547,
 'scheduler/dequeued': 7547,
 'scheduler/dequeued/memory': 7547,
 'scheduler/enqueued': 7547,
 'scheduler/e

In [25]:
df_hou_splits = pd.read_csv(hou_data_path+f"/{HOU_NAME}2018_splits.csv")

In [26]:
df_hou_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7547 entries, 0 to 7546
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         7547 non-null   object
 1   race_state  7547 non-null   object
 2   k_5         7547 non-null   object
 3   k_10        7547 non-null   object
 4   k_15        7547 non-null   object
 5   k_half      7547 non-null   object
 6   k_25        7547 non-null   object
 7   k_30        7547 non-null   object
 8   k_35        7547 non-null   object
 9   k_40        7547 non-null   object
 10  k_finish    7547 non-null   object
dtypes: object(11)
memory usage: 648.7+ KB


#### Full Raw Dataset for Houston 2018

In [27]:
df_hou_res    = pd.read_csv(hou_data_path+f"/{HOU_NAME}2018_res.csv")
df_hou_splits = pd.read_csv(hou_data_path+f"/{HOU_NAME}2018_splits.csv")
df_hou_full   = pd.merge(df_hou_res, df_hou_splits, on="idp")

In [28]:
df_hou_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7547 entries, 0 to 7546
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      7547 non-null   int64 
 1   age_cat     7547 non-null   object
 2   gender      7547 non-null   object
 3   finish      7547 non-null   object
 4   idp         7547 non-null   object
 5   race_state  7547 non-null   object
 6   k_5         7547 non-null   object
 7   k_10        7547 non-null   object
 8   k_15        7547 non-null   object
 9   k_half      7547 non-null   object
 10  k_25        7547 non-null   object
 11  k_30        7547 non-null   object
 12  k_35        7547 non-null   object
 13  k_40        7547 non-null   object
 14  k_finish    7547 non-null   object
dtypes: int64(1), object(14)
memory usage: 884.5+ KB


In [29]:
df_hou_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_hou_full = expand_splits(df_hou_full)
# Drop the splits Lists.
df_hou_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [30]:
df_hou_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7547 entries, 0 to 7546
Data columns (total 36 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   run_no          7547 non-null   int64  
 1   age_cat         7547 non-null   object 
 2   gender          7547 non-null   object 
 3   finish          7547 non-null   object 
 4   idp             7547 non-null   object 
 5   race_state      7547 non-null   object 
 6   k_5_time        7547 non-null   object 
 7   k_5_pace        7547 non-null   object 
 8   k_5_speed       7547 non-null   object 
 9   k_10_time       7547 non-null   object 
 10  k_10_pace       7547 non-null   object 
 11  k_10_speed      7547 non-null   object 
 12  k_15_time       7547 non-null   object 
 13  k_15_pace       7547 non-null   object 
 14  k_15_speed      7547 non-null   object 
 15  k_20_time       0 non-null      float64
 16  k_20_pace       0 non-null      float64
 17  k_20_speed      0 non-null      f

In [31]:
df_hou_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,race_state,k_5_time,k_5_pace,k_5_speed,k_10_time,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,Elites,M,DNF,99999912E1A2E700001AE91E,DNF,00:15:38,05:02,11.93,-,...,-,-,-,-,-,-,-,-,-,-
1,2,Elites,M,02:08:30,99999912E1A2E700001AEB4E,Finished,00:15:05,04:52,12.36,00:30:12,...,12.32,01:46:22,05:02,11.94,02:02:06,05:04,11.85,02:08:30,04:42,12.79
2,3,Elites,M,02:09:07,99999912E1A2E700001AEB4F,Finished,00:15:05,04:52,12.36,00:30:12,...,12.50,01:45:44,04:54,12.27,02:01:43,05:09,11.67,02:09:07,05:26,11.06
3,5,Elites,M,02:15:01,99999912E1A2E700001AE924,Finished,00:15:12,04:54,12.27,00:31:18,...,11.44,01:51:52,05:15,11.45,02:08:18,05:18,11.35,02:15:01,04:56,12.19
4,7,Elites,M,02:09:32,99999912E1A2E700001AE91F,Finished,00:15:05,04:52,12.36,00:30:12,...,12.27,01:46:32,05:04,11.88,02:02:29,05:08,11.69,02:09:32,05:11,11.61


In [32]:
df_hou_full.to_csv(hou_data_path+f"/{HOU_NAME}2018_full.csv", index=False)

In [33]:
del df_hou_full, df_hou_res, df_hou_splits, hou_data_path

### 2019

In [48]:
hou_data_path = f"Marathons_Data/Raw/Houston/Houston{YEAR_19}"

#### Results Pages

In [49]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
hou_pages_urls, hou_res_settings = houston.gen_res_scrap_info(YEAR_19, HOU_NUM_RESULTS, HOU_RES_FIELDS, 
                                                              hou_data_path, show_settings=True)

Men Pages: 5 || Women Pages: 3
Houston 2019 total results pages: 8
Example URLs: 
 http://results.houstonmarathon.com/2019/?pid=search&page=1&sex=M&num_results=1000&search_event=MARA 
 http://results.houstonmarathon.com/2019/?pid=search&page=1&sex=W&num_results=1000&search_event=MARA
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Houston/Houston2019/Houston2019_res.csv': {'format': 'csv', 'fields': ['run_no', 'age_cat', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(houston_spiders.Houston1819, hou_pages_urls, hou_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 9d4cae0510e62460


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (7159 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Houston/Houston2019/Houston2019_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2360,
 'downloader/request_count': 8,
 'downloader/request_method_count/GET': 8,
 'downloader/response_bytes': 727791,
 'downloader/response_count': 8,
 'downloader/response_status_count/200': 8,
 'elapsed_time_seconds': 3.488757,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 16, 17, 48, 1, 203968),
 'httpcompression/response_bytes': 15006840,
 'httpcompression/response_count': 8,
 'item_scraped_count': 7159,
 'log_count/INFO': 12,
 'memusage/max': 282656768,
 'memusage/startup': 282656768,
 'response_received_count': 8,
 'scheduler/dequeued': 8,
 'scheduler/dequeued/memory': 8,
 'scheduler/enqueued': 8,
 'scheduler/enqueued/memory': 8,
 'start_time': dat

In [23]:
df_hou_res = pd.read_csv(hou_data_path+f"/{HOU_NAME}2019_res.csv")

In [24]:
df_hou_res.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp
0,10004,45-49,M,04:32:12,99999912E1A2EA00001E860F
1,1502,50-54,M,03:44:40,99999912E1A2EA00001E493C
2,1503,50-54,M,02:50:32,99999912E1A2EA00001E80B6
3,5003,35-39,M,03:59:38,99999912E1A2EA00001E4FB3
4,1505,45-49,M,03:16:16,99999912E1A2EA00001E8477


In [25]:
df_hou_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7159 entries, 0 to 7158
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   7159 non-null   int64 
 1   age_cat  7159 non-null   object
 2   gender   7159 non-null   object
 3   finish   7159 non-null   object
 4   idp      7159 non-null   object
dtypes: int64(1), object(4)
memory usage: 279.8+ KB


#### Splits Pages

In [50]:
df_hou_res = pd.read_csv(hou_data_path+f"/{HOU_NAME}2019_res.csv")
hou_splits_urls, hou_splits_settings = houston.gen_splits_scrap_info(YEAR_19, df_hou_res["idp"].to_list(), HOU_SPLITS_FIELDS, 
                                                                     hou_data_path, show_settings=True)

Houston 2019 total splits pages: 7159
Example URLs: 
 http://results.houstonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=99999912E1A2EA00001E860F&lang=EN_CAP&event=MARA 
 http://results.houstonmarathon.com/2019/?content=detail&fpid=search&pid=search&idp=99999912E1A2EA00001E58FF&lang=EN_CAP&event=MARA
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Houston/Houston2019/Houston2019_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'k_5', 'k_10', 'k_15', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(houston_spiders.Houston1819, urls=hou_splits_urls, settings=hou_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: ec6a15b8d1a2a8b2
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (7159 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Houston/Houston2019/Houston2019_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2333834,
 'downloader/request_count': 7159,
 'downloader/request_method_count/GET': 7159,
 'downloader/response_bytes': 55665111,
 'downloader/response_count': 7159,
 'downloader/response_status_count/200': 7159,
 'elapsed_time_seconds': 40.710724,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 16, 18, 22, 18, 519055),
 'httpcompression/response_bytes': 191065291,
 'httpcompression/response_count': 7159,
 'item_scraped_count': 7159,
 'log_count/INFO': 12,
 'memusage/max': 145588224,
 'memusage/startup': 145588224,
 'response_received_count': 7159,
 'scheduler/dequeued': 7159,
 'scheduler/dequeued/memory': 7159,
 'scheduler/enqueued': 7159,
 'scheduler/e

In [23]:
df_hou_splits = pd.read_csv(hou_data_path+f"/{HOU_NAME}2019_splits.csv")

In [11]:
df_hou_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7159 entries, 0 to 7158
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         7159 non-null   object
 1   race_state  7159 non-null   object
 2   k_5         7159 non-null   object
 3   k_10        7159 non-null   object
 4   k_15        7159 non-null   object
 5   k_half      7159 non-null   object
 6   k_25        7159 non-null   object
 7   k_30        7159 non-null   object
 8   k_35        7159 non-null   object
 9   k_40        7159 non-null   object
 10  k_finish    7159 non-null   object
dtypes: object(11)
memory usage: 615.4+ KB


#### Full Raw Dataset for Houston 2019

In [12]:
df_hou_res    = pd.read_csv(hou_data_path+f"/{HOU_NAME}2019_res.csv")
df_hou_splits = pd.read_csv(hou_data_path+f"/{HOU_NAME}2019_splits.csv")
df_hou_full   = pd.merge(df_hou_res, df_hou_splits, on="idp")

In [13]:
df_hou_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7159 entries, 0 to 7158
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      7159 non-null   int64 
 1   age_cat     7159 non-null   object
 2   gender      7159 non-null   object
 3   finish      7159 non-null   object
 4   idp         7159 non-null   object
 5   race_state  7159 non-null   object
 6   k_5         7159 non-null   object
 7   k_10        7159 non-null   object
 8   k_15        7159 non-null   object
 9   k_half      7159 non-null   object
 10  k_25        7159 non-null   object
 11  k_30        7159 non-null   object
 12  k_35        7159 non-null   object
 13  k_40        7159 non-null   object
 14  k_finish    7159 non-null   object
dtypes: int64(1), object(14)
memory usage: 839.1+ KB


In [14]:
df_hou_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_hou_full = expand_splits(df_hou_full)
# Drop the splits Lists.
df_hou_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [15]:
df_hou_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7159 entries, 0 to 7158
Data columns (total 36 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   run_no          7159 non-null   int64  
 1   age_cat         7159 non-null   object 
 2   gender          7159 non-null   object 
 3   finish          7159 non-null   object 
 4   idp             7159 non-null   object 
 5   race_state      7159 non-null   object 
 6   k_5_time        7159 non-null   object 
 7   k_5_pace        7159 non-null   object 
 8   k_5_speed       7159 non-null   object 
 9   k_10_time       7159 non-null   object 
 10  k_10_pace       7159 non-null   object 
 11  k_10_speed      7159 non-null   object 
 12  k_15_time       7159 non-null   object 
 13  k_15_pace       7159 non-null   object 
 14  k_15_speed      7159 non-null   object 
 15  k_20_time       0 non-null      float64
 16  k_20_pace       0 non-null      float64
 17  k_20_speed      0 non-null      f

In [16]:
df_hou_full.head()

Unnamed: 0,run_no,age_cat,gender,finish,idp,race_state,k_5_time,k_5_pace,k_5_speed,k_10_time,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,Elites,M,02:10:08,99999912E1A2EA00001E51CD,Finished,00:15:30,05:00,12.03,00:30:49,...,12.43,01:48:51,05:01,11.99,02:03:42,04:47,12.56,02:10:08,04:43,12.72
1,2,Elites,M,02:11:30,99999912E1A2EA00001E51D0,Finished,00:15:26,04:58,12.08,00:30:48,...,12.42,01:48:51,05:00,12.00,02:04:28,05:02,11.94,02:11:30,05:10,11.64
2,3,Elites,M,02:14:51,99999912E1A2EA00001E509D,Finished,00:15:30,05:00,12.03,00:30:51,...,11.73,01:51:22,05:26,11.04,02:07:37,05:14,11.47,02:14:51,05:19,11.32
3,4,Elites,M,DNF,99999912E1A2EA00001E51CB,DNF,00:15:30,05:00,12.03,00:31:00,...,-,-,-,-,-,-,-,-,-,-
4,5,Elites,M,02:10:02,99999912E1A2EA00001E8B49,Finished,00:15:27,04:59,12.07,00:30:49,...,12.42,01:48:50,05:00,12.00,02:03:42,04:48,12.54,02:10:02,04:39,12.92


In [18]:
df_hou_full.to_csv(hou_data_path+f"/{HOU_NAME}2019_full.csv", index=False)

In [19]:
del df_hou_full, df_hou_res, df_hou_splits, hou_data_path

## Stockholm (Provide YOB year of birth instead of age category).

In [38]:
STO_NAME = "Stockholm"

# Stockholm URLs Templates.
# {0}: Year || {1}: Page Number || {2}: sex || {3}: Number of results per page.
STOCKHOLM_MARATHON_URL: str = "http://results.marathon.se/{0}/?pid=search&pidp=start&page={1}&sex={2}&num_results={3}&lang=EN_CAP&event=STHM"

# {0}: Year || {1}: runner id
STOCKHOLM_MARATHON_SPLIT_URL: str = "http://results.marathon.se/{0}/?content=detail&fpid=search&pid=search&idp={1}&lang=EN_CAP&event=STHM"

STO_NUM_RESULTS: str = "250"

STO_RES_FIELDS: list[str] = ["run_no", "gender", "finish", "idp"]
STO_SPLITS_FIELDS: list[str] = ["idp", "race_state", "yob", "k_5", "k_10", "k_15", "k_20", "k_half", "k_25", "k_30", "k_35", "k_40", "k_finish"]


In [39]:
stockholm = StockHolmMarathon(url_template=STOCKHOLM_MARATHON_URL, split_url_template=STOCKHOLM_MARATHON_SPLIT_URL)

### 2021

In [40]:
sto_data_path = f"Marathons_Data/Raw/Stockholm/Stockholm{YEAR_21}"

#### Results Pages

In [41]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
sto_pages_urls, sto_res_settings = stockholm.gen_res_scrap_info(YEAR_21, STO_NUM_RESULTS, STO_RES_FIELDS, 
                                                                sto_data_path, show_settings=True)

Men Pages: 35 || Women Pages: 14
Stockholm 2021 total results pages: 49
Example URLs: 
 http://results.marathon.se/2021/?pid=search&pidp=start&page=1&sex=M&num_results=250&lang=EN_CAP&event=STHM 
 http://results.marathon.se/2021/?pid=search&pidp=start&page=1&sex=W&num_results=250&lang=EN_CAP&event=STHM
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Stockholm/Stockholm2021/Stockholm2021_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(stockholm_spiders.Stockholm2122, sto_pages_urls, sto_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 050f993d4e70d072


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (12179 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Stockholm/Stockholm2021/Stockholm2021_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 14829,
 'downloader/request_count': 49,
 'downloader/request_method_count/GET': 49,
 'downloader/response_bytes': 1171099,
 'downloader/response_count': 49,
 'downloader/response_status_count/200': 49,
 'elapsed_time_seconds': 9.42462,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 27, 21, 46067),
 'httpcompression/response_bytes': 19684441,
 'httpcompression/response_count': 49,
 'item_scraped_count': 12179,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 49,
 'scheduler/dequeued': 49,
 'scheduler/dequeued/memory': 49,
 'scheduler/enqueued': 49,
 'scheduler/enqueued/memory': 49,

In [11]:
df_sto_res = pd.read_csv(sto_data_path+f"/{STO_NAME}2021_res.csv")

In [12]:
df_sto_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,11428,M,04:26:42,9TGBEQLS87B46
1,11180,M,04:39:11,9TGBEQLS86E23
2,8038,M,04:04:16,9TGBEQLS84C13
3,6590,M,04:15:54,9TGBEQLS86D37
4,3340,M,03:06:04,9TGBEQLS882AC


In [13]:
df_sto_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12179 entries, 0 to 12178
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  12179 non-null  int64 
 1   gender  12179 non-null  object
 2   finish  6958 non-null   object
 3   idp     12179 non-null  object
dtypes: int64(1), object(3)
memory usage: 380.7+ KB


#### Splits Pages

In [43]:
df_sto_res = pd.read_csv(sto_data_path+f"/{STO_NAME}2021_res.csv")
sto_splits_urls, sto_splits_settings = stockholm.gen_splits_scrap_info(YEAR_21, df_sto_res["idp"].to_list(), STO_SPLITS_FIELDS, 
                                                                       sto_data_path, show_settings=True)

Stockholm 2021 total splits pages: 12179
Example URLs: 
 http://results.marathon.se/2021/?content=detail&fpid=search&pid=search&idp=9TGBEQLS843EF&lang=EN_CAP&event=STHM 
 http://results.marathon.se/2021/?content=detail&fpid=search&pid=search&idp=9TGBEQLS83F39&lang=EN_CAP&event=STHM
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Stockholm/Stockholm2021/Stockholm2021_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'yob', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(stockholm_spiders.Stockholm2122, urls=sto_splits_urls, settings=sto_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 713c67682bb13263


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (100 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Stockholm/Stockholm2021/Stockholm2021_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 30700,
 'downloader/request_count': 100,
 'downloader/request_method_count/GET': 100,
 'downloader/response_bytes': 905577,
 'downloader/response_count': 100,
 'downloader/response_status_count/200': 100,
 'elapsed_time_seconds': 1.090697,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 27, 50, 321878),
 'httpcompression/response_bytes': 3337287,
 'httpcompression/response_count': 100,
 'item_scraped_count': 100,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 100,
 'scheduler/dequeued': 100,
 'scheduler/dequeued/memory': 100,
 'scheduler/enqueued': 100,
 'scheduler/enqueued/memo

In [11]:
df_sto_splits = pd.read_csv(sto_data_path+f"/{STO_NAME}2021_splits.csv")

In [12]:
df_sto_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12179 entries, 0 to 12178
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   idp         12179 non-null  object 
 1   race_state  6958 non-null   object 
 2   yob         12108 non-null  float64
 3   k_5         12179 non-null  object 
 4   k_10        12179 non-null  object 
 5   k_15        12179 non-null  object 
 6   k_20        12179 non-null  object 
 7   k_half      12179 non-null  object 
 8   k_25        12179 non-null  object 
 9   k_30        12179 non-null  object 
 10  k_35        12179 non-null  object 
 11  k_40        12179 non-null  object 
 12  k_finish    12179 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.2+ MB


#### Full Raw Dataset for Stockholm 2021

In [25]:
df_sto_res    = pd.read_csv(sto_data_path+f"/{STO_NAME}2021_res.csv")
df_sto_splits = pd.read_csv(sto_data_path+f"/{STO_NAME}2021_splits.csv")
df_sto_full   = pd.merge(df_sto_res, df_sto_splits, on="idp")

In [26]:
df_sto_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12179 entries, 0 to 12178
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   run_no      12179 non-null  int64  
 1   gender      12179 non-null  object 
 2   finish      6958 non-null   object 
 3   idp         12179 non-null  object 
 4   race_state  6958 non-null   object 
 5   yob         12108 non-null  float64
 6   k_5         12179 non-null  object 
 7   k_10        12179 non-null  object 
 8   k_15        12179 non-null  object 
 9   k_20        12179 non-null  object 
 10  k_half      12179 non-null  object 
 11  k_25        12179 non-null  object 
 12  k_30        12179 non-null  object 
 13  k_35        12179 non-null  object 
 14  k_40        12179 non-null  object 
 15  k_finish    12179 non-null  object 
dtypes: float64(1), int64(1), object(14)
memory usage: 1.5+ MB


In [27]:
df_sto_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_sto_full = expand_splits(df_sto_full)
# Drop the splits Lists.
df_sto_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [28]:
df_sto_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12179 entries, 0 to 12178
Data columns (total 36 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   run_no          12179 non-null  int64  
 1   gender          12179 non-null  object 
 2   finish          6958 non-null   object 
 3   idp             12179 non-null  object 
 4   race_state      6958 non-null   object 
 5   yob             12108 non-null  float64
 6   k_5_time        12179 non-null  object 
 7   k_5_pace        12179 non-null  object 
 8   k_5_speed       12179 non-null  object 
 9   k_10_time       12179 non-null  object 
 10  k_10_pace       12179 non-null  object 
 11  k_10_speed      12179 non-null  object 
 12  k_15_time       12179 non-null  object 
 13  k_15_pace       12179 non-null  object 
 14  k_15_speed      12179 non-null  object 
 15  k_20_time       12179 non-null  object 
 16  k_20_pace       12179 non-null  object 
 17  k_20_speed      12179 non-null 

In [29]:
df_sto_full.head()

Unnamed: 0,run_no,gender,finish,idp,race_state,yob,k_5_time,k_5_pace,k_5_speed,k_10_time,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,02:12:30,9TGBEQLS879EF,Finished,91.0,00:15:07,03:02,19.85,00:30:41,...,18.89,01:50:10,03:07,19.33,02:05:40,03:06,19.35,02:12:30,03:07,19.27
1,2,M,02:12:24,9TGBEQLS879F0,Finished,93.0,00:15:06,03:02,19.87,00:30:42,...,18.89,01:50:10,03:07,19.33,02:05:40,03:06,19.35,02:12:24,03:05,19.56
2,3,M,02:14:32,9TGBEQLS879F1,Finished,88.0,00:15:07,03:02,19.85,00:30:43,...,18.79,01:50:46,03:13,18.73,02:07:03,03:16,18.42,02:14:32,03:25,17.6
3,4,M,02:13:53,9TGBEQLS879F2,Finished,89.0,00:15:06,03:02,19.87,00:30:41,...,18.87,01:50:10,03:06,19.35,02:05:56,03:10,19.03,02:13:53,03:38,16.57
4,5,M,02:15:43,9TGBEQLS879F4,Finished,87.0,00:15:07,03:02,19.85,00:30:41,...,18.73,-,-,-,02:07:43,03:14,18.59,02:15:43,03:39,16.46


In [30]:
df_sto_full.to_csv(sto_data_path+f"/{STO_NAME}2021_full.csv", index=False)

In [31]:
del df_sto_full, df_sto_res, df_sto_splits, sto_data_path

### 2022

In [57]:
sto_data_path = f"Marathons_Data/Raw/Stockholm/Stockholm{YEAR_22}"

#### Results Pages

In [58]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
sto_pages_urls, sto_res_settings = stockholm.gen_res_scrap_info(YEAR_22, STO_NUM_RESULTS, STO_RES_FIELDS, 
                                                                sto_data_path, show_settings=True)

Men Pages: 39 || Women Pages: 16
Stockholm 2022 total results pages: 55
Example URLs: 
 http://results.marathon.se/2022/?pid=search&pidp=start&page=1&sex=M&num_results=250&lang=EN_CAP&event=STHM 
 http://results.marathon.se/2022/?pid=search&pidp=start&page=1&sex=W&num_results=250&lang=EN_CAP&event=STHM
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Stockholm/Stockholm2022/Stockholm2022_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(stockholm_spiders.Stockholm2122, sto_pages_urls, sto_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 96fe6748a53bdd4a
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (13593 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Stockholm/Stockholm2022/Stockholm2022_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 16647,
 'downloader/request_count': 55,
 'downloader/request_method_count/GET': 55,
 'downloader/response_bytes': 1432603,
 'downloader/response_count': 55,
 'downloader/response_status_count/200': 55,
 'elapsed_time_seconds': 11.22234,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 19, 21, 12, 29, 710559),
 'httpcompression/response_bytes': 23833075,
 'httpcompression/response_count': 55,
 'item_scraped_count': 13593,
 'log_count/INFO': 12,
 'memusage/max': 155680768,
 'memusage/startup': 155680768,
 'response_received_count': 55,
 'scheduler/dequeued': 55,
 'scheduler/dequeued/memory': 55,
 'scheduler/enqueued': 55,
 'scheduler/enqueued/memory': 5

In [10]:
df_sto_res = pd.read_csv(sto_data_path+f"/{STO_NAME}2022_res.csv")

In [11]:
df_sto_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,3672,M,03:50:20,BH2BEQLSA76A3
1,6145,M,04:11:50,BH2BEQLSA6390
2,6424,M,03:39:34,BH2BEQLSA6E7D
3,3504,M,03:47:13,BH2BEQLSA6E09
4,16103,M,04:05:08,BH2BEQLSAA1D3


In [12]:
df_sto_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13593 entries, 0 to 13592
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  13593 non-null  int64 
 1   gender  13593 non-null  object
 2   finish  9499 non-null   object
 3   idp     13593 non-null  object
dtypes: int64(1), object(3)
memory usage: 424.9+ KB


#### Splits Pages

In [59]:
df_sto_res = pd.read_csv(sto_data_path+f"/{STO_NAME}2022_res.csv")
sto_splits_urls, sto_splits_settings = stockholm.gen_splits_scrap_info(YEAR_22, df_sto_res["idp"].to_list(), STO_SPLITS_FIELDS, 
                                                                       sto_data_path, show_settings=True)

Stockholm 2022 total splits pages: 13593
Example URLs: 
 http://results.marathon.se/2022/?content=detail&fpid=search&pid=search&idp=BH2BEQLSA76A3&lang=EN_CAP&event=STHM 
 http://results.marathon.se/2022/?content=detail&fpid=search&pid=search&idp=BH2BEQLSA78A1&lang=EN_CAP&event=STHM
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Stockholm/Stockholm2022/Stockholm2022_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'yob', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(stockholm_spiders.Stockholm2122, urls=sto_splits_urls, settings=sto_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: be9c4fccc49034ad
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 7114 pages (at 7114 pages/min), scraped 7102 items (at 7102 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (13593 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Stockholm/Stockholm2022/Stockholm2022_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 4173051,
 'downloader/request_count': 13593,
 'downloader/request_method_count/GET': 13593,
 'downloader/response_bytes': 127187239,
 'downloader/response_count': 13593,
 'downloader/response_status_count/200': 13593,
 'elapsed_time_seconds': 111.362578,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 19, 21, 21, 40, 503043),
 'httpcompression/response_bytes': 467445897,
 'httpcompression/response_count': 13593,
 'item_scraped_count': 13593,
 'log_count/INFO': 13,
 'memusage/max': 221085696,
 'memusage/startup': 176717824,
 'response_received_count': 13593,
 'sche

In [12]:
df_sto_splits = pd.read_csv(sto_data_path+f"/{STO_NAME}2022_splits.csv")

In [13]:
df_sto_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13593 entries, 0 to 13592
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   idp         13593 non-null  object 
 1   race_state  9498 non-null   object 
 2   yob         13460 non-null  float64
 3   k_5         13592 non-null  object 
 4   k_10        13592 non-null  object 
 5   k_15        13592 non-null  object 
 6   k_20        13592 non-null  object 
 7   k_half      13592 non-null  object 
 8   k_25        13592 non-null  object 
 9   k_30        13592 non-null  object 
 10  k_35        13592 non-null  object 
 11  k_40        13592 non-null  object 
 12  k_finish    13592 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.3+ MB


#### Full Raw Dataset for Stockholm 2022

In [16]:
df_sto_res    = pd.read_csv(sto_data_path+f"/{STO_NAME}2022_res.csv")
df_sto_splits = pd.read_csv(sto_data_path+f"/{STO_NAME}2022_splits.csv")
df_sto_full   = pd.merge(df_sto_res, df_sto_splits, on="idp")

In [17]:
df_sto_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13593 entries, 0 to 13592
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   run_no      13593 non-null  int64  
 1   gender      13593 non-null  object 
 2   finish      9499 non-null   object 
 3   idp         13593 non-null  object 
 4   race_state  9498 non-null   object 
 5   yob         13460 non-null  float64
 6   k_5         13592 non-null  object 
 7   k_10        13592 non-null  object 
 8   k_15        13592 non-null  object 
 9   k_20        13592 non-null  object 
 10  k_half      13592 non-null  object 
 11  k_25        13592 non-null  object 
 12  k_30        13592 non-null  object 
 13  k_35        13592 non-null  object 
 14  k_40        13592 non-null  object 
 15  k_finish    13592 non-null  object 
dtypes: float64(1), int64(1), object(14)
memory usage: 1.7+ MB


In [18]:
df_sto_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_sto_full = expand_splits(df_sto_full)
# Drop the splits Lists.
df_sto_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [19]:
df_sto_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13593 entries, 0 to 13592
Data columns (total 36 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   run_no          13593 non-null  int64  
 1   gender          13593 non-null  object 
 2   finish          9499 non-null   object 
 3   idp             13593 non-null  object 
 4   race_state      9498 non-null   object 
 5   yob             13460 non-null  float64
 6   k_5_time        13592 non-null  object 
 7   k_5_pace        13592 non-null  object 
 8   k_5_speed       13592 non-null  object 
 9   k_10_time       13592 non-null  object 
 10  k_10_pace       13592 non-null  object 
 11  k_10_speed      13592 non-null  object 
 12  k_15_time       13592 non-null  object 
 13  k_15_pace       13592 non-null  object 
 14  k_15_speed      13592 non-null  object 
 15  k_20_time       13592 non-null  object 
 16  k_20_pace       13592 non-null  object 
 17  k_20_speed      13592 non-null 

In [20]:
df_sto_full.head()

Unnamed: 0,run_no,gender,finish,idp,race_state,yob,k_5_time,k_5_pace,k_5_speed,k_10_time,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,,BH2BEQLSAA0CB,,,00:14:54,02:59,20.13,00:29:56,...,16.93,01:49:44,03:19,18.11,-,-,-,-,-,-
1,2,M,,BH2BEQLSAA0D3,,88.0,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
2,3,M,,BH2BEQLSAA0CF,,90.0,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,4,M,02:11:45,BH2BEQLSAA0CE,Finished,86.0,00:14:55,02:59,20.11,00:29:56,...,16.95,01:48:27,03:04,19.63,02:04:19,03:11,18.91,02:11:45,03:24,17.72
4,5,M,,BH2BEQLSAA0CC,,97.0,00:14:55,02:59,20.11,00:30:09,...,-,-,-,-,-,-,-,-,-,-


In [21]:
df_sto_full.to_csv(sto_data_path+f"/{STO_NAME}2022_full.csv", index=False)

In [22]:
del df_sto_full, df_sto_res, df_sto_splits, sto_data_path

## Boston
#### N.B (speed and pace is in miles/h and min/mile respectively)
#### N.B Some runners have last_split set to the extra splits provided in mile (20 miles, 21 miles, or 25.2 miles).

In [45]:
BOS_NAME = "Boston"

# Boston URLs Templates.
# {0}: Year || {1}: Page Number || {2}: sex || {3}: Number of results per page.
BOSTON_MARATHON_URL: str = "https://results.baa.org/{0}/?pid=search&pidp=start&page={1}&sex={2}&num_results={3}&event=R"

# {0}: Year || {1}: runner id
BOSTON_MARATHON_SPLIT_URL: str = "https://results.baa.org/{0}/?content=detail&fpid=search&pid=search&idp={1}&lang=EN&event=R"

BOS_NUM_RESULTS: str = "1000"

BOS_RES_FIELDS: list[str] = ["run_no", "gender", "finish", "idp"]
BOS_SPLITS_FIELDS: list[str] = ["idp", "age_cat", "race_state", "last_split", "k_5", "k_10", "k_15", "k_20", "k_half", "k_25", "k_30", "k_35", "k_40", "k_finish"]


In [46]:
boston = BostonMarathon(url_template=BOSTON_MARATHON_URL, split_url_template=BOSTON_MARATHON_SPLIT_URL)

### 2014

In [47]:
bos_data_path = f"Marathons_Data/Raw/Boston/Boston{YEAR_14}"

#### Results Pages

In [49]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
bos_pages_urls, bos_res_settings = boston.gen_res_scrap_info(YEAR_14, BOS_NUM_RESULTS, BOS_RES_FIELDS, 
                                                             bos_data_path, show_settings=True)

Men Pages: 20 || Women Pages: 17
Boston 2014 total results pages: 37
Example URLs: 
 https://results.baa.org/2014/?pid=search&pidp=start&page=1&sex=M&num_results=1000&event=R 
 https://results.baa.org/2014/?pid=search&pidp=start&page=1&sex=W&num_results=1000&event=R
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2014/Boston2014_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1417, bos_pages_urls, bos_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 63cef1eef55d6305


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (35671 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2014/Boston2014_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 10527,
 'downloader/request_count': 37,
 'downloader/request_method_count/GET': 37,
 'downloader/response_bytes': 2560287,
 'downloader/response_count': 37,
 'downloader/response_status_count/200': 37,
 'elapsed_time_seconds': 9.06569,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 30, 56, 140710),
 'httpcompression/response_bytes': 28083782,
 'httpcompression/response_count': 37,
 'item_scraped_count': 35671,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 37,
 'scheduler/dequeued': 37,
 'scheduler/dequeued/memory': 37,
 'scheduler/enqueued': 37,
 'scheduler/enqueued/memory': 37,
 'start

In [11]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2014_res.csv")

In [12]:
df_bos_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,19454,M,03:50:19,999999117A732600000A521F
1,27632,M,05:11:46,999999117A732600000A7A5B
2,34631,M,04:56:25,999999117A732600000A851B
3,12791,M,03:44:32,999999117A732600000A9A64
4,5156,M,,999999117A732600000A4843


In [13]:
df_bos_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35671 entries, 0 to 35670
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  35671 non-null  object
 1   gender  35671 non-null  object
 2   finish  31815 non-null  object
 3   idp     35671 non-null  object
dtypes: object(4)
memory usage: 1.1+ MB


#### Splits Pages

In [51]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2014_res.csv")
bos_splits_urls, bos_splits_settings = boston.gen_splits_scrap_info(YEAR_14, df_bos_res["idp"].to_list(), BOS_SPLITS_FIELDS, 
                                                                    bos_data_path, show_settings=True)

Boston 2014 total splits pages: 35671
Example URLs: 
 https://results.baa.org/2014/?content=detail&fpid=search&pid=search&idp=999999117A732600000A521F&lang=EN&event=R 
 https://results.baa.org/2014/?content=detail&fpid=search&pid=search&idp=999999117A732600000A4286&lang=EN&event=R
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2014/Boston2014_splits.csv': {'format': 'csv', 'fields': ['idp', 'age_cat', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1417, urls=bos_splits_urls, settings=bos_splits_settings, splits=True, first_split_idx=2)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 09051a1347ef9618


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (10 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2014/Boston2014_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 3070,
 'downloader/request_count': 10,
 'downloader/request_method_count/GET': 10,
 'downloader/response_bytes': 55133,
 'downloader/response_count': 10,
 'downloader/response_status_count/200': 10,
 'elapsed_time_seconds': 0.336822,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 33, 3, 930914),
 'httpcompression/response_bytes': 170148,
 'httpcompression/response_count': 10,
 'item_scraped_count': 10,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 10,
 'scheduler/dequeued': 10,
 'scheduler/dequeued/memory': 10,
 'scheduler/enqueued': 10,
 'scheduler/enqueued/memory': 10,
 'start_time': 

In [12]:
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2014_splits.csv")

In [13]:
df_bos_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35671 entries, 0 to 35670
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         35671 non-null  object
 1   age_cat     35671 non-null  object
 2   race_state  35671 non-null  object
 3   last_split  32465 non-null  object
 4   k_5         35661 non-null  object
 5   k_10        35661 non-null  object
 6   k_15        35661 non-null  object
 7   k_20        35661 non-null  object
 8   k_half      35661 non-null  object
 9   k_25        35661 non-null  object
 10  k_30        35661 non-null  object
 11  k_35        35661 non-null  object
 12  k_40        35661 non-null  object
 13  k_finish    35661 non-null  object
dtypes: object(14)
memory usage: 3.8+ MB


#### Full Raw Dataset for Boston 2014

In [14]:
df_bos_res    = pd.read_csv(bos_data_path+f"/{BOS_NAME}2014_res.csv")
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2014_splits.csv")
df_bos_full   = pd.merge(df_bos_res, df_bos_splits, on="idp")

In [15]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35671 entries, 0 to 35670
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      35671 non-null  object
 1   gender      35671 non-null  object
 2   finish      31815 non-null  object
 3   idp         35671 non-null  object
 4   age_cat     35671 non-null  object
 5   race_state  35671 non-null  object
 6   last_split  32465 non-null  object
 7   k_5         35661 non-null  object
 8   k_10        35661 non-null  object
 9   k_15        35661 non-null  object
 10  k_20        35661 non-null  object
 11  k_half      35661 non-null  object
 12  k_25        35661 non-null  object
 13  k_30        35661 non-null  object
 14  k_35        35661 non-null  object
 15  k_40        35661 non-null  object
 16  k_finish    35661 non-null  object
dtypes: object(17)
memory usage: 4.6+ MB


In [16]:
df_bos_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_bos_full = expand_splits(df_bos_full)
# Drop the splits Lists.
df_bos_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [17]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35671 entries, 0 to 35670
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          35671 non-null  object
 1   gender          35671 non-null  object
 2   finish          31815 non-null  object
 3   idp             35671 non-null  object
 4   age_cat         35671 non-null  object
 5   race_state      35671 non-null  object
 6   last_split      32465 non-null  object
 7   k_5_time        35661 non-null  object
 8   k_5_pace        35661 non-null  object
 9   k_5_speed       35661 non-null  object
 10  k_10_time       35661 non-null  object
 11  k_10_pace       35661 non-null  object
 12  k_10_speed      35661 non-null  object
 13  k_15_time       35661 non-null  object
 14  k_15_pace       35661 non-null  object
 15  k_15_speed      35661 non-null  object
 16  k_20_time       35661 non-null  object
 17  k_20_pace       35661 non-null  object
 18  k_20_s

In [18]:
df_bos_full.head()

Unnamed: 0,run_no,gender,finish,idp,age_cat,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,,999999117A732600000A7E5B,18-39,started,35K,00:15:10,-,-,...,11.99,01:49:31,05:29,10.97,-,-,-,-,-,-
1,100,M,,999999117A732600000A8257,55-59,not started,,'-','-',-,...,-,'-','-',-,'-','-',-,'-','-',-
2,1000,M,02:49:30,999999117A732600000A1688,18-39,finished,Finish Net,00:20:18,06:24,9.39,...,9.26,02:20:35,06:34,9.14,02:40:32,06:26,9.35,02:49:30,06:35,9.13
3,10000,M,03:14:16,999999117A732600000A19A3,55-59,finished,Finish Net,00:22:58,07:15,8.29,...,7.91,02:39:32,07:52,7.64,03:03:52,07:50,7.66,03:14:16,07:38,7.87
4,10001,M,,999999117A732600000A2622,50-54,not started,,'-','-',-,...,-,'-','-',-,'-','-',-,'-','-',-


In [19]:
df_bos_full.to_csv(bos_data_path+f"/{BOS_NAME}2014_full.csv", index=False)

In [20]:
del df_bos_full, df_bos_res, df_bos_splits, bos_data_path

### 2015

In [66]:
bos_data_path = f"Marathons_Data/Raw/Boston/Boston{YEAR_15}"

#### Results Pages

In [67]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
bos_pages_urls, bos_res_settings = boston.gen_res_scrap_info(YEAR_15, BOS_NUM_RESULTS, BOS_RES_FIELDS, 
                                                             bos_data_path, show_settings=True)

Men Pages: 17 || Women Pages: 14
Boston 2015 total results pages: 31
Example URLs: 
 https://results.baa.org/2015/?pid=search&pidp=start&page=1&sex=M&num_results=1000&event=R 
 https://results.baa.org/2015/?pid=search&pidp=start&page=1&sex=W&num_results=1000&event=R
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2015/Boston2015_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1417, bos_pages_urls, bos_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: c59bf932fc050dad


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (30252 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2015/Boston2015_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8817,
 'downloader/request_count': 31,
 'downloader/request_method_count/GET': 31,
 'downloader/response_bytes': 2158398,
 'downloader/response_count': 31,
 'downloader/response_status_count/200': 31,
 'elapsed_time_seconds': 7.70067,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 20, 15, 15, 21, 39898),
 'httpcompression/response_bytes': 23782577,
 'httpcompression/response_count': 31,
 'item_scraped_count': 30252,
 'log_count/INFO': 12,
 'memusage/max': 432766976,
 'memusage/startup': 432766976,
 'response_received_count': 31,
 'scheduler/dequeued': 31,
 'scheduler/dequeued/memory': 31,
 'scheduler/enqueued': 31,
 'scheduler/enqueued/memory': 31,
 'start_t

In [66]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2015_res.csv")

In [67]:
df_bos_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,99999,M,,000017117A732600000BC031
1,14001,M,03:31:06,999999117A732600000B795B
2,1778,M,03:00:03,999999117A732600000BA5D4
3,9234,M,03:14:50,999999117A732600000B7B2F
4,4266,M,03:08:12,999999117A732600000B8BA8


In [68]:
df_bos_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30252 entries, 0 to 30251
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  30252 non-null  object
 1   gender  30252 non-null  object
 2   finish  26617 non-null  object
 3   idp     30252 non-null  object
dtypes: object(4)
memory usage: 945.5+ KB


#### Splits Pages

In [68]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2015_res.csv")
bos_splits_urls, bos_splits_settings = boston.gen_splits_scrap_info(YEAR_15, df_bos_res["idp"].to_list(), BOS_SPLITS_FIELDS, 
                                                                    bos_data_path, show_settings=True)

Boston 2015 total splits pages: 30252
Example URLs: 
 https://results.baa.org/2015/?content=detail&fpid=search&pid=search&idp=000017117A732600000BC031&lang=EN&event=R 
 https://results.baa.org/2015/?content=detail&fpid=search&pid=search&idp=999999117A732600000B52A0&lang=EN&event=R
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2015/Boston2015_splits.csv': {'format': 'csv', 'fields': ['idp', 'age_cat', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1417, urls=bos_splits_urls, settings=bos_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: abdb6f6fd50bae28


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9348 pages (at 9348 pages/min), scraped 9319 items (at 9319 items/min)
INFO: Crawled 18812 pages (at 9464 pages/min), scraped 18796 items (at 9477 items/min)
INFO: Crawled 28274 pages (at 9462 pages/min), scraped 28259 items (at 9463 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (30252 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2015/Boston2015_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 9287364,
 'downloader/request_count': 30252,
 'downloader/request_method_count/GET': 30252,
 'downloader/response_bytes': 165519704,
 'downloader/response_count': 30252,
 'downloader/response_status_count/200': 30252,
 'elapsed_time_seconds': 192.518951,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 20, 15, 36, 34, 154696),
 'httpcompression/response_bytes': 507806244,
 'httpcompression/response_coun

In [24]:
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2015_splits.csv")

In [25]:
df_bos_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30252 entries, 0 to 30251
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         30252 non-null  object
 1   age_cat     30252 non-null  object
 2   race_state  30252 non-null  object
 3   last_split  27166 non-null  object
 4   k_5         30245 non-null  object
 5   k_10        30245 non-null  object
 6   k_15        30245 non-null  object
 7   k_20        30245 non-null  object
 8   k_half      30245 non-null  object
 9   k_25        30245 non-null  object
 10  k_30        30245 non-null  object
 11  k_35        30245 non-null  object
 12  k_40        30245 non-null  object
 13  k_finish    30245 non-null  object
dtypes: object(14)
memory usage: 3.2+ MB


#### Full Raw Dataset for Boston 2015

In [10]:
df_bos_res    = pd.read_csv(bos_data_path+f"/{BOS_NAME}2015_res.csv")
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2015_splits.csv")
df_bos_full   = pd.merge(df_bos_res, df_bos_splits, on="idp")

In [11]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30252 entries, 0 to 30251
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      30252 non-null  object
 1   gender      30252 non-null  object
 2   finish      26617 non-null  object
 3   idp         30252 non-null  object
 4   age_cat     30252 non-null  object
 5   race_state  30252 non-null  object
 6   last_split  27166 non-null  object
 7   k_5         30245 non-null  object
 8   k_10        30245 non-null  object
 9   k_15        30245 non-null  object
 10  k_20        30245 non-null  object
 11  k_half      30245 non-null  object
 12  k_25        30245 non-null  object
 13  k_30        30245 non-null  object
 14  k_35        30245 non-null  object
 15  k_40        30245 non-null  object
 16  k_finish    30245 non-null  object
dtypes: object(17)
memory usage: 3.9+ MB


In [12]:
df_bos_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_bos_full = expand_splits(df_bos_full)
# Drop the splits Lists.
df_bos_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [13]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30252 entries, 0 to 30251
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          30252 non-null  object
 1   gender          30252 non-null  object
 2   finish          26617 non-null  object
 3   idp             30252 non-null  object
 4   age_cat         30252 non-null  object
 5   race_state      30252 non-null  object
 6   last_split      27166 non-null  object
 7   k_5_time        30245 non-null  object
 8   k_5_pace        30245 non-null  object
 9   k_5_speed       30245 non-null  object
 10  k_10_time       30245 non-null  object
 11  k_10_pace       30245 non-null  object
 12  k_10_speed      30245 non-null  object
 13  k_15_time       30245 non-null  object
 14  k_15_pace       30245 non-null  object
 15  k_15_speed      30245 non-null  object
 16  k_20_time       30245 non-null  object
 17  k_20_pace       30245 non-null  object
 18  k_20_s

In [14]:
df_bos_full.head()

Unnamed: 0,run_no,gender,finish,idp,age_cat,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,02:12:42,999999117A732600000B9A53,18-39,finished,Finish Net,00:14:44,04:45,12.66,...,11.75,01:47:59,05:09,11.65,02:04:58,05:28,10.98,02:12:42,05:41,10.58
1,10,M,02:10:49,999999117A732600000B9A4E,18-39,finished,Finish Net,00:14:43,04:45,12.67,...,11.74,01:47:59,05:09,11.67,02:03:27,04:59,12.05,02:10:49,05:24,11.11
2,100,M,,999999117A732600000B84DD,60-64,not started,,'-','-',-,...,-,'-','-',-,'-','-',-,'-','-',-
3,1000,M,,999999117A732600000B9AB6,50-54,not started,,'-','-',-,...,-,'-','-',-,'-','-',-,'-','-',-
4,10000,M,03:14:12,999999117A732600000B93A7,18-39,finished,Finish Net,00:23:17,07:30,8.01,...,8.17,02:40:05,07:38,7.86,03:04:02,07:43,7.79,03:14:12,07:28,8.05


In [15]:
df_bos_full.to_csv(bos_data_path+f"/{BOS_NAME}2015_full.csv", index=False)

In [19]:
del df_bos_full, df_bos_res, df_bos_splits, bos_data_path

### 2016

In [69]:
bos_data_path = f"Marathons_Data/Raw/Boston/Boston{YEAR_16}"

#### Results Pages

In [70]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
bos_pages_urls, bos_res_settings = boston.gen_res_scrap_info(YEAR_16, BOS_NUM_RESULTS, BOS_RES_FIELDS, 
                                                             bos_data_path, show_settings=True)

Men Pages: 17 || Women Pages: 15
Boston 2016 total results pages: 32
Example URLs: 
 https://results.baa.org/2016/?pid=search&pidp=start&page=1&sex=M&num_results=1000&event=R 
 https://results.baa.org/2016/?pid=search&pidp=start&page=1&sex=W&num_results=1000&event=R
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2016/Boston2016_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1417, bos_pages_urls, bos_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: bfe2e82c12d0adfb
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (30743 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2016/Boston2016_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 9102,
 'downloader/request_count': 32,
 'downloader/request_method_count/GET': 32,
 'downloader/response_bytes': 2200290,
 'downloader/response_count': 32,
 'downloader/response_status_count/200': 32,
 'elapsed_time_seconds': 9.049999,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 20, 15, 50, 18, 184019),
 'httpcompression/response_bytes': 24161538,
 'httpcompression/response_count': 32,
 'item_scraped_count': 30743,
 'log_count/INFO': 12,
 'memusage/max': 382533632,
 'memusage/startup': 382533632,
 'response_received_count': 32,
 'scheduler/dequeued': 32,
 'scheduler/dequeued/memory': 32,
 'scheduler/enqueued': 32,
 'scheduler/enqueued/memory': 32,
 'start

In [24]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2016_res.csv")

In [25]:
df_bos_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,7384,M,03:49:49,999999117A732600000CF738
1,14528,M,,999999117A732600000D270A
2,2064,M,,999999117A732600000D4DEC
3,6801,M,03:37:22,999999117A732600000D2009
4,20302,M,,999999117A732600000D16E6


In [26]:
df_bos_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30743 entries, 0 to 30742
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  30743 non-null  object
 1   gender  30743 non-null  object
 2   finish  26648 non-null  object
 3   idp     30743 non-null  object
dtypes: object(4)
memory usage: 960.8+ KB


#### Splits Pages

In [71]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2016_res.csv")
bos_splits_urls, bos_splits_settings = boston.gen_splits_scrap_info(YEAR_16, df_bos_res["idp"].to_list(), BOS_SPLITS_FIELDS, 
                                                                    bos_data_path, show_settings=True)

Boston 2016 total splits pages: 30743
Example URLs: 
 https://results.baa.org/2016/?content=detail&fpid=search&pid=search&idp=999999117A732600000CF738&lang=EN&event=R 
 https://results.baa.org/2016/?content=detail&fpid=search&pid=search&idp=999999117A732600000D048C&lang=EN&event=R
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2016/Boston2016_splits.csv': {'format': 'csv', 'fields': ['idp', 'age_cat', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1417, urls=bos_splits_urls, settings=bos_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 46d0717c1286b8f5
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9590 pages (at 9590 pages/min), scraped 9567 items (at 9567 items/min)
INFO: Crawled 19503 pages (at 9913 pages/min), scraped 19483 items (at 9916 items/min)
INFO: Crawled 29389 pages (at 9886 pages/min), scraped 29368 items (at 9885 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (30743 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2016/Boston2016_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 9438101,
 'downloader/request_count': 30743,
 'downloader/request_method_count/GET': 30743,
 'downloader/response_bytes': 167623448,
 'downloader/response_count': 30743,
 'downloader/response_status_count/200': 30743,
 'elapsed_time_seconds': 188.343535,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 20, 16, 5, 48, 90457),
 'httpcompression/response_bytes': 514336543,
 'httpcompression/response_count'

In [10]:
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2016_splits.csv")

In [11]:
df_bos_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30743 entries, 0 to 30742
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         30743 non-null  object
 1   age_cat     30743 non-null  object
 2   race_state  30743 non-null  object
 3   last_split  27487 non-null  object
 4   k_5         30743 non-null  object
 5   k_10        30743 non-null  object
 6   k_15        30743 non-null  object
 7   k_20        30743 non-null  object
 8   k_half      30743 non-null  object
 9   k_25        30743 non-null  object
 10  k_30        30743 non-null  object
 11  k_35        30743 non-null  object
 12  k_40        30743 non-null  object
 13  k_finish    30743 non-null  object
dtypes: object(14)
memory usage: 3.3+ MB


#### Full Raw Dataset for Boston 2016

In [12]:
df_bos_res    = pd.read_csv(bos_data_path+f"/{BOS_NAME}2016_res.csv")
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2016_splits.csv")
df_bos_full   = pd.merge(df_bos_res, df_bos_splits, on="idp")

In [13]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30743 entries, 0 to 30742
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      30743 non-null  object
 1   gender      30743 non-null  object
 2   finish      26648 non-null  object
 3   idp         30743 non-null  object
 4   age_cat     30743 non-null  object
 5   race_state  30743 non-null  object
 6   last_split  27487 non-null  object
 7   k_5         30743 non-null  object
 8   k_10        30743 non-null  object
 9   k_15        30743 non-null  object
 10  k_20        30743 non-null  object
 11  k_half      30743 non-null  object
 12  k_25        30743 non-null  object
 13  k_30        30743 non-null  object
 14  k_35        30743 non-null  object
 15  k_40        30743 non-null  object
 16  k_finish    30743 non-null  object
dtypes: object(17)
memory usage: 4.0+ MB


In [14]:
df_bos_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_bos_full = expand_splits(df_bos_full)
# Drop the splits Lists.
df_bos_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [15]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30743 entries, 0 to 30742
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          30743 non-null  object
 1   gender          30743 non-null  object
 2   finish          26648 non-null  object
 3   idp             30743 non-null  object
 4   age_cat         30743 non-null  object
 5   race_state      30743 non-null  object
 6   last_split      27487 non-null  object
 7   k_5_time        30743 non-null  object
 8   k_5_pace        30743 non-null  object
 9   k_5_speed       30743 non-null  object
 10  k_10_time       30743 non-null  object
 11  k_10_pace       30743 non-null  object
 12  k_10_speed      30743 non-null  object
 13  k_15_time       30743 non-null  object
 14  k_15_pace       30743 non-null  object
 15  k_15_speed      30743 non-null  object
 16  k_20_time       30743 non-null  object
 17  k_20_pace       30743 non-null  object
 18  k_20_s

In [16]:
df_bos_full.head()

Unnamed: 0,run_no,gender,finish,idp,age_cat,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,02:13:32,999999117A732600000D4C2A,18-39,finished,Finish Net,00:15:47,05:05,11.81,...,12.40,01:50:24,05:12,11.57,02:05:59,05:01,11.96,02:13:32,05:33,10.84
1,10,M,,999999117A732600000D4C30,18-39,started,25K,00:15:46,05:05,11.83,...,-,-,-,-,-,-,-,-,-,-
2,100,M,,999999117A732600000CF16D,60-64,not started,,'-','-',-,...,-,'-','-',-,'-','-',-,'-','-',-
3,1000,M,03:07:33,999999117A732600000D5931,18-39,finished,Finish Net,00:19:23,06:15,9.62,...,7.23,02:32:35,07:38,7.87,02:57:08,07:55,7.59,03:07:33,07:39,7.86
4,10001,W,05:45:28,999999117A732600000D17BD,18-39,finished,Finish Net,00:34:54,11:14,5.34,...,4.14,04:44:01,14:47,4.06,05:26:49,13:47,4.36,05:45:28,13:41,4.39


In [18]:
df_bos_full.to_csv(bos_data_path+f"/{BOS_NAME}2016_full.csv", index=False)

In [19]:
del df_bos_full, df_bos_res, df_bos_splits, bos_data_path

### 2017

In [72]:
bos_data_path = f"Marathons_Data/Raw/Boston/Boston{YEAR_17}"

#### Results Pages

In [73]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
bos_pages_urls, bos_res_settings = boston.gen_res_scrap_info(YEAR_17, BOS_NUM_RESULTS, BOS_RES_FIELDS, 
                                                             bos_data_path, show_settings=True)

Men Pages: 17 || Women Pages: 14
Boston 2017 total results pages: 31
Example URLs: 
 https://results.baa.org/2017/?pid=search&pidp=start&page=1&sex=M&num_results=1000&event=R 
 https://results.baa.org/2017/?pid=search&pidp=start&page=1&sex=W&num_results=1000&event=R
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2017/Boston2017_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1417, bos_pages_urls, bos_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 61e22436f8b8f7ba


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (30074 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2017/Boston2017_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8817,
 'downloader/request_count': 31,
 'downloader/request_method_count/GET': 31,
 'downloader/response_bytes': 2169848,
 'downloader/response_count': 31,
 'downloader/response_status_count/200': 31,
 'elapsed_time_seconds': 8.107254,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 20, 16, 39, 32, 993779),
 'httpcompression/response_bytes': 23642865,
 'httpcompression/response_count': 31,
 'item_scraped_count': 30074,
 'log_count/INFO': 12,
 'memusage/max': 331382784,
 'memusage/startup': 331382784,
 'response_received_count': 31,
 'scheduler/dequeued': 31,
 'scheduler/dequeued/memory': 31,
 'scheduler/enqueued': 31,
 'scheduler/enqueued/memory': 31,
 'start

In [23]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2017_res.csv")

In [24]:
df_bos_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,22224,M,03:51:31,999999117A732600000F13B5
1,12140,M,04:10:22,999999117A732600000EC50C
2,27473,M,05:54:24,999999117A732600000EE52E
3,3733,M,03:39:29,999999117A732600000EBC6A
4,26135,M,03:53:22,999999117A732600000EEEE0


In [25]:
df_bos_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30074 entries, 0 to 30073
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  30074 non-null  object
 1   gender  30074 non-null  object
 2   finish  26413 non-null  object
 3   idp     30074 non-null  object
dtypes: object(4)
memory usage: 939.9+ KB


#### Splits Pages

In [74]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2017_res.csv")
bos_splits_urls, bos_splits_settings = boston.gen_splits_scrap_info(YEAR_17, df_bos_res["idp"].to_list(), BOS_SPLITS_FIELDS, 
                                                                    bos_data_path, show_settings=True)

Boston 2017 total splits pages: 30074
Example URLs: 
 https://results.baa.org/2017/?content=detail&fpid=search&pid=search&idp=999999117A732600000F13B5&lang=EN&event=R 
 https://results.baa.org/2017/?content=detail&fpid=search&pid=search&idp=999999117A732600000F13D5&lang=EN&event=R
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2017/Boston2017_splits.csv': {'format': 'csv', 'fields': ['idp', 'age_cat', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1417, urls=bos_splits_urls, settings=bos_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: aa57b28cc03f1579


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 9134 pages (at 9134 pages/min), scraped 9108 items (at 9108 items/min)
INFO: Crawled 18411 pages (at 9277 pages/min), scraped 18388 items (at 9280 items/min)
INFO: Crawled 27366 pages (at 8955 pages/min), scraped 27350 items (at 8962 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (30074 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2017/Boston2017_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 9232718,
 'downloader/request_count': 30074,
 'downloader/request_method_count/GET': 30074,
 'downloader/response_bytes': 164129284,
 'downloader/response_count': 30074,
 'downloader/response_status_count/200': 30074,
 'elapsed_time_seconds': 199.126759,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 20, 16, 45, 9, 817488),
 'httpcompression/response_bytes': 503347129,
 'httpcompression/response_count

In [28]:
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2017_splits.csv")

In [29]:
df_bos_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30074 entries, 0 to 30073
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         30074 non-null  object
 1   age_cat     30074 non-null  object
 2   race_state  30074 non-null  object
 3   last_split  27220 non-null  object
 4   k_5         30074 non-null  object
 5   k_10        30074 non-null  object
 6   k_15        30074 non-null  object
 7   k_20        30074 non-null  object
 8   k_half      30074 non-null  object
 9   k_25        30074 non-null  object
 10  k_30        30074 non-null  object
 11  k_35        30074 non-null  object
 12  k_40        30074 non-null  object
 13  k_finish    30074 non-null  object
dtypes: object(14)
memory usage: 3.2+ MB


#### Full Raw Dataset for Boston 2017

In [30]:
df_bos_res    = pd.read_csv(bos_data_path+f"/{BOS_NAME}2017_res.csv")
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2017_splits.csv")
df_bos_full   = pd.merge(df_bos_res, df_bos_splits, on="idp")

In [31]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30074 entries, 0 to 30073
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      30074 non-null  object
 1   gender      30074 non-null  object
 2   finish      26413 non-null  object
 3   idp         30074 non-null  object
 4   age_cat     30074 non-null  object
 5   race_state  30074 non-null  object
 6   last_split  27220 non-null  object
 7   k_5         30074 non-null  object
 8   k_10        30074 non-null  object
 9   k_15        30074 non-null  object
 10  k_20        30074 non-null  object
 11  k_half      30074 non-null  object
 12  k_25        30074 non-null  object
 13  k_30        30074 non-null  object
 14  k_35        30074 non-null  object
 15  k_40        30074 non-null  object
 16  k_finish    30074 non-null  object
dtypes: object(17)
memory usage: 3.9+ MB


In [32]:
df_bos_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_bos_full = expand_splits(df_bos_full)
# Drop the splits Lists.
df_bos_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [33]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30074 entries, 0 to 30073
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          30074 non-null  object
 1   gender          30074 non-null  object
 2   finish          26413 non-null  object
 3   idp             30074 non-null  object
 4   age_cat         30074 non-null  object
 5   race_state      30074 non-null  object
 6   last_split      27220 non-null  object
 7   k_5_time        30074 non-null  object
 8   k_5_pace        30074 non-null  object
 9   k_5_speed       30074 non-null  object
 10  k_10_time       30074 non-null  object
 11  k_10_pace       30074 non-null  object
 12  k_10_speed      30074 non-null  object
 13  k_15_time       30074 non-null  object
 14  k_15_pace       30074 non-null  object
 15  k_15_speed      30074 non-null  object
 16  k_20_time       30074 non-null  object
 17  k_20_pace       30074 non-null  object
 18  k_20_s

In [34]:
df_bos_full.head()

Unnamed: 0,run_no,gender,finish,idp,age_cat,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,,999999117A732600000EF87C,18-39,started,30K,00:15:25,04:58,12.09,...,11.63,-,-,-,-,-,-,-,-,-
1,10,M,02:18:14,999999117A732600000EF881,18-39,finished,Finish Net,00:15:24,04:58,12.11,...,10.87,01:51:50,05:39,10.62,02:09:58,05:51,10.28,02:18:14,06:04,9.90
2,100,M,,999999117A732600000EA111,60-64,not started,,'-','-',-,...,-,'-','-',-,'-','-',-,'-','-',-
3,1000,W,03:11:26,999999117A732600000EA5AB,18-39,finished,Finish Net,00:20:48,06:42,8.96,...,7.87,02:36:15,08:01,7.49,03:00:24,07:47,7.72,03:11:26,08:06,7.42
4,10000,M,03:28:16,999999117A732600000EBE32,45-49,finished,Finish Net,00:22:56,07:23,8.13,...,7.40,02:47:02,08:47,6.83,03:17:13,09:43,6.18,03:28:16,08:06,7.41


In [35]:
df_bos_full.to_csv(bos_data_path+f"/{BOS_NAME}2017_full.csv", index=False)

In [36]:
del df_bos_full, df_bos_res, df_bos_splits, bos_data_path

### 2018

In [55]:
bos_data_path = f"Marathons_Data/Raw/Boston/Boston{YEAR_18}"

#### Results Pages

In [56]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
bos_pages_urls, bos_res_settings = boston.gen_res_scrap_info(YEAR_18, BOS_NUM_RESULTS, BOS_RES_FIELDS, 
                                                             bos_data_path, show_settings=True)

Men Pages: 17 || Women Pages: 14
Boston 2018 total results pages: 31
Example URLs: 
 https://results.baa.org/2018/?pid=search&pidp=start&page=1&sex=M&num_results=1000&event=R 
 https://results.baa.org/2018/?pid=search&pidp=start&page=1&sex=W&num_results=1000&event=R
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2018/Boston2018_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [57]:
run_spider(boston_spiders.Boston1823, bos_pages_urls, bos_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 85d689b5fd411b4e


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (29978 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2018/Boston2018_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8817,
 'downloader/request_count': 31,
 'downloader/request_method_count/GET': 31,
 'downloader/response_bytes': 3259448,
 'downloader/response_count': 31,
 'downloader/response_status_count/200': 31,
 'elapsed_time_seconds': 29.584118,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 34, 45, 928554),
 'httpcompression/response_bytes': 64895161,
 'httpcompression/response_count': 31,
 'item_scraped_count': 29978,
 'log_count/INFO': 12,
 'memusage/max': 686145536,
 'memusage/startup': 686145536,
 'response_received_count': 31,
 'scheduler/dequeued': 31,
 'scheduler/dequeued/memory': 31,
 'scheduler/enqueued': 31,
 'scheduler/enqueued/memory': 31,
 'star

In [11]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2018_res.csv")

In [12]:
df_bos_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,14836,M,03:14:44,999999117A7326000010D4EA
1,2491,M,02:45:34,999999117A73260000107DF7
2,515,M,02:42:07,999999117A7326000010C532
3,22268,M,03:54:42,999999117A73260000107174
4,7033,M,03:24:47,999999117A7326000010B9B6


In [20]:
df_bos_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29978 entries, 0 to 29977
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  29978 non-null  object
 1   gender  29978 non-null  object
 2   finish  25752 non-null  object
 3   idp     29978 non-null  object
dtypes: object(4)
memory usage: 936.9+ KB


#### Splits Pages

In [58]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2018_res.csv")
bos_splits_urls, bos_splits_settings = boston.gen_splits_scrap_info(YEAR_18, df_bos_res["idp"].to_list(), BOS_SPLITS_FIELDS, 
                                                                    bos_data_path, show_settings=True)

Boston 2018 total splits pages: 29978
Example URLs: 
 https://results.baa.org/2018/?content=detail&fpid=search&pid=search&idp=999999117A7326000010C0F2&lang=EN&event=R 
 https://results.baa.org/2018/?content=detail&fpid=search&pid=search&idp=999999117A7326000010B4B3&lang=EN&event=R
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2018/Boston2018_splits.csv': {'format': 'csv', 'fields': ['idp', 'age_cat', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1823, urls=bos_splits_urls, settings=bos_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 1334d9ef42d219d7
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (100 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2018/Boston2018_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 30700,
 'downloader/request_count': 100,
 'downloader/request_method_count/GET': 100,
 'downloader/response_bytes': 727731,
 'downloader/response_count': 100,
 'downloader/response_status_count/200': 100,
 'elapsed_time_seconds': 1.797058,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 35, 6, 421686),
 'httpcompression/response_bytes': 2448337,
 'httpcompression/response_count': 100,
 'item_scraped_count': 100,
 'log_count/INFO': 12,
 'memusage/max': 867778560,
 'memusage/startup': 867778560,
 'response_received_count': 100,
 'scheduler/dequeued': 100,
 'scheduler/dequeued/memory': 100,
 'scheduler/enqueued': 100,
 'scheduler/enqueued/memory': 100,


In [12]:
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2018_splits.csv")

In [13]:
df_bos_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29978 entries, 0 to 29977
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         29978 non-null  object
 1   age_cat     29978 non-null  object
 2   race_state  29955 non-null  object
 3   last_split  26925 non-null  object
 4   k_5         29978 non-null  object
 5   k_10        29978 non-null  object
 6   k_15        29978 non-null  object
 7   k_20        29978 non-null  object
 8   k_half      29978 non-null  object
 9   k_25        29978 non-null  object
 10  k_30        29978 non-null  object
 11  k_35        29978 non-null  object
 12  k_40        29978 non-null  object
 13  k_finish    29978 non-null  object
dtypes: object(14)
memory usage: 3.2+ MB


#### Full Raw Dataset for Boston 2018

In [18]:
df_bos_res    = pd.read_csv(bos_data_path+f"/{BOS_NAME}2018_res.csv")
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2018_splits.csv")
df_bos_full   = pd.merge(df_bos_res, df_bos_splits, on="idp")

In [19]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29978 entries, 0 to 29977
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      29978 non-null  object
 1   gender      29978 non-null  object
 2   finish      25752 non-null  object
 3   idp         29978 non-null  object
 4   age_cat     29978 non-null  object
 5   race_state  29955 non-null  object
 6   last_split  26925 non-null  object
 7   k_5         29978 non-null  object
 8   k_10        29978 non-null  object
 9   k_15        29978 non-null  object
 10  k_20        29978 non-null  object
 11  k_half      29978 non-null  object
 12  k_25        29978 non-null  object
 13  k_30        29978 non-null  object
 14  k_35        29978 non-null  object
 15  k_40        29978 non-null  object
 16  k_finish    29978 non-null  object
dtypes: object(17)
memory usage: 3.9+ MB


In [20]:
df_bos_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_bos_full = expand_splits(df_bos_full)
# Drop the splits Lists.
df_bos_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [21]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29978 entries, 0 to 29977
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          29978 non-null  object
 1   gender          29978 non-null  object
 2   finish          25752 non-null  object
 3   idp             29978 non-null  object
 4   age_cat         29978 non-null  object
 5   race_state      29955 non-null  object
 6   last_split      26925 non-null  object
 7   k_5_time        29978 non-null  object
 8   k_5_pace        29978 non-null  object
 9   k_5_speed       29978 non-null  object
 10  k_10_time       29978 non-null  object
 11  k_10_pace       29978 non-null  object
 12  k_10_speed      29978 non-null  object
 13  k_15_time       29978 non-null  object
 14  k_15_pace       29978 non-null  object
 15  k_15_speed      29978 non-null  object
 16  k_20_time       29978 non-null  object
 17  k_20_pace       29978 non-null  object
 18  k_20_s

In [23]:
df_bos_full.head()

Unnamed: 0,run_no,gender,finish,idp,age_cat,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,02:18:23,999999117A7326000010C08A,18-39,Finished,Finish Net,00:15:07,04:52,12.33,...,11.95,01:50:49,04:57,12.13,02:08:22,05:39,10.62,02:18:23,07:10,8.38
1,10,M,,999999117A7326000010C0A0,18-39,Not Started,,-,-,-,...,-,-,-,-,-,-,-,-,-,-
2,100,M,,999999117A7326000010A61F,60-64,Not Started,,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,1000,M,05:37:39,999999117A7326000010CFDC,45-49,Finished,Finish Net,00:18:57,06:06,9.84,...,-,03:51:39,14:53,4.03,05:10:09,25:16,2.38,05:37:39,19:34,3.07
4,10000,M,,999999117A73260000107F1D,45-49,Not Started,,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [24]:
df_bos_full.to_csv(bos_data_path+f"/{BOS_NAME}2018_full.csv", index=False)

In [None]:
del df_bos_full, df_bos_res, df_bos_splits, bos_data_path

### 2019

In [78]:
bos_data_path = f"Marathons_Data/Raw/Boston/Boston{YEAR_19}"

#### Results Pages

In [79]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
bos_pages_urls, bos_res_settings = boston.gen_res_scrap_info(YEAR_19, BOS_NUM_RESULTS, BOS_RES_FIELDS, 
                                                             bos_data_path, show_settings=True)

Men Pages: 17 || Women Pages: 14
Boston 2019 total results pages: 31
Example URLs: 
 https://results.baa.org/2019/?pid=search&pidp=start&page=1&sex=M&num_results=1000&event=R 
 https://results.baa.org/2019/?pid=search&pidp=start&page=1&sex=W&num_results=1000&event=R
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2019/Boston2019_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1823, bos_pages_urls, bos_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 9e59ad93e36859a8
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (30234 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2019/Boston2019_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8817,
 'downloader/request_count': 31,
 'downloader/request_method_count/GET': 31,
 'downloader/response_bytes': 3291432,
 'downloader/response_count': 31,
 'downloader/response_status_count/200': 31,
 'elapsed_time_seconds': 17.033218,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 21, 16, 57, 20, 866114),
 'httpcompression/response_bytes': 65304400,
 'httpcompression/response_count': 31,
 'item_scraped_count': 30234,
 'log_count/INFO': 12,
 'memusage/max': 241385472,
 'memusage/startup': 241369088,
 'response_received_count': 31,
 'scheduler/dequeued': 31,
 'scheduler/dequeued/memory': 31,
 'scheduler/enqueued': 31,
 'scheduler/enqueued/memory': 31,
 'star

In [10]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2019_res.csv")

In [11]:
df_bos_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,2245,M,04:05:22,999999117A73270000121513
1,28431,M,04:01:42,999999117A7327000011F8B3
2,411,M,02:38:27,999999117A7327000011FA14
3,3781,M,03:13:29,999999117A73270000121E6E
4,5795,M,03:34:54,999999117A732700001266AD


In [12]:
df_bos_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30234 entries, 0 to 30233
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  30234 non-null  object
 1   gender  30234 non-null  object
 2   finish  26656 non-null  object
 3   idp     30234 non-null  object
dtypes: object(4)
memory usage: 944.9+ KB


#### Splits Pages

In [80]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2019_res.csv")
bos_splits_urls, bos_splits_settings = boston.gen_splits_scrap_info(YEAR_19, df_bos_res["idp"].to_list(), BOS_SPLITS_FIELDS, 
                                                                    bos_data_path, show_settings=True)

Boston 2019 total splits pages: 30234
Example URLs: 
 https://results.baa.org/2019/?content=detail&fpid=search&pid=search&idp=999999117A73270000121513&lang=EN&event=R 
 https://results.baa.org/2019/?content=detail&fpid=search&pid=search&idp=999999117A73270000123F5C&lang=EN&event=R
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2019/Boston2019_splits.csv': {'format': 'csv', 'fields': ['idp', 'age_cat', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1823, urls=bos_splits_urls, settings=bos_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: a5e77afd3918d047
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 8724 pages (at 8724 pages/min), scraped 8708 items (at 8708 items/min)
INFO: Crawled 17474 pages (at 8750 pages/min), scraped 17456 items (at 8748 items/min)
INFO: Crawled 26274 pages (at 8800 pages/min), scraped 26254 items (at 8798 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (30234 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2019/Boston2019_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 9281838,
 'downloader/request_count': 30234,
 'downloader/request_method_count/GET': 30234,
 'downloader/response_bytes': 220322094,
 'downloader/response_count': 30234,
 'downloader/response_status_count/200': 30234,
 'elapsed_time_seconds': 206.689874,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 21, 15, 59, 19, 653115),
 'httpcompression/response_bytes': 738276343,
 'httpcompression/response_coun

In [44]:
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2019_splits.csv")

In [45]:
df_bos_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30234 entries, 0 to 30233
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         30234 non-null  object
 1   age_cat     30234 non-null  object
 2   race_state  30234 non-null  object
 3   last_split  27339 non-null  object
 4   k_5         30234 non-null  object
 5   k_10        30234 non-null  object
 6   k_15        30234 non-null  object
 7   k_20        30234 non-null  object
 8   k_half      30234 non-null  object
 9   k_25        30234 non-null  object
 10  k_30        30234 non-null  object
 11  k_35        30234 non-null  object
 12  k_40        30234 non-null  object
 13  k_finish    30234 non-null  object
dtypes: object(14)
memory usage: 3.2+ MB


#### Full Raw Dataset for Boston 2019

In [49]:
df_bos_res    = pd.read_csv(bos_data_path+f"/{BOS_NAME}2019_res.csv")
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2019_splits.csv")
df_bos_full   = pd.merge(df_bos_res, df_bos_splits, on="idp")

In [50]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30234 entries, 0 to 30233
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      30234 non-null  object
 1   gender      30234 non-null  object
 2   finish      26656 non-null  object
 3   idp         30234 non-null  object
 4   age_cat     30234 non-null  object
 5   race_state  30234 non-null  object
 6   last_split  27339 non-null  object
 7   k_5         30234 non-null  object
 8   k_10        30234 non-null  object
 9   k_15        30234 non-null  object
 10  k_20        30234 non-null  object
 11  k_half      30234 non-null  object
 12  k_25        30234 non-null  object
 13  k_30        30234 non-null  object
 14  k_35        30234 non-null  object
 15  k_40        30234 non-null  object
 16  k_finish    30234 non-null  object
dtypes: object(17)
memory usage: 3.9+ MB


In [51]:
df_bos_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_bos_full = expand_splits(df_bos_full)
# Drop the splits Lists.
df_bos_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [52]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30234 entries, 0 to 30233
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          30234 non-null  object
 1   gender          30234 non-null  object
 2   finish          26656 non-null  object
 3   idp             30234 non-null  object
 4   age_cat         30234 non-null  object
 5   race_state      30234 non-null  object
 6   last_split      27339 non-null  object
 7   k_5_time        30234 non-null  object
 8   k_5_pace        30234 non-null  object
 9   k_5_speed       30234 non-null  object
 10  k_10_time       30234 non-null  object
 11  k_10_pace       30234 non-null  object
 12  k_10_speed      30234 non-null  object
 13  k_15_time       30234 non-null  object
 14  k_15_pace       30234 non-null  object
 15  k_15_speed      30234 non-null  object
 16  k_20_time       30234 non-null  object
 17  k_20_pace       30234 non-null  object
 18  k_20_s

In [53]:
df_bos_full.head()

Unnamed: 0,run_no,gender,finish,idp,age_cat,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,02:15:29,999999117A73270000124C3D,18-39,Finished,Finish Net,00:15:12,04:54,12.27,...,11.18,01:52:20,05:08,11.71,02:08:34,05:14,11.49,02:15:29,04:57,12.15
1,10,M,02:09:25,999999117A73270000124CA1,18-39,Finished,Finish Net,00:15:13,04:54,12.25,...,11.94,01:47:17,04:44,12.71,02:02:17,04:50,12.43,02:09:25,05:10,11.65
2,100,M,,999999117A73290000126C01,60-64,Not Started,,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,1000,M,03:16:08,999999117A73270000123F34,18-39,Finished,Finish Net,00:19:51,06:24,9.39,...,8.17,02:36:25,10:41,5.62,03:03:43,08:48,6.83,03:16:08,08:44,6.87
4,10000,W,03:25:12,999999117A73270000125F65,55-59,Finished,Finish Net,00:21:51,07:02,8.53,...,7.31,02:44:12,08:47,6.84,03:12:51,09:14,6.51,03:25:12,08:48,6.82


In [54]:
df_bos_full.to_csv(bos_data_path+f"/{BOS_NAME}2019_full.csv", index=False)

In [55]:
del df_bos_full, df_bos_res, df_bos_splits, bos_data_path

### 2021

In [81]:
bos_data_path = f"Marathons_Data/Raw/Boston/Boston{YEAR_21}"

#### Results Pages

In [82]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
bos_pages_urls, bos_res_settings = boston.gen_res_scrap_info(YEAR_21, BOS_NUM_RESULTS, BOS_RES_FIELDS, 
                                                             bos_data_path, show_settings=True)

Men Pages: 10 || Women Pages: 9
Boston 2021 total results pages: 19
Example URLs: 
 https://results.baa.org/2021/?pid=search&pidp=start&page=1&sex=M&num_results=1000&event=R 
 https://results.baa.org/2021/?pid=search&pidp=start&page=1&sex=W&num_results=1000&event=R
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2021/Boston2021_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1823, bos_pages_urls, bos_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 204b27ccd4861695


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (18074 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2021/Boston2021_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 5397,
 'downloader/request_count': 19,
 'downloader/request_method_count/GET': 19,
 'downloader/response_bytes': 1884738,
 'downloader/response_count': 19,
 'downloader/response_status_count/200': 19,
 'elapsed_time_seconds': 15.104074,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 21, 16, 58, 9, 122743),
 'httpcompression/response_bytes': 36248627,
 'httpcompression/response_count': 19,
 'item_scraped_count': 18074,
 'log_count/INFO': 12,
 'memusage/max': 602669056,
 'memusage/startup': 602669056,
 'response_received_count': 19,
 'scheduler/dequeued': 19,
 'scheduler/dequeued/memory': 19,
 'scheduler/enqueued': 19,
 'scheduler/enqueued/memory': 19,
 'start

In [16]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2021_res.csv")

In [17]:
df_bos_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,16574,M,05:57:22,9TGHS6FF144DBE
1,14005,M,,9TGHS6FF1440CE
2,19590,M,03:56:31,9TGHS6FF145F55
3,12109,M,,9TGHS6FF14706A
4,10969,M,03:34:43,9TGHS6FF1477CB


In [18]:
df_bos_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18074 entries, 0 to 18073
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  18074 non-null  int64 
 1   gender  18074 non-null  object
 2   finish  15401 non-null  object
 3   idp     18074 non-null  object
dtypes: int64(1), object(3)
memory usage: 564.9+ KB


#### Splits Pages

In [83]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2021_res.csv")
bos_splits_urls, bos_splits_settings = boston.gen_splits_scrap_info(YEAR_21, df_bos_res["idp"].to_list(), BOS_SPLITS_FIELDS, 
                                                                    bos_data_path, show_settings=True)

Boston 2021 total splits pages: 18074
Example URLs: 
 https://results.baa.org/2021/?content=detail&fpid=search&pid=search&idp=9TGHS6FF144DBE&lang=EN&event=R 
 https://results.baa.org/2021/?content=detail&fpid=search&pid=search&idp=9TGHS6FF1470EA&lang=EN&event=R
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2021/Boston2021_splits.csv': {'format': 'csv', 'fields': ['idp', 'age_cat', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1823, urls=bos_splits_urls, settings=bos_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: c6cd274598660d18
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 8151 pages (at 8151 pages/min), scraped 8135 items (at 8135 items/min)
INFO: Crawled 16370 pages (at 8219 pages/min), scraped 16354 items (at 8219 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (18074 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2021/Boston2021_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 5367978,
 'downloader/request_count': 18074,
 'downloader/request_method_count/GET': 18074,
 'downloader/response_bytes': 134978979,
 'downloader/response_count': 18074,
 'downloader/response_status_count/200': 18074,
 'elapsed_time_seconds': 132.738735,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 21, 16, 19, 33, 416369),
 'httpcompression/response_bytes': 454228728,
 'httpcompression/response_count': 18074,
 'item_scraped_count': 18074,
 'log_count/INFO': 14,
 'memusage/max': 843579

In [65]:
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2021_splits.csv")

In [66]:
df_bos_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18074 entries, 0 to 18073
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         18074 non-null  object
 1   age_cat     18074 non-null  object
 2   race_state  18074 non-null  object
 3   last_split  15648 non-null  object
 4   k_5         18074 non-null  object
 5   k_10        18074 non-null  object
 6   k_15        18074 non-null  object
 7   k_20        18074 non-null  object
 8   k_half      18074 non-null  object
 9   k_25        18074 non-null  object
 10  k_30        18074 non-null  object
 11  k_35        18074 non-null  object
 12  k_40        18074 non-null  object
 13  k_finish    18074 non-null  object
dtypes: object(14)
memory usage: 1.9+ MB


#### Full Raw Dataset for Boston 2021

In [19]:
df_bos_res    = pd.read_csv(bos_data_path+f"/{BOS_NAME}2021_res.csv")
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2021_splits.csv")
df_bos_full   = pd.merge(df_bos_res, df_bos_splits, on="idp")

In [20]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18074 entries, 0 to 18073
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      18074 non-null  int64 
 1   gender      18074 non-null  object
 2   finish      15401 non-null  object
 3   idp         18074 non-null  object
 4   age_cat     18074 non-null  object
 5   race_state  18074 non-null  object
 6   last_split  15648 non-null  object
 7   k_5         18074 non-null  object
 8   k_10        18074 non-null  object
 9   k_15        18074 non-null  object
 10  k_20        18074 non-null  object
 11  k_half      18074 non-null  object
 12  k_25        18074 non-null  object
 13  k_30        18074 non-null  object
 14  k_35        18074 non-null  object
 15  k_40        18074 non-null  object
 16  k_finish    18074 non-null  object
dtypes: int64(1), object(16)
memory usage: 2.3+ MB


In [21]:
df_bos_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_bos_full = expand_splits(df_bos_full)
# Drop the splits Lists.
df_bos_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [22]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18074 entries, 0 to 18073
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          18074 non-null  int64 
 1   gender          18074 non-null  object
 2   finish          15401 non-null  object
 3   idp             18074 non-null  object
 4   age_cat         18074 non-null  object
 5   race_state      18074 non-null  object
 6   last_split      15648 non-null  object
 7   k_5_time        18074 non-null  object
 8   k_5_pace        18074 non-null  object
 9   k_5_speed       18074 non-null  object
 10  k_10_time       18074 non-null  object
 11  k_10_pace       18074 non-null  object
 12  k_10_speed      18074 non-null  object
 13  k_15_time       18074 non-null  object
 14  k_15_pace       18074 non-null  object
 15  k_15_speed      18074 non-null  object
 16  k_20_time       18074 non-null  object
 17  k_20_pace       18074 non-null  object
 18  k_20_s

In [23]:
df_bos_full.head()

Unnamed: 0,run_no,gender,finish,idp,age_cat,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,,9TGHS6FF145C89,18-39,Did Not Finish,25.2 Miles,00:15:28,04:59,12.05,...,11.88,01:50:16,05:36,10.73,02:09:14,06:07,9.83,-,-,-
1,2,M,02:12:11,9TGHS6FF145DFF,18-39,Finished,Finish Net,00:15:30,05:00,12.03,...,11.88,01:49:26,04:45,12.65,02:04:37,04:54,12.28,02:12:11,05:24,11.12
2,3,M,02:10:37,9TGHS6FF145C8A,18-39,Finished,Finish Net,00:15:31,05:00,12.02,...,11.88,01:49:26,04:44,12.71,02:04:08,04:44,12.68,02:10:37,04:34,13.15
3,4,M,02:09:51,9TGHS6FF145C88,18-39,Finished,Finish Net,00:15:29,04:59,12.04,...,11.89,01:49:26,04:45,12.65,02:03:31,04:32,13.24,02:09:51,04:29,13.39
4,5,M,02:11:40,9TGHS6FF145C87,18-39,Finished,Finish Net,00:15:30,05:00,12.03,...,11.88,01:49:25,04:44,12.71,02:04:35,04:53,12.29,02:11:40,05:03,11.91


In [24]:
df_bos_full.to_csv(bos_data_path+f"/{BOS_NAME}2021_full.csv", index=False)

In [25]:
del df_bos_full, df_bos_res, df_bos_splits, bos_data_path

### 2022

In [84]:
bos_data_path = f"Marathons_Data/Raw/Boston/Boston{YEAR_22}"

#### Results Pages

In [85]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
bos_pages_urls, bos_res_settings = boston.gen_res_scrap_info(YEAR_22, BOS_NUM_RESULTS, BOS_RES_FIELDS, 
                                                             bos_data_path, show_settings=True)

Men Pages: 17 || Women Pages: 13
Boston 2022 total results pages: 30
Example URLs: 
 https://results.baa.org/2022/?pid=search&pidp=start&page=1&sex=M&num_results=1000&event=R 
 https://results.baa.org/2022/?pid=search&pidp=start&page=1&sex=W&num_results=1000&event=R
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2022/Boston2022_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1823, bos_pages_urls, bos_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 8056ed0b5fcb55b0


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (28500 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2022/Boston2022_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8532,
 'downloader/request_count': 30,
 'downloader/request_method_count/GET': 30,
 'downloader/response_bytes': 2979391,
 'downloader/response_count': 30,
 'downloader/response_status_count/200': 30,
 'elapsed_time_seconds': 26.197305,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 21, 16, 59, 29, 757741),
 'httpcompression/response_bytes': 57146384,
 'httpcompression/response_count': 30,
 'item_scraped_count': 28500,
 'log_count/INFO': 12,
 'memusage/max': 602669056,
 'memusage/startup': 602669056,
 'response_received_count': 30,
 'scheduler/dequeued': 30,
 'scheduler/dequeued/memory': 30,
 'scheduler/enqueued': 30,
 'scheduler/enqueued/memory': 30,
 'star

In [29]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2022_res.csv")

In [30]:
df_bos_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,1415,M,02:51:49,9TGHS6FF164F3E
1,17499,M,03:44:23,9TGHS6FF16116F
2,11218,M,03:06:12,9TGHS6FF161E3D
3,427,M,02:28:39,9TGHS6FF16225E
4,27441,M,04:53:35,9TGHS6FF162F12


In [31]:
df_bos_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28500 entries, 0 to 28499
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  28500 non-null  int64 
 1   gender  28500 non-null  object
 2   finish  24859 non-null  object
 3   idp     28500 non-null  object
dtypes: int64(1), object(3)
memory usage: 890.8+ KB


#### Splits Pages

In [86]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2022_res.csv")
bos_splits_urls, bos_splits_settings = boston.gen_splits_scrap_info(YEAR_22, df_bos_res["idp"].to_list(), BOS_SPLITS_FIELDS, 
                                                                    bos_data_path, show_settings=True)

Boston 2022 total splits pages: 28500
Example URLs: 
 https://results.baa.org/2022/?content=detail&fpid=search&pid=search&idp=9TGHS6FF164F3E&lang=EN&event=R 
 https://results.baa.org/2022/?content=detail&fpid=search&pid=search&idp=9TGHS6FF165117&lang=EN&event=R
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2022/Boston2022_splits.csv': {'format': 'csv', 'fields': ['idp', 'age_cat', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1823, urls=bos_splits_urls, settings=bos_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: a0e0e6111f14e12b


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 8227 pages (at 8227 pages/min), scraped 8217 items (at 8217 items/min)
INFO: Crawled 16530 pages (at 8303 pages/min), scraped 16514 items (at 8297 items/min)
INFO: Crawled 24958 pages (at 8428 pages/min), scraped 24941 items (at 8427 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (28500 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2022/Boston2022_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8464500,
 'downloader/request_count': 28500,
 'downloader/request_method_count/GET': 28500,
 'downloader/response_bytes': 206514729,
 'downloader/response_count': 28500,
 'downloader/response_status_count/200': 28500,
 'elapsed_time_seconds': 205.840549,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 21, 16, 42, 13, 54187),
 'httpcompression/response_bytes': 694126858,
 'httpcompression/response_count

In [83]:
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2022_splits.csv")

In [84]:
df_bos_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28500 entries, 0 to 28499
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         28500 non-null  object
 1   age_cat     28500 non-null  object
 2   race_state  28500 non-null  object
 3   last_split  25220 non-null  object
 4   k_5         28500 non-null  object
 5   k_10        28500 non-null  object
 6   k_15        28500 non-null  object
 7   k_20        28500 non-null  object
 8   k_half      28500 non-null  object
 9   k_25        28500 non-null  object
 10  k_30        28500 non-null  object
 11  k_35        28500 non-null  object
 12  k_40        28500 non-null  object
 13  k_finish    28500 non-null  object
dtypes: object(14)
memory usage: 3.0+ MB


#### Full Raw Dataset for Boston 2022

In [32]:
df_bos_res    = pd.read_csv(bos_data_path+f"/{BOS_NAME}2022_res.csv")
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2022_splits.csv")
df_bos_full   = pd.merge(df_bos_res, df_bos_splits, on="idp")

In [33]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28500 entries, 0 to 28499
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      28500 non-null  int64 
 1   gender      28500 non-null  object
 2   finish      24859 non-null  object
 3   idp         28500 non-null  object
 4   age_cat     28500 non-null  object
 5   race_state  28500 non-null  object
 6   last_split  25220 non-null  object
 7   k_5         28500 non-null  object
 8   k_10        28500 non-null  object
 9   k_15        28500 non-null  object
 10  k_20        28500 non-null  object
 11  k_half      28500 non-null  object
 12  k_25        28500 non-null  object
 13  k_30        28500 non-null  object
 14  k_35        28500 non-null  object
 15  k_40        28500 non-null  object
 16  k_finish    28500 non-null  object
dtypes: int64(1), object(16)
memory usage: 3.7+ MB


In [34]:
df_bos_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_bos_full = expand_splits(df_bos_full)
# Drop the splits Lists.
df_bos_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [35]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28500 entries, 0 to 28499
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          28500 non-null  int64 
 1   gender          28500 non-null  object
 2   finish          24859 non-null  object
 3   idp             28500 non-null  object
 4   age_cat         28500 non-null  object
 5   race_state      28500 non-null  object
 6   last_split      25220 non-null  object
 7   k_5_time        28500 non-null  object
 8   k_5_pace        28500 non-null  object
 9   k_5_speed       28500 non-null  object
 10  k_10_time       28500 non-null  object
 11  k_10_pace       28500 non-null  object
 12  k_10_speed      28500 non-null  object
 13  k_15_time       28500 non-null  object
 14  k_15_pace       28500 non-null  object
 15  k_15_speed      28500 non-null  object
 16  k_20_time       28500 non-null  object
 17  k_20_pace       28500 non-null  object
 18  k_20_s

In [36]:
df_bos_full.head()

Unnamed: 0,run_no,gender,finish,idp,age_cat,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,02:07:27,9TGHS6FF165C93,18-39,Finished,Finish Net,00:14:58,04:49,12.46,...,12.02,01:46:33,04:29,13.4,02:00:45,04:35,13.13,02:07:27,04:51,12.40
1,3,M,,9TGHS6FF165CE8,18-39,Did Not Finish,35K,00:14:59,04:50,12.44,...,12.0,01:46:39,04:37,13.01,-,-,-,-,-,-
2,4,M,02:06:51,9TGHS6FF165CBE,18-39,Finished,Finish Net,00:14:58,04:49,12.46,...,11.99,01:46:31,04:26,13.54,02:00:26,04:29,13.40,02:06:51,04:36,13.06
3,5,M,02:07:21,9TGHS6FF165CBD,18-39,Finished,Finish Net,00:14:58,04:49,12.46,...,12.0,01:46:33,04:29,13.4,02:00:44,04:34,13.15,02:07:21,04:48,12.52
4,7,M,02:09:43,9TGHS6FF165CBF,18-39,Finished,Finish Net,00:14:59,04:50,12.44,...,12.02,01:46:37,04:34,13.14,02:02:24,05:05,11.81,02:09:43,05:14,11.47


In [37]:
df_bos_full.to_csv(bos_data_path+f"/{BOS_NAME}2022_full.csv", index=False)

In [38]:
del df_bos_full, df_bos_res, df_bos_splits, bos_data_path

### 2023

In [87]:
bos_data_path = f"Marathons_Data/Raw/Boston/Boston{YEAR_23}"

#### Results Pages

In [88]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
bos_pages_urls, bos_res_settings = boston.gen_res_scrap_info(YEAR_23, BOS_NUM_RESULTS, BOS_RES_FIELDS, 
                                                             bos_data_path, show_settings=True)

Men Pages: 18 || Women Pages: 13
Boston 2023 total results pages: 31
Example URLs: 
 https://results.baa.org/2023/?pid=search&pidp=start&page=1&sex=M&num_results=1000&event=R 
 https://results.baa.org/2023/?pid=search&pidp=start&page=1&sex=W&num_results=1000&event=R
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2023/Boston2023_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1823, bos_pages_urls, bos_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: e1707419fca172ad


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (30105 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2023/Boston2023_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8817,
 'downloader/request_count': 31,
 'downloader/request_method_count/GET': 31,
 'downloader/response_bytes': 3065458,
 'downloader/response_count': 31,
 'downloader/response_status_count/200': 31,
 'elapsed_time_seconds': 23.807689,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 21, 17, 1, 7, 495154),
 'httpcompression/response_bytes': 60116515,
 'httpcompression/response_count': 31,
 'item_scraped_count': 30105,
 'log_count/INFO': 12,
 'memusage/max': 885719040,
 'memusage/startup': 885719040,
 'response_received_count': 31,
 'scheduler/dequeued': 31,
 'scheduler/dequeued/memory': 31,
 'scheduler/enqueued': 31,
 'scheduler/enqueued/memory': 31,
 'start_

In [44]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2023_res.csv")

In [45]:
df_bos_res.head()

Unnamed: 0,run_no,gender,finish,idp
0,5446,M,02:56:40,9TGHS6FF1829D8
1,2861,M,02:41:01,9TGHS6FF17E41A
2,2300,M,02:48:59,9TGHS6FF181F7F
3,25447,M,04:08:24,9TGHS6FF181C06
4,3888,M,02:51:30,9TGHS6FF182E78


In [46]:
df_bos_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30105 entries, 0 to 30104
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   run_no  30105 non-null  object
 1   gender  30105 non-null  object
 2   finish  26638 non-null  object
 3   idp     30105 non-null  object
dtypes: object(4)
memory usage: 940.9+ KB


#### Splits Pages

In [89]:
df_bos_res = pd.read_csv(bos_data_path+f"/{BOS_NAME}2023_res.csv")
bos_splits_urls, bos_splits_settings = boston.gen_splits_scrap_info(YEAR_23, df_bos_res["idp"].to_list(), BOS_SPLITS_FIELDS, 
                                                                    bos_data_path, show_settings=True)

Boston 2023 total splits pages: 30105
Example URLs: 
 https://results.baa.org/2023/?content=detail&fpid=search&pid=search&idp=9TGHS6FF1829D8&lang=EN&event=R 
 https://results.baa.org/2023/?content=detail&fpid=search&pid=search&idp=9TGHS6FF18130D&lang=EN&event=R
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2023/Boston2023_splits.csv': {'format': 'csv', 'fields': ['idp', 'age_cat', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(boston_spiders.Boston1823, urls=bos_splits_urls, settings=bos_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 9981e30cfce302d5
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 8375 pages (at 8375 pages/min), scraped 8362 items (at 8362 items/min)
INFO: Crawled 16839 pages (at 8464 pages/min), scraped 16824 items (at 8462 items/min)
INFO: Crawled 25309 pages (at 8470 pages/min), scraped 25295 items (at 8471 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (30105 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Boston/Boston2023/Boston2023_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 8941185,
 'downloader/request_count': 30105,
 'downloader/request_method_count/GET': 30105,
 'downloader/response_bytes': 231546190,
 'downloader/response_count': 30105,
 'downloader/response_status_count/200': 30105,
 'elapsed_time_seconds': 212.483682,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 21, 17, 9, 34, 249956),
 'httpcompression/response_bytes': 797701719,
 'httpcompression/response_count

In [51]:
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2023_splits.csv")

In [52]:
df_bos_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30105 entries, 0 to 30104
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         30105 non-null  object
 1   age_cat     30105 non-null  object
 2   race_state  30105 non-null  object
 3   last_split  27064 non-null  object
 4   k_5         30105 non-null  object
 5   k_10        30105 non-null  object
 6   k_15        30105 non-null  object
 7   k_20        30105 non-null  object
 8   k_half      30105 non-null  object
 9   k_25        30105 non-null  object
 10  k_30        30105 non-null  object
 11  k_35        30105 non-null  object
 12  k_40        30105 non-null  object
 13  k_finish    30105 non-null  object
dtypes: object(14)
memory usage: 3.2+ MB


#### Full Raw Dataset for Boston 2023

In [53]:
df_bos_res    = pd.read_csv(bos_data_path+f"/{BOS_NAME}2023_res.csv")
df_bos_splits = pd.read_csv(bos_data_path+f"/{BOS_NAME}2023_splits.csv")
df_bos_full   = pd.merge(df_bos_res, df_bos_splits, on="idp")

In [54]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30105 entries, 0 to 30104
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      30105 non-null  object
 1   gender      30105 non-null  object
 2   finish      26638 non-null  object
 3   idp         30105 non-null  object
 4   age_cat     30105 non-null  object
 5   race_state  30105 non-null  object
 6   last_split  27064 non-null  object
 7   k_5         30105 non-null  object
 8   k_10        30105 non-null  object
 9   k_15        30105 non-null  object
 10  k_20        30105 non-null  object
 11  k_half      30105 non-null  object
 12  k_25        30105 non-null  object
 13  k_30        30105 non-null  object
 14  k_35        30105 non-null  object
 15  k_40        30105 non-null  object
 16  k_finish    30105 non-null  object
dtypes: object(17)
memory usage: 3.9+ MB


In [55]:
df_bos_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/mile) , and speed(miles/h)
df_bos_full = expand_splits(df_bos_full)
# Drop the splits Lists.
df_bos_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [56]:
df_bos_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30105 entries, 0 to 30104
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          30105 non-null  object
 1   gender          30105 non-null  object
 2   finish          26638 non-null  object
 3   idp             30105 non-null  object
 4   age_cat         30105 non-null  object
 5   race_state      30105 non-null  object
 6   last_split      27064 non-null  object
 7   k_5_time        30105 non-null  object
 8   k_5_pace        30105 non-null  object
 9   k_5_speed       30105 non-null  object
 10  k_10_time       30105 non-null  object
 11  k_10_pace       30105 non-null  object
 12  k_10_speed      30105 non-null  object
 13  k_15_time       30105 non-null  object
 14  k_15_pace       30105 non-null  object
 15  k_15_speed      30105 non-null  object
 16  k_20_time       30105 non-null  object
 17  k_20_pace       30105 non-null  object
 18  k_20_s

In [57]:
df_bos_full.head()

Unnamed: 0,run_no,gender,finish,idp,age_cat,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,02:05:54,9TGHS6FF181D5A,18-39,Finished,Finish Net,00:14:18,04:37,13.04,...,12.17,01:44:19,04:33,13.21,01:59:14,04:52,12.35,02:05:54,04:45,12.65
1,10,M,,9TGHS6FF181D49,18-39,Started,HALF,00:14:28,04:40,12.89,...,-,-,-,-,-,-,-,-,-,-
2,100,M,,9TGHS6FF182CE1,65-69,Started,,-,-,-,...,-,-,-,-,-,-,-,-,-,-
3,1000,M,02:47:50,9TGHS6FF183CA2,18-39,Finished,Finish Net,00:19:55,06:25,9.36,...,9.19,02:18:57,06:15,9.62,02:38:57,06:22,9.45,02:47:50,06:20,9.48
4,10000,W,03:06:42,9TGHS6FF17E7F0,18-39,Finished,Finish Net,00:20:56,06:45,8.91,...,8.27,02:30:44,07:03,8.52,02:54:52,07:52,7.63,03:06:42,08:36,6.99


In [58]:
df_bos_full.to_csv(bos_data_path+f"/{BOS_NAME}2023_full.csv", index=False)

In [59]:
del df_bos_full, df_bos_res, df_bos_splits, bos_data_path

## Chicago

In [60]:
CHI_NAME = "Chicago"

# Chicago URLs Templates.
# {0}: Year || {1}: Page Number || {2}: sex || {3}: Number of results per page. || {4}: Event Id (To select only marathon)
CHICAGO_MARATHON_URL_BF_22: str = "https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group={0}&page={1}&sex={2}&num_results={3}&event={4}"
# {0}: Year || {1}: Page Number || {2}: sex || {3}: Number of results per page.
CHICAGO_MARATHON_URL_AF_22: str = "https://results.chicagomarathon.com/{0}/?pid=search&page={1}&sex={2}&num_results={3}&event=MAR"

# {0}: Event Id || {1}: runner id
CHICAGO_MARATHON_SPLIT_URL_BF_22: str = "https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event={0}&idp={1}&lang=EN_CAP&lang=EN_CAP"
# {0}: Year || {1}: runner id
CHICAGO_MARATHON_SPLIT_URL_AF_22: str = "https://results.chicagomarathon.com/{0}/?content=detail&fpid=search&pid=search&idp={1}&lang=EN_CAP"

CHI_NUM_RESULTS: str = "1000"

CHI_EVENT_IDS = {YEAR_14: "MAR_999999107FA3090000000065",
                 YEAR_15: "MAR_999999107FA3090000000079",
                 YEAR_16: "MAR_999999107FA309000000008D",
                 YEAR_17: "MAR_999999107FA30900000000A1",
                 YEAR_18: "MAR_999999107FA30900000000B5",
                 YEAR_19: "MAR_999999107FA31100000000C9",
                 YEAR_21: "MAR_9TGG9638F1"}

CHI_RES_FIELDS: list[str] = ["run_no", "gender", "age_cat", "finish", "idp"]
CHI_SPLITS_FIELDS: list[str] = ["idp", "race_state", "last_split", "k_5", "k_10", "k_15", "k_20", "k_half", "k_25", "k_30", "k_35", "k_40", "k_finish"]


In [61]:
chicago = ChicagoMarathon(url_template=CHICAGO_MARATHON_URL_BF_22, split_url_template=CHICAGO_MARATHON_SPLIT_URL_BF_22)

### 2014

In [62]:
chicago.event_id = CHI_EVENT_IDS.get(YEAR_14)
chi_data_path = f"Marathons_Data/Raw/Chicago/Chicago{YEAR_14}"

#### Results Pages

In [63]:
# Getting the URLs of the result pages and the settings that will be passed to the spider.
chi_pages_urls, chi_res_settings = chicago.gen_res_scrap_info(YEAR_14, CHI_NUM_RESULTS, CHI_RES_FIELDS, chi_data_path, show_settings=True)

Men Pages: 28 || Women Pages: 23
Chicago 2014 total results pages: 51
Example URLs: 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2014&page=1&sex=M&num_results=1000&event=MAR_999999107FA3090000000065 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2014&page=1&sex=W&num_results=1000&event=MAR_999999107FA3090000000065
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2014/Chicago2014_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'age_cat', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [64]:
run_spider(chicago_spiders.Chicago1422, urls=chi_pages_urls, settings=chi_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: c2e70066dafee2af


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (50216 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2014/Chicago2014_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 35511,
 'downloader/request_count': 102,
 'downloader/request_method_count/GET': 102,
 'downloader/response_bytes': 3320480,
 'downloader/response_count': 102,
 'downloader/response_status_count/200': 51,
 'downloader/response_status_count/301': 51,
 'elapsed_time_seconds': 29.262039,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 38, 12, 398687),
 'httpcompression/response_bytes': 65821352,
 'httpcompression/response_count': 51,
 'item_scraped_count': 50216,
 'log_count/INFO': 12,
 'memusage/max': 867778560,
 'memusage/startup': 867778560,
 'response_received_count': 51,
 'scheduler/dequeued': 102,
 'scheduler/dequeued/memory': 102,
 'scheduler/e

In [10]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2014_res.csv")

In [11]:
df_chi_res.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp
0,6463,M,25-29,03:29:27,999999107FA3090000137CA8
1,861,M,45-49,03:08:58,999999107FA309000013A53E
2,42155,M,45-49,05:08:48,999999107FA309000013B78A
3,1420,M,35-39,02:53:37,999999107FA3090000131440
4,8162,M,35-39,,999999107FA3090000139ECF


In [12]:
df_chi_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50216 entries, 0 to 50215
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   50216 non-null  int64 
 1   gender   50216 non-null  object
 2   age_cat  50216 non-null  object
 3   finish   40854 non-null  object
 4   idp      50216 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.9+ MB


#### Splits Pages

In [65]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2014_res.csv")
chi_splits_urls, chi_splits_settings = chicago.gen_splits_scrap_info(YEAR_14, df_chi_res["idp"].to_list(), CHI_SPLITS_FIELDS, 
                                                                     chi_data_path, show_settings=True, use_event_id=True)

Chicago 2014 total splits pages: 50216
Example URLs: 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA3090000000065&idp=999999107FA3090000137CA8&lang=EN_CAP&lang=EN_CAP 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA3090000000065&idp=999999107FA3090000133E20&lang=EN_CAP&lang=EN_CAP
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2014/Chicago2014_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_splits_urls, settings=chi_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 920669b9fb1458b6
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (100 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2014/Chicago2014_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 36700,
 'downloader/request_count': 100,
 'downloader/request_method_count/GET': 100,
 'downloader/response_bytes': 786102,
 'downloader/response_count': 100,
 'downloader/response_status_count/200': 100,
 'elapsed_time_seconds': 1.593524,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 27, 13, 38, 26, 757208),
 'httpcompression/response_bytes': 2843058,
 'httpcompression/response_count': 100,
 'item_scraped_count': 100,
 'log_count/INFO': 12,
 'memusage/max': 867778560,
 'memusage/startup': 867778560,
 'response_received_count': 100,
 'scheduler/dequeued': 100,
 'scheduler/dequeued/memory': 100,
 'scheduler/enqueued': 100,
 'scheduler/enqueued/memory': 1

In [11]:
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2014_splits.csv")

In [12]:
df_chi_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50216 entries, 0 to 50215
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         50216 non-null  object
 1   race_state  40849 non-null  object
 2   last_split  40849 non-null  object
 3   k_5         50216 non-null  object
 4   k_10        50216 non-null  object
 5   k_15        50216 non-null  object
 6   k_20        50216 non-null  object
 7   k_half      50216 non-null  object
 8   k_25        50216 non-null  object
 9   k_30        50216 non-null  object
 10  k_35        50216 non-null  object
 11  k_40        50216 non-null  object
 12  k_finish    50216 non-null  object
dtypes: object(13)
memory usage: 5.0+ MB


#### Full Raw Dataset for Chicago 2014

In [13]:
df_chi_res    = pd.read_csv(chi_data_path+f"/{CHI_NAME}2014_res.csv")
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2014_splits.csv")
df_chi_full   = pd.merge(df_chi_res, df_chi_splits, on="idp")

In [14]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50216 entries, 0 to 50215
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      50216 non-null  int64 
 1   gender      50216 non-null  object
 2   age_cat     50216 non-null  object
 3   finish      40854 non-null  object
 4   idp         50216 non-null  object
 5   race_state  40849 non-null  object
 6   last_split  40849 non-null  object
 7   k_5         50216 non-null  object
 8   k_10        50216 non-null  object
 9   k_15        50216 non-null  object
 10  k_20        50216 non-null  object
 11  k_half      50216 non-null  object
 12  k_25        50216 non-null  object
 13  k_30        50216 non-null  object
 14  k_35        50216 non-null  object
 15  k_40        50216 non-null  object
 16  k_finish    50216 non-null  object
dtypes: int64(1), object(16)
memory usage: 6.5+ MB


In [15]:
df_chi_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_chi_full = expand_splits(df_chi_full)
# Drop the splits Lists.
df_chi_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [16]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50216 entries, 0 to 50215
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          50216 non-null  int64 
 1   gender          50216 non-null  object
 2   age_cat         50216 non-null  object
 3   finish          40854 non-null  object
 4   idp             50216 non-null  object
 5   race_state      40849 non-null  object
 6   last_split      40849 non-null  object
 7   k_5_time        50216 non-null  object
 8   k_5_pace        50216 non-null  object
 9   k_5_speed       50216 non-null  object
 10  k_10_time       50216 non-null  object
 11  k_10_pace       50216 non-null  object
 12  k_10_speed      50216 non-null  object
 13  k_15_time       50216 non-null  object
 14  k_15_pace       50216 non-null  object
 15  k_15_speed      50216 non-null  object
 16  k_20_time       50216 non-null  object
 17  k_20_pace       50216 non-null  object
 18  k_20_s

In [17]:
df_chi_full.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,30-34,02:05:51,999999107FA309000013ADE9,Finished,Finish,00:14:44,02:57,20.36,...,19.91,01:43:34,02:58,20.29,01:58:58,03:05,19.48,02:05:51,03:08,19.13
1,2,M,25-29,02:04:11,999999107FA309000013ADEF,Finished,Finish,00:14:44,02:57,20.36,...,19.91,01:43:22,02:56,20.55,01:57:53,02:55,20.67,02:04:11,02:53,20.90
2,3,M,25-29,,999999107FA309000013ADFC,,,00:14:44,02:57,20.36,...,19.91,-,-,-,-,-,-,-,-,-
3,4,M,20-24,,999999107FA309000013ADF4,,,00:14:45,02:57,20.34,...,19.89,01:45:26,03:20,18.05,-,-,-,-,-,-
4,5,M,25-29,02:08:30,999999107FA309000013ADF1,Finished,Finish,00:14:43,02:57,20.39,...,19.89,01:43:32,02:58,20.34,02:00:11,03:20,18.02,02:08:30,03:48,15.84


In [18]:
df_chi_full.to_csv(chi_data_path+f"/{CHI_NAME}2014_full.csv", index=False)

In [19]:
del df_chi_full, df_chi_res, df_chi_splits, chi_data_path

### 2015

In [21]:
chicago.event_id = CHI_EVENT_IDS.get(YEAR_15)
chi_data_path = f"Marathons_Data/Raw/Chicago/Chicago{YEAR_15}"

#### Results Pages

In [24]:
chi_pages_urls, chi_res_settings = chicago.gen_res_scrap_info(YEAR_15, CHI_NUM_RESULTS, CHI_RES_FIELDS, chi_data_path, show_settings=True)

Men Pages: 25 || Women Pages: 22
Chicago 2015 total results pages: 47
Example URLs: 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2015&page=1&sex=M&num_results=1000&event=MAR_999999107FA3090000000079 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2015&page=1&sex=W&num_results=1000&event=MAR_999999107FA3090000000079
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2015/Chicago2015_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'age_cat', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_pages_urls, settings=chi_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 4a5089b48ccccd5d


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (46032 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2015/Chicago2015_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 32723,
 'downloader/request_count': 94,
 'downloader/request_method_count/GET': 94,
 'downloader/response_bytes': 3066693,
 'downloader/response_count': 94,
 'downloader/response_status_count/200': 47,
 'downloader/response_status_count/301': 47,
 'elapsed_time_seconds': 12.732417,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 25, 21, 15, 21, 169089),
 'httpcompression/response_bytes': 60323038,
 'httpcompression/response_count': 47,
 'item_scraped_count': 46032,
 'log_count/INFO': 12,
 'memusage/max': 438501376,
 'memusage/startup': 438501376,
 'response_received_count': 47,
 'scheduler/dequeued': 94,
 'scheduler/dequeued/memory': 94,
 'scheduler/enqueu

In [26]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2015_res.csv")

In [27]:
df_chi_res.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp
0,15401,M,30-34,04:42:17,999999107FA30900001743E9
1,26071,M,30-34,,999999107FA309000017337A
2,23574,M,30-34,04:28:24,999999107FA309000016E790
3,51087,M,45-49,05:21:25,999999107FA3090000173BDB
4,17780,M,45-49,06:43:25,999999107FA309000016D7CF


In [28]:
df_chi_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46032 entries, 0 to 46031
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   46032 non-null  int64 
 1   gender   46032 non-null  object
 2   age_cat  46032 non-null  object
 3   finish   37704 non-null  object
 4   idp      46032 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.8+ MB


#### Splits Pages

In [29]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2015_res.csv")
chi_splits_urls, chi_splits_settings = chicago.gen_splits_scrap_info(YEAR_15, df_chi_res["idp"].to_list(), CHI_SPLITS_FIELDS, 
                                                                     chi_data_path, show_settings=True, use_event_id=True)

Chicago 2015 total splits pages: 46032
Example URLs: 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA3090000000079&idp=999999107FA30900001743E9&lang=EN_CAP&lang=EN_CAP 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA3090000000079&idp=999999107FA30900001706F6&lang=EN_CAP&lang=EN_CAP
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2015/Chicago2015_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_splits_urls, settings=chi_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: da845d254acf33ac


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 8434 pages (at 8434 pages/min), scraped 8423 items (at 8423 items/min)
INFO: Crawled 16898 pages (at 8464 pages/min), scraped 16882 items (at 8459 items/min)
INFO: Crawled 25434 pages (at 8536 pages/min), scraped 25411 items (at 8529 items/min)
INFO: Crawled 34093 pages (at 8659 pages/min), scraped 34075 items (at 8664 items/min)
INFO: Crawled 42720 pages (at 8627 pages/min), scraped 42693 items (at 8618 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (46032 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2015/Chicago2015_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 16893744,
 'downloader/request_count': 46032,
 'downloader/request_method_count/GET': 46032,
 'downloader/response_bytes': 362884857,
 'downloader/response_count': 46032,
 'downloader/response_status_count/200': 46032,
 'elapsed_time_seconds': 322.817876,
 'feedexport/success_count/FileFeedStorage': 

In [31]:
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2015_splits.csv")

In [32]:
df_chi_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46032 entries, 0 to 46031
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         46032 non-null  object
 1   race_state  37642 non-null  object
 2   last_split  37642 non-null  object
 3   k_5         46032 non-null  object
 4   k_10        46032 non-null  object
 5   k_15        46032 non-null  object
 6   k_20        46032 non-null  object
 7   k_half      46032 non-null  object
 8   k_25        46032 non-null  object
 9   k_30        46032 non-null  object
 10  k_35        46032 non-null  object
 11  k_40        46032 non-null  object
 12  k_finish    46032 non-null  object
dtypes: object(13)
memory usage: 4.6+ MB


#### Full Raw Dataset for Chicago 2015

In [35]:
df_chi_res    = pd.read_csv(chi_data_path+f"/{CHI_NAME}2015_res.csv")
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2015_splits.csv")
df_chi_full   = pd.merge(df_chi_res, df_chi_splits, on="idp")

In [36]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46032 entries, 0 to 46031
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      46032 non-null  int64 
 1   gender      46032 non-null  object
 2   age_cat     46032 non-null  object
 3   finish      37704 non-null  object
 4   idp         46032 non-null  object
 5   race_state  37642 non-null  object
 6   last_split  37642 non-null  object
 7   k_5         46032 non-null  object
 8   k_10        46032 non-null  object
 9   k_15        46032 non-null  object
 10  k_20        46032 non-null  object
 11  k_half      46032 non-null  object
 12  k_25        46032 non-null  object
 13  k_30        46032 non-null  object
 14  k_35        46032 non-null  object
 15  k_40        46032 non-null  object
 16  k_finish    46032 non-null  object
dtypes: int64(1), object(16)
memory usage: 6.0+ MB


In [37]:
df_chi_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_chi_full = expand_splits(df_chi_full)
# Drop the splits Lists.
df_chi_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [38]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46032 entries, 0 to 46031
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          46032 non-null  int64 
 1   gender          46032 non-null  object
 2   age_cat         46032 non-null  object
 3   finish          37704 non-null  object
 4   idp             46032 non-null  object
 5   race_state      37642 non-null  object
 6   last_split      37642 non-null  object
 7   k_5_time        46032 non-null  object
 8   k_5_pace        46032 non-null  object
 9   k_5_speed       46032 non-null  object
 10  k_10_time       46032 non-null  object
 11  k_10_pace       46032 non-null  object
 12  k_10_speed      46032 non-null  object
 13  k_15_time       46032 non-null  object
 14  k_15_pace       46032 non-null  object
 15  k_15_speed      46032 non-null  object
 16  k_20_time       46032 non-null  object
 17  k_20_pace       46032 non-null  object
 18  k_20_s

In [39]:
df_chi_full.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,2,M,25-29,02:09:50,999999107FA30900001756C7,Finished,Finish,00:15:31,03:07,19.33,...,18.95,01:47:52,02:56,20.50,02:03:02,03:02,19.78,02:09:50,03:06,19.37
1,3,M,25-29,02:09:25,999999107FA30900001756C8,Finished,Finish,00:15:31,03:07,19.33,...,18.91,01:47:52,02:56,20.55,02:02:43,02:59,20.20,02:09:25,03:04,19.66
2,4,M,25-29,,999999107FA30900001756B9,,,-,'00:00',-,...,-,-,'00:00',-,-,'00:00',-,-,'00:00',-
3,5,M,25-29,,999999107FA30900001756BA,,,-,'00:00',-,...,-,-,'00:00',-,-,'00:00',-,-,'00:00',-
4,6,M,25-29,02:10:07,000017107FA3090000175B02,Finished,Finish,00:15:33,03:07,19.29,...,18.93,01:48:07,02:59,20.18,02:03:11,03:01,19.91,02:10:07,03:10,19.00


In [40]:
df_chi_full.to_csv(chi_data_path+f"/{CHI_NAME}2015_full.csv", index=False)

In [41]:
del df_chi_full, df_chi_res, df_chi_splits, chi_data_path

### 2016

In [45]:
chicago.event_id = CHI_EVENT_IDS.get(YEAR_16)
chi_data_path = f"Marathons_Data/Raw/Chicago/Chicago{YEAR_16}"

#### Results Pages

In [46]:
chi_pages_urls, chi_res_settings = chicago.gen_res_scrap_info(YEAR_16, CHI_NUM_RESULTS, CHI_RES_FIELDS, chi_data_path, show_settings=True)

Men Pages: 27 || Women Pages: 23
Chicago 2016 total results pages: 50
Example URLs: 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2016&page=1&sex=M&num_results=1000&event=MAR_999999107FA309000000008D 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2016&page=1&sex=W&num_results=1000&event=MAR_999999107FA309000000008D
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2016/Chicago2016_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'age_cat', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [47]:
run_spider(chicago_spiders.Chicago1422, urls=chi_pages_urls, settings=chi_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 6b8a8415915da4ed


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (49067 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2016/Chicago2016_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 34814,
 'downloader/request_count': 100,
 'downloader/request_method_count/GET': 100,
 'downloader/response_bytes': 3259656,
 'downloader/response_count': 100,
 'downloader/response_status_count/200': 50,
 'downloader/response_status_count/301': 50,
 'elapsed_time_seconds': 16.590626,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 25, 21, 40, 35, 710231),
 'httpcompression/response_bytes': 64254976,
 'httpcompression/response_count': 50,
 'item_scraped_count': 49067,
 'log_count/INFO': 12,
 'memusage/max': 652902400,
 'memusage/startup': 652902400,
 'response_received_count': 50,
 'scheduler/dequeued': 100,
 'scheduler/dequeued/memory': 100,
 'scheduler/e

In [48]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2016_res.csv")

In [49]:
df_chi_res.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp
0,22734,M,35-39,04:49:36,999999107FA3090000198CFC
1,12294,M,35-39,03:13:49,999999107FA3090000198BFB
2,31988,M,40-44,04:22:24,999999107FA309000019C753
3,24948,M,40-44,03:51:50,999999107FA30900001A0D88
4,10218,M,30-34,,999999107FA30900001A301E


In [50]:
df_chi_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49067 entries, 0 to 49066
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   49067 non-null  int64 
 1   gender   49067 non-null  object
 2   age_cat  49066 non-null  object
 3   finish   40730 non-null  object
 4   idp      49067 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.9+ MB


#### Splits Pages

In [52]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2016_res.csv")
chi_splits_urls, chi_splits_settings = chicago.gen_splits_scrap_info(YEAR_16, df_chi_res["idp"].to_list(), CHI_SPLITS_FIELDS, 
                                                                     chi_data_path, show_settings=True, use_event_id=True)

Chicago 2016 total splits pages: 49067
Example URLs: 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA309000000008D&idp=999999107FA3090000198CFC&lang=EN_CAP&lang=EN_CAP 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA309000000008D&idp=999999107FA309000019BD1C&lang=EN_CAP&lang=EN_CAP
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2016/Chicago2016_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_splits_urls, settings=chi_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 285cc684aeea43f7
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 7721 pages (at 7721 pages/min), scraped 7705 items (at 7705 items/min)
INFO: Crawled 15557 pages (at 7836 pages/min), scraped 15532 items (at 7827 items/min)
INFO: Crawled 23368 pages (at 7811 pages/min), scraped 23349 items (at 7817 items/min)
INFO: Crawled 31333 pages (at 7965 pages/min), scraped 31318 items (at 7969 items/min)
INFO: Crawled 39217 pages (at 7884 pages/min), scraped 39201 items (at 7883 items/min)
INFO: Crawled 47089 pages (at 7872 pages/min), scraped 47074 items (at 7873 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (49067 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2016/Chicago2016_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 18007589,
 'downloader/request_count': 49067,
 'downloader/request_method_count/GET': 49067,
 'downloader/response_bytes': 386623850,
 'downloader/response_count': 49067,
 'downloader/response_status_count/200': 49

In [54]:
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2016_splits.csv")

In [55]:
df_chi_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49067 entries, 0 to 49066
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         49067 non-null  object
 1   race_state  40727 non-null  object
 2   last_split  40727 non-null  object
 3   k_5         49067 non-null  object
 4   k_10        49067 non-null  object
 5   k_15        49067 non-null  object
 6   k_20        49067 non-null  object
 7   k_half      49067 non-null  object
 8   k_25        49067 non-null  object
 9   k_30        49067 non-null  object
 10  k_35        49067 non-null  object
 11  k_40        49067 non-null  object
 12  k_finish    49067 non-null  object
dtypes: object(13)
memory usage: 4.9+ MB


#### Full Raw Dataset for Chicago 2016

In [56]:
df_chi_res    = pd.read_csv(chi_data_path+f"/{CHI_NAME}2016_res.csv")
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2016_splits.csv")
df_chi_full   = pd.merge(df_chi_res, df_chi_splits, on="idp")

In [57]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49067 entries, 0 to 49066
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      49067 non-null  int64 
 1   gender      49067 non-null  object
 2   age_cat     49066 non-null  object
 3   finish      40730 non-null  object
 4   idp         49067 non-null  object
 5   race_state  40727 non-null  object
 6   last_split  40727 non-null  object
 7   k_5         49067 non-null  object
 8   k_10        49067 non-null  object
 9   k_15        49067 non-null  object
 10  k_20        49067 non-null  object
 11  k_half      49067 non-null  object
 12  k_25        49067 non-null  object
 13  k_30        49067 non-null  object
 14  k_35        49067 non-null  object
 15  k_40        49067 non-null  object
 16  k_finish    49067 non-null  object
dtypes: int64(1), object(16)
memory usage: 6.4+ MB


In [58]:
df_chi_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_chi_full = expand_splits(df_chi_full)
# Drop the splits Lists.
df_chi_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [59]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49067 entries, 0 to 49066
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          49067 non-null  int64 
 1   gender          49067 non-null  object
 2   age_cat         49066 non-null  object
 3   finish          40730 non-null  object
 4   idp             49067 non-null  object
 5   race_state      40727 non-null  object
 6   last_split      40727 non-null  object
 7   k_5_time        49067 non-null  object
 8   k_5_pace        49067 non-null  object
 9   k_5_speed       49067 non-null  object
 10  k_10_time       49067 non-null  object
 11  k_10_pace       49067 non-null  object
 12  k_10_speed      49067 non-null  object
 13  k_15_time       49067 non-null  object
 14  k_15_pace       49067 non-null  object
 15  k_15_speed      49067 non-null  object
 16  k_20_time       49067 non-null  object
 17  k_20_pace       49067 non-null  object
 18  k_20_s

In [60]:
df_chi_full.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,25-29,02:11:26,999999107FA30900001A5A1D,Finished,Finish,00:16:07,03:14,18.61,...,18.48,01:50:11,03:03,19.72,02:04:56,02:58,20.34,02:11:26,02:58,20.26
1,2,M,19 and under,,999999107FA30900001A58EE,,,-,'00:00',-,...,-,-,'00:00',-,-,'00:00',-,-,'00:00',-
2,3,M,30-34,02:11:23,999999107FA30900001A58FC,Finished,Finish,00:16:07,03:14,18.61,...,18.50,01:50:11,03:03,19.69,02:04:56,02:58,20.34,02:11:23,02:57,20.42
3,4,M,25-29,02:13:52,999999107FA30900001A58EF,Finished,Finish,00:16:08,03:14,18.59,...,18.48,01:50:35,03:08,19.21,02:06:41,03:14,18.63,02:13:52,03:17,18.33
4,5,M,30-34,02:20:03,999999107FA30900001A58FD,Finished,Finish,00:16:06,03:14,18.63,...,18.58,01:51:43,03:22,17.91,02:10:39,03:48,15.85,02:20:03,04:18,14.01


In [61]:
df_chi_full.to_csv(chi_data_path+f"/{CHI_NAME}2016_full.csv", index=False)

In [62]:
del df_chi_full, df_chi_res, df_chi_splits, chi_data_path

### 2017

In [63]:
chicago.event_id = CHI_EVENT_IDS.get(YEAR_17)
chi_data_path = f"Marathons_Data/Raw/Chicago/Chicago{YEAR_17}"

#### Results Pages

In [64]:
chi_pages_urls, chi_res_settings = chicago.gen_res_scrap_info(YEAR_17, CHI_NUM_RESULTS, CHI_RES_FIELDS, chi_data_path, show_settings=True)

Men Pages: 28 || Women Pages: 27
Chicago 2017 total results pages: 55
Example URLs: 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2017&page=1&sex=M&num_results=1000&event=MAR_999999107FA30900000000A1 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2017&page=1&sex=W&num_results=1000&event=MAR_999999107FA30900000000A1
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2017/Chicago2017_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'age_cat', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_pages_urls, settings=chi_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: a21710f554675a8e


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (54326 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2017/Chicago2017_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 38299,
 'downloader/request_count': 110,
 'downloader/request_method_count/GET': 110,
 'downloader/response_bytes': 3572782,
 'downloader/response_count': 110,
 'downloader/response_status_count/200': 55,
 'downloader/response_status_count/301': 55,
 'elapsed_time_seconds': 14.926603,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 25, 21, 59, 44, 316708),
 'httpcompression/response_bytes': 71189649,
 'httpcompression/response_count': 55,
 'item_scraped_count': 54326,
 'log_count/INFO': 12,
 'memusage/max': 699498496,
 'memusage/startup': 699498496,
 'response_received_count': 55,
 'scheduler/dequeued': 110,
 'scheduler/dequeued/memory': 110,
 'scheduler/e

In [66]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2017_res.csv")

In [67]:
df_chi_res.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp
0,48552,M,40-44,05:27:11,999999107FA30900001C06E3
1,22781,M,55-59,04:51:43,999999107FA30900001C01DC
2,5653,M,35-39,03:48:14,999999107FA30900001CA70E
3,818,M,30-34,04:48:52,999999107FA30900001C1F4C
4,6984,M,25-29,05:48:59,999999107FA30900001C873B


In [68]:
df_chi_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54326 entries, 0 to 54325
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   54326 non-null  int64 
 1   gender   54326 non-null  object
 2   age_cat  54326 non-null  object
 3   finish   44535 non-null  object
 4   idp      54326 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


#### Splits Pages

In [69]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2017_res.csv")
chi_splits_urls, chi_splits_settings = chicago.gen_splits_scrap_info(YEAR_17, df_chi_res["idp"].to_list(), CHI_SPLITS_FIELDS, 
                                                                     chi_data_path, show_settings=True, use_event_id=True)

Chicago 2017 total splits pages: 54326
Example URLs: 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA30900000000A1&idp=999999107FA30900001C06E3&lang=EN_CAP&lang=EN_CAP 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA30900000000A1&idp=999999107FA30900001C3775&lang=EN_CAP&lang=EN_CAP
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2017/Chicago2017_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_splits_urls, settings=chi_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: e30a91ef030b479f
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 8286 pages (at 8286 pages/min), scraped 8268 items (at 8268 items/min)
INFO: Crawled 16619 pages (at 8333 pages/min), scraped 16603 items (at 8335 items/min)
INFO: Crawled 24782 pages (at 8163 pages/min), scraped 24767 items (at 8164 items/min)
INFO: Crawled 33070 pages (at 8288 pages/min), scraped 33054 items (at 8287 items/min)
INFO: Crawled 41459 pages (at 8389 pages/min), scraped 41446 items (at 8392 items/min)
INFO: Crawled 49878 pages (at 8419 pages/min), scraped 49869 items (at 8423 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (54326 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2017/Chicago2017_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 19937642,
 'downloader/request_count': 54326,
 'downloader/request_method_count/GET': 54326,
 'downloader/response_bytes': 428088497,
 'downloader/response_count': 54326,
 'downloader/response_status_count/200': 54

In [71]:
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2017_splits.csv")

In [72]:
df_chi_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54326 entries, 0 to 54325
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         54326 non-null  object
 1   race_state  44532 non-null  object
 2   last_split  44532 non-null  object
 3   k_5         54326 non-null  object
 4   k_10        54326 non-null  object
 5   k_15        54326 non-null  object
 6   k_20        54326 non-null  object
 7   k_half      54326 non-null  object
 8   k_25        54326 non-null  object
 9   k_30        54326 non-null  object
 10  k_35        54326 non-null  object
 11  k_40        54326 non-null  object
 12  k_finish    54326 non-null  object
dtypes: object(13)
memory usage: 5.4+ MB


#### Full Raw Dataset for Chicago 2017

In [73]:
df_chi_res    = pd.read_csv(chi_data_path+f"/{CHI_NAME}2017_res.csv")
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2017_splits.csv")
df_chi_full   = pd.merge(df_chi_res, df_chi_splits, on="idp")

In [74]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54326 entries, 0 to 54325
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      54326 non-null  int64 
 1   gender      54326 non-null  object
 2   age_cat     54326 non-null  object
 3   finish      44535 non-null  object
 4   idp         54326 non-null  object
 5   race_state  44532 non-null  object
 6   last_split  44532 non-null  object
 7   k_5         54326 non-null  object
 8   k_10        54326 non-null  object
 9   k_15        54326 non-null  object
 10  k_20        54326 non-null  object
 11  k_half      54326 non-null  object
 12  k_25        54326 non-null  object
 13  k_30        54326 non-null  object
 14  k_35        54326 non-null  object
 15  k_40        54326 non-null  object
 16  k_finish    54326 non-null  object
dtypes: int64(1), object(16)
memory usage: 7.0+ MB


In [75]:
df_chi_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_chi_full = expand_splits(df_chi_full)
# Drop the splits Lists.
df_chi_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [76]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54326 entries, 0 to 54325
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          54326 non-null  int64 
 1   gender          54326 non-null  object
 2   age_cat         54326 non-null  object
 3   finish          44535 non-null  object
 4   idp             54326 non-null  object
 5   race_state      44532 non-null  object
 6   last_split      44532 non-null  object
 7   k_5_time        54326 non-null  object
 8   k_5_pace        54326 non-null  object
 9   k_5_speed       54326 non-null  object
 10  k_10_time       54326 non-null  object
 11  k_10_pace       54326 non-null  object
 12  k_10_speed      54326 non-null  object
 13  k_15_time       54326 non-null  object
 14  k_15_pace       54326 non-null  object
 15  k_15_speed      54326 non-null  object
 16  k_20_time       54326 non-null  object
 17  k_20_pace       54326 non-null  object
 18  k_20_s

In [77]:
df_chi_full.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,35-39,02:09:48,999999107FA30900001CF72A,Finished,Finish,00:15:43,03:09,19.09,...,19.87,01:48:43,03:05,19.52,02:03:22,02:56,20.48,02:09:48,02:56,20.47
1,2,M,30-34,,999999107FA30900001CF72B,,,00:15:43,03:09,19.09,...,-,-,-,-,-,-,-,-,-,-
2,3,M,30-34,,999999107FA30900001CF72C,,,00:15:44,03:09,19.07,...,-,-,-,-,-,-,-,-,-,-
3,4,M,25-29,02:14:49,999999107FA30900001CF72D,Finished,Finish,00:15:46,03:10,19.03,...,19.89,01:48:57,03:08,19.25,02:06:26,03:30,17.16,02:14:49,03:50,15.71
4,5,M,25-29,02:11:01,999999107FA30900001CF72E,Finished,Finish,00:15:46,03:10,19.03,...,19.87,01:48:43,03:05,19.54,02:03:47,03:01,19.91,02:11:01,03:18,18.21


In [78]:
df_chi_full.to_csv(chi_data_path+f"/{CHI_NAME}2017_full.csv", index=False)

In [79]:
del df_chi_full, df_chi_res, df_chi_splits, chi_data_path

### 2018

In [81]:
chicago.event_id = CHI_EVENT_IDS.get(YEAR_18)
chi_data_path = f"Marathons_Data/Raw/Chicago/Chicago{YEAR_18}"

#### Results Pages

In [82]:
chi_pages_urls, chi_res_settings = chicago.gen_res_scrap_info(YEAR_18, CHI_NUM_RESULTS, CHI_RES_FIELDS, chi_data_path, show_settings=True)

Men Pages: 30 || Women Pages: 26
Chicago 2018 total results pages: 56
Example URLs: 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2018&page=1&sex=M&num_results=1000&event=MAR_999999107FA30900000000B5 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2018&page=1&sex=W&num_results=1000&event=MAR_999999107FA30900000000B5
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2018/Chicago2018_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'age_cat', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [83]:
run_spider(chicago_spiders.Chicago1422, urls=chi_pages_urls, settings=chi_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 5814d970e5a8a900


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (55621 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2018/Chicago2018_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 38996,
 'downloader/request_count': 112,
 'downloader/request_method_count/GET': 112,
 'downloader/response_bytes': 3680668,
 'downloader/response_count': 112,
 'downloader/response_status_count/200': 56,
 'downloader/response_status_count/301': 56,
 'elapsed_time_seconds': 17.537351,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 25, 22, 12, 40, 128603),
 'httpcompression/response_bytes': 72972908,
 'httpcompression/response_count': 56,
 'item_scraped_count': 55621,
 'log_count/INFO': 12,
 'memusage/max': 699498496,
 'memusage/startup': 699498496,
 'response_received_count': 56,
 'scheduler/dequeued': 112,
 'scheduler/dequeued/memory': 112,
 'scheduler/e

In [84]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2018_res.csv")

In [85]:
df_chi_res.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp
0,7215,M,45-49,,999999107FA30E000020E910
1,4084,M,35-39,02:49:08,999999107FA30E000020E21A
2,49410,M,55-59,06:30:59,999999107FA30E0000216EC9
3,17104,M,40-44,03:28:18,999999107FA30E00002102D2
4,3953,M,30-34,04:12:44,999999107FA30E000020E27A


In [86]:
df_chi_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55621 entries, 0 to 55620
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   55621 non-null  int64 
 1   gender   55621 non-null  object
 2   age_cat  55621 non-null  object
 3   finish   44626 non-null  object
 4   idp      55621 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


#### Splits Pages

In [87]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2018_res.csv")
chi_splits_urls, chi_splits_settings = chicago.gen_splits_scrap_info(YEAR_18, df_chi_res["idp"].to_list(), CHI_SPLITS_FIELDS, 
                                                                     chi_data_path, show_settings=True, use_event_id=True)

Chicago 2018 total splits pages: 55621
Example URLs: 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA30900000000B5&idp=999999107FA30E000020E910&lang=EN_CAP&lang=EN_CAP 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA30900000000B5&idp=999999107FA30E0000215A39&lang=EN_CAP&lang=EN_CAP
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2018/Chicago2018_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_splits_urls, settings=chi_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 1283fd71e84729d7
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 8340 pages (at 8340 pages/min), scraped 8325 items (at 8325 items/min)
INFO: Crawled 16755 pages (at 8415 pages/min), scraped 16737 items (at 8412 items/min)
INFO: Crawled 24838 pages (at 8083 pages/min), scraped 24822 items (at 8085 items/min)
INFO: Crawled 33047 pages (at 8209 pages/min), scraped 33030 items (at 8208 items/min)
INFO: Crawled 41409 pages (at 8362 pages/min), scraped 41393 items (at 8363 items/min)
INFO: Crawled 49715 pages (at 8306 pages/min), scraped 49691 items (at 8298 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (55621 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2018/Chicago2018_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 20412907,
 'downloader/request_count': 55621,
 'downloader/request_method_count/GET': 55621,
 'downloader/response_bytes': 437729637,
 'downloader/response_count': 55621,
 'downloader/response_status_count/200': 55

In [89]:
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2018_splits.csv")

In [90]:
df_chi_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55621 entries, 0 to 55620
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         55621 non-null  object
 1   race_state  44626 non-null  object
 2   last_split  44626 non-null  object
 3   k_5         55621 non-null  object
 4   k_10        55621 non-null  object
 5   k_15        55621 non-null  object
 6   k_20        55621 non-null  object
 7   k_half      55621 non-null  object
 8   k_25        55621 non-null  object
 9   k_30        55621 non-null  object
 10  k_35        55621 non-null  object
 11  k_40        55621 non-null  object
 12  k_finish    55621 non-null  object
dtypes: object(13)
memory usage: 5.5+ MB


#### Full Raw Dataset for Chicago 2018

In [91]:
df_chi_res    = pd.read_csv(chi_data_path+f"/{CHI_NAME}2018_res.csv")
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2018_splits.csv")
df_chi_full   = pd.merge(df_chi_res, df_chi_splits, on="idp")

In [92]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55621 entries, 0 to 55620
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      55621 non-null  int64 
 1   gender      55621 non-null  object
 2   age_cat     55621 non-null  object
 3   finish      44626 non-null  object
 4   idp         55621 non-null  object
 5   race_state  44626 non-null  object
 6   last_split  44626 non-null  object
 7   k_5         55621 non-null  object
 8   k_10        55621 non-null  object
 9   k_15        55621 non-null  object
 10  k_20        55621 non-null  object
 11  k_half      55621 non-null  object
 12  k_25        55621 non-null  object
 13  k_30        55621 non-null  object
 14  k_35        55621 non-null  object
 15  k_40        55621 non-null  object
 16  k_finish    55621 non-null  object
dtypes: int64(1), object(16)
memory usage: 7.2+ MB


In [93]:
df_chi_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_chi_full = expand_splits(df_chi_full)
# Drop the splits Lists.
df_chi_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [94]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55621 entries, 0 to 55620
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          55621 non-null  int64 
 1   gender          55621 non-null  object
 2   age_cat         55621 non-null  object
 3   finish          44626 non-null  object
 4   idp             55621 non-null  object
 5   race_state      44626 non-null  object
 6   last_split      44626 non-null  object
 7   k_5_time        55621 non-null  object
 8   k_5_pace        55621 non-null  object
 9   k_5_speed       55621 non-null  object
 10  k_10_time       55621 non-null  object
 11  k_10_pace       55621 non-null  object
 12  k_10_speed      55621 non-null  object
 13  k_15_time       55621 non-null  object
 14  k_15_pace       55621 non-null  object
 15  k_15_speed      55621 non-null  object
 16  k_20_time       55621 non-null  object
 17  k_20_pace       55621 non-null  object
 18  k_20_s

In [95]:
df_chi_full.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,30-34,02:06:21,999999107FA310000021D076,Finished,Finish,00:14:52,02:59,20.18,...,20.76,01:44:19,02:55,20.59,01:59:22,03:01,19.93,02:06:21,03:11,18.86
1,2,M,35-39,02:07:52,999999107FA310000021D077,Finished,Finish,00:14:52,02:59,20.18,...,20.74,01:44:44,03:00,20.04,02:00:39,03:11,18.85,02:07:52,03:18,18.25
2,3,M,30-34,02:16:12,999999107FA310000021D078,Finished,Finish,00:14:53,02:59,20.16,...,18.59,01:48:57,03:25,17.65,02:07:09,03:39,16.48,02:16:12,04:08,14.55
3,4,M,25-29,02:05:24,999999107FA310000021D079,Finished,Finish,00:14:53,02:59,20.16,...,20.79,01:44:16,02:55,20.64,01:58:46,02:55,20.69,02:05:24,03:01,19.85
4,5,M,20-24,02:08:41,999999107FA310000021D07A,Finished,Finish,00:14:53,02:59,20.16,...,20.81,01:44:21,02:56,20.52,02:00:33,03:15,18.52,02:08:41,03:43,16.19


In [96]:
df_chi_full.to_csv(chi_data_path+f"/{CHI_NAME}2018_full.csv", index=False)

In [97]:
del df_chi_full, df_chi_res, df_chi_splits, chi_data_path

### 2019

In [99]:
chicago.event_id = CHI_EVENT_IDS.get(YEAR_19)
chi_data_path = f"Marathons_Data/Raw/Chicago/Chicago{YEAR_19}"

#### Results Pages

In [100]:
chi_pages_urls, chi_res_settings = chicago.gen_res_scrap_info(YEAR_19, CHI_NUM_RESULTS, CHI_RES_FIELDS, chi_data_path, show_settings=True)

Men Pages: 30 || Women Pages: 26
Chicago 2019 total results pages: 56
Example URLs: 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2019&page=1&sex=M&num_results=1000&event=MAR_999999107FA31100000000C9 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2019&page=1&sex=W&num_results=1000&event=MAR_999999107FA31100000000C9
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2019/Chicago2019_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'age_cat', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_pages_urls, settings=chi_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 7a7565b9a442027c


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (55395 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2019/Chicago2019_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 38996,
 'downloader/request_count': 112,
 'downloader/request_method_count/GET': 112,
 'downloader/response_bytes': 3688530,
 'downloader/response_count': 112,
 'downloader/response_status_count/200': 56,
 'downloader/response_status_count/301': 56,
 'elapsed_time_seconds': 16.351126,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 25, 22, 28, 34, 249896),
 'httpcompression/response_bytes': 72572271,
 'httpcompression/response_count': 56,
 'item_scraped_count': 55395,
 'log_count/INFO': 12,
 'memusage/max': 699498496,
 'memusage/startup': 699498496,
 'response_received_count': 56,
 'scheduler/dequeued': 112,
 'scheduler/dequeued/memory': 112,
 'scheduler/e

In [102]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2019_res.csv")

In [103]:
df_chi_res.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp
0,34974,M,40-44,,999999107FA31200002325C4
1,26400,M,45-49,04:02:34,999999107FA3120000230065
2,47670,M,45-49,04:46:51,999999107FA3120000234009
3,1281,M,35-39,04:00:53,999999107FA312000022AEA4
4,37616,M,30-34,04:11:29,999999107FA3120000231779


In [104]:
df_chi_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55395 entries, 0 to 55394
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   55395 non-null  int64 
 1   gender   55395 non-null  object
 2   age_cat  55394 non-null  object
 3   finish   45868 non-null  object
 4   idp      55395 non-null  object
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


#### Splits Pages

In [105]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2019_res.csv")
chi_splits_urls, chi_splits_settings = chicago.gen_splits_scrap_info(YEAR_19, df_chi_res["idp"].to_list(), CHI_SPLITS_FIELDS, 
                                                                     chi_data_path, show_settings=True, use_event_id=True)

Chicago 2019 total splits pages: 55395
Example URLs: 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA31100000000C9&idp=999999107FA31200002325C4&lang=EN_CAP&lang=EN_CAP 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_999999107FA31100000000C9&idp=999999107FA3120000238316&lang=EN_CAP&lang=EN_CAP
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2019/Chicago2019_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_splits_urls, settings=chi_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: e431e744b3eeb756


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 7911 pages (at 7911 pages/min), scraped 7902 items (at 7902 items/min)
INFO: Crawled 16102 pages (at 8191 pages/min), scraped 16086 items (at 8184 items/min)
INFO: Crawled 24356 pages (at 8254 pages/min), scraped 24339 items (at 8253 items/min)
INFO: Crawled 32343 pages (at 7987 pages/min), scraped 32331 items (at 7992 items/min)
INFO: Crawled 40716 pages (at 8373 pages/min), scraped 40703 items (at 8372 items/min)
INFO: Crawled 48819 pages (at 8103 pages/min), scraped 48803 items (at 8100 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (55395 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2019/Chicago2019_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 20329965,
 'downloader/request_count': 55395,
 'downloader/request_method_count/GET': 55395,
 'downloader/response_bytes': 436276432,
 'downloader/response_count': 55395,
 'downloader/response_status_count/200': 55

In [107]:
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2019_splits.csv")

In [108]:
df_chi_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55395 entries, 0 to 55394
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         55395 non-null  object
 1   race_state  45868 non-null  object
 2   last_split  45868 non-null  object
 3   k_5         55395 non-null  object
 4   k_10        55395 non-null  object
 5   k_15        55395 non-null  object
 6   k_20        55395 non-null  object
 7   k_half      55395 non-null  object
 8   k_25        55395 non-null  object
 9   k_30        55395 non-null  object
 10  k_35        55395 non-null  object
 11  k_40        55395 non-null  object
 12  k_finish    55395 non-null  object
dtypes: object(13)
memory usage: 5.5+ MB


#### Full Raw Dataset for Chicago 2019

In [109]:
df_chi_res    = pd.read_csv(chi_data_path+f"/{CHI_NAME}2019_res.csv")
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2019_splits.csv")
df_chi_full   = pd.merge(df_chi_res, df_chi_splits, on="idp")

In [110]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55395 entries, 0 to 55394
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      55395 non-null  int64 
 1   gender      55395 non-null  object
 2   age_cat     55394 non-null  object
 3   finish      45868 non-null  object
 4   idp         55395 non-null  object
 5   race_state  45868 non-null  object
 6   last_split  45868 non-null  object
 7   k_5         55395 non-null  object
 8   k_10        55395 non-null  object
 9   k_15        55395 non-null  object
 10  k_20        55395 non-null  object
 11  k_half      55395 non-null  object
 12  k_25        55395 non-null  object
 13  k_30        55395 non-null  object
 14  k_35        55395 non-null  object
 15  k_40        55395 non-null  object
 16  k_finish    55395 non-null  object
dtypes: int64(1), object(16)
memory usage: 7.2+ MB


In [111]:
df_chi_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_chi_full = expand_splits(df_chi_full)
# Drop the splits Lists.
df_chi_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [112]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55395 entries, 0 to 55394
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          55395 non-null  int64 
 1   gender          55395 non-null  object
 2   age_cat         55394 non-null  object
 3   finish          45868 non-null  object
 4   idp             55395 non-null  object
 5   race_state      45868 non-null  object
 6   last_split      45868 non-null  object
 7   k_5_time        55395 non-null  object
 8   k_5_pace        55395 non-null  object
 9   k_5_speed       55395 non-null  object
 10  k_10_time       55395 non-null  object
 11  k_10_pace       55395 non-null  object
 12  k_10_speed      55395 non-null  object
 13  k_15_time       55395 non-null  object
 14  k_15_pace       55395 non-null  object
 15  k_15_speed      55395 non-null  object
 16  k_20_time       55395 non-null  object
 17  k_20_pace       55395 non-null  object
 18  k_20_s

In [113]:
df_chi_full.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,35-39,02:09:58,999999107FA317000023A481,Finished,Finish,00:14:47,02:58,20.29,...,19.23,01:46:23,03:11,18.89,02:02:28,03:14,18.65,02:09:58,03:25,17.56
1,4,M,30-34,02:05:45,999999107FA317000023A4AD,Finished,Finish,00:14:45,02:57,20.34,...,19.96,01:43:54,02:59,20.11,01:59:10,03:04,19.65,02:05:45,03:01,20.01
2,5,M,30-34,02:05:48,999999107FA317000023A482,Finished,Finish,00:14:46,02:58,20.32,...,19.91,01:43:53,02:59,20.13,01:59:08,03:03,19.67,02:05:48,03:03,19.75
3,6,M,30-34,02:09:11,999999107FA317000023A483,Finished,Finish,00:14:45,02:57,20.34,...,19.93,01:44:27,03:06,19.38,02:01:05,03:20,18.04,02:09:11,03:42,16.26
4,8,M,30-34,,999999107FA317000023A484,,,00:14:49,02:58,20.25,...,19.33,01:46:45,03:15,18.56,-,'00:00',-,-,'00:00',-


In [114]:
df_chi_full.to_csv(chi_data_path+f"/{CHI_NAME}2019_full.csv", index=False)

In [115]:
del df_chi_full, df_chi_res, df_chi_splits, chi_data_path

### 2021

In [116]:
chicago.event_id = CHI_EVENT_IDS.get(YEAR_21)
chi_data_path = f"Marathons_Data/Raw/Chicago/Chicago{YEAR_21}"

#### Results Pages

In [117]:
chi_pages_urls, chi_res_settings = chicago.gen_res_scrap_info(YEAR_21, CHI_NUM_RESULTS, CHI_RES_FIELDS, chi_data_path, show_settings=True)

Men Pages: 19 || Women Pages: 16
Chicago 2021 total results pages: 35
Example URLs: 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2021&page=1&sex=M&num_results=1000&event=MAR_9TGG9638F1 
 https://chicago-history.r.mikatiming.com/?pid=search&pidp=start&event_main_group=2021&page=1&sex=W&num_results=1000&event=MAR_9TGG9638F1
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2021/Chicago2021_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'age_cat', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [118]:
run_spider(chicago_spiders.Chicago1422, urls=chi_pages_urls, settings=chi_res_settings)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: ba2ee86c5825c90c


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (33502 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2021/Chicago2021_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 23379,
 'downloader/request_count': 70,
 'downloader/request_method_count/GET': 70,
 'downloader/response_bytes': 2194520,
 'downloader/response_count': 70,
 'downloader/response_status_count/200': 35,
 'downloader/response_status_count/301': 35,
 'elapsed_time_seconds': 9.683624,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 25, 22, 41, 27, 297875),
 'httpcompression/response_bytes': 42727274,
 'httpcompression/response_count': 35,
 'item_scraped_count': 33502,
 'log_count/INFO': 12,
 'memusage/max': 699498496,
 'memusage/startup': 699498496,
 'response_received_count': 35,
 'scheduler/dequeued': 70,
 'scheduler/dequeued/memory': 70,
 'scheduler/enqueue

In [119]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2021_res.csv")

In [120]:
df_chi_res.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp
0,12199,M,20-24,03:11:31,LSMG9638245383
1,11589,M,45-49,03:50:23,LSMG96382456D8
2,43484,M,45-49,05:55:12,LSMG963824BCB7
3,30644,M,30-34,04:52:24,LSMG963824A799
4,39925,M,40-44,06:08:10,LSMG963824C7FB


In [121]:
df_chi_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33502 entries, 0 to 33501
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   33502 non-null  int64 
 1   gender   33502 non-null  object
 2   age_cat  33502 non-null  object
 3   finish   26141 non-null  object
 4   idp      33502 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.3+ MB


#### Splits Pages

In [122]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2021_res.csv")
chi_splits_urls, chi_splits_settings = chicago.gen_splits_scrap_info(YEAR_21, df_chi_res["idp"].to_list(), CHI_SPLITS_FIELDS, 
                                                                     chi_data_path, show_settings=True, use_event_id=True)

Chicago 2021 total splits pages: 33502
Example URLs: 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_9TGG9638F1&idp=LSMG9638245383&lang=EN_CAP&lang=EN_CAP 
 https://chicago-history.r.mikatiming.com/2021/?content=detail&fpid=search&pid=search&event=MAR_9TGG9638F1&idp=LSMG963824D1D1&lang=EN_CAP&lang=EN_CAP
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2021/Chicago2021_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_splits_urls, settings=chi_splits_settings, splits=True)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: 14e24e71634806e3


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 8521 pages (at 8521 pages/min), scraped 8503 items (at 8503 items/min)
INFO: Crawled 17006 pages (at 8485 pages/min), scraped 16990 items (at 8487 items/min)
INFO: Crawled 25520 pages (at 8514 pages/min), scraped 25505 items (at 8515 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (33502 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2021/Chicago2021_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 11491186,
 'downloader/request_count': 33502,
 'downloader/request_method_count/GET': 33502,
 'downloader/response_bytes': 262942544,
 'downloader/response_count': 33502,
 'downloader/response_status_count/200': 33502,
 'elapsed_time_seconds': 236.551102,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 25, 22, 46, 16, 767505),
 'httpcompression/response_bytes': 947784023,
 'httpcompression/response_

In [124]:
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2021_splits.csv")

In [125]:
df_chi_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33502 entries, 0 to 33501
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         33502 non-null  object
 1   race_state  26141 non-null  object
 2   last_split  26141 non-null  object
 3   k_5         33502 non-null  object
 4   k_10        33502 non-null  object
 5   k_15        33502 non-null  object
 6   k_20        33502 non-null  object
 7   k_half      33502 non-null  object
 8   k_25        33502 non-null  object
 9   k_30        33502 non-null  object
 10  k_35        33502 non-null  object
 11  k_40        33502 non-null  object
 12  k_finish    33502 non-null  object
dtypes: object(13)
memory usage: 3.3+ MB


#### Full Raw Dataset for Chicago 2021

In [126]:
df_chi_res    = pd.read_csv(chi_data_path+f"/{CHI_NAME}2021_res.csv")
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2021_splits.csv")
df_chi_full   = pd.merge(df_chi_res, df_chi_splits, on="idp")

In [127]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33502 entries, 0 to 33501
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      33502 non-null  int64 
 1   gender      33502 non-null  object
 2   age_cat     33502 non-null  object
 3   finish      26141 non-null  object
 4   idp         33502 non-null  object
 5   race_state  26141 non-null  object
 6   last_split  26141 non-null  object
 7   k_5         33502 non-null  object
 8   k_10        33502 non-null  object
 9   k_15        33502 non-null  object
 10  k_20        33502 non-null  object
 11  k_half      33502 non-null  object
 12  k_25        33502 non-null  object
 13  k_30        33502 non-null  object
 14  k_35        33502 non-null  object
 15  k_40        33502 non-null  object
 16  k_finish    33502 non-null  object
dtypes: int64(1), object(16)
memory usage: 4.3+ MB


In [128]:
df_chi_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_chi_full = expand_splits(df_chi_full)
# Drop the splits Lists.
df_chi_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [129]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33502 entries, 0 to 33501
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          33502 non-null  int64 
 1   gender          33502 non-null  object
 2   age_cat         33502 non-null  object
 3   finish          26141 non-null  object
 4   idp             33502 non-null  object
 5   race_state      26141 non-null  object
 6   last_split      26141 non-null  object
 7   k_5_time        33502 non-null  object
 8   k_5_pace        33502 non-null  object
 9   k_5_speed       33502 non-null  object
 10  k_10_time       33502 non-null  object
 11  k_10_pace       33502 non-null  object
 12  k_10_speed      33502 non-null  object
 13  k_15_time       33502 non-null  object
 14  k_15_pace       33502 non-null  object
 15  k_15_speed      33502 non-null  object
 16  k_20_time       33502 non-null  object
 17  k_20_pace       33502 non-null  object
 18  k_20_s

In [130]:
df_chi_full.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,2,M,25-29,02:14:24,LSMG963824DD5B,Finished,Finish,00:14:43,02:57,20.39,...,19.46,01:46:12,03:14,18.65,02:06:02,03:59,15.13,02:14:24,03:49,15.74
1,3,M,20-24,02:06:12,LSMG963824DD5C,Finished,Finish,00:14:43,02:57,20.39,...,19.48,01:45:01,02:59,20.11,01:59:44,02:57,20.39,02:06:12,02:58,20.37
2,4,M,30-34,,LSMG963824DD5D,,,00:14:43,02:57,20.39,...,-,-,-,-,-,-,-,-,-,-
3,5,M,25-29,02:08:50,LSMG963824DD5E,Finished,Finish,00:14:44,02:57,20.36,...,19.50,01:45:30,03:05,19.50,02:01:48,03:16,18.40,02:08:50,03:13,18.73
4,6,M,20-24,02:09:39,LSMG963824DD5F,Finished,Finish,00:14:36,02:56,20.55,...,19.46,01:45:59,03:11,18.93,02:02:16,03:16,18.42,02:09:39,03:23,17.84


In [131]:
df_chi_full.to_csv(chi_data_path+f"/{CHI_NAME}2021_full.csv", index=False)

In [132]:
del df_chi_full, df_chi_res, df_chi_splits, chi_data_path

### 2022

In [7]:
chicago = ChicagoMarathon(url_template=CHICAGO_MARATHON_URL_AF_22, split_url_template=CHICAGO_MARATHON_SPLIT_URL_AF_22)
chi_data_path = f"Marathons_Data/Raw/Chicago/Chicago{YEAR_22}"

#### Results Pages

In [11]:
chi_pages_urls, chi_res_settings = chicago.gen_res_scrap_info(YEAR_22, CHI_NUM_RESULTS, CHI_RES_FIELDS, chi_data_path, show_settings=True)

Men Pages: 28 || Women Pages: 25
Chicago 2022 total results pages: 53
Example URLs: 
 https://results.chicagomarathon.com/2022/?pid=search&page=1&sex=M&num_results=1000&event=MAR 
 https://results.chicagomarathon.com/2022/?pid=search&page=1&sex=W&num_results=1000&event=MAR
Settings: 
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2022/Chicago2022_res.csv': {'format': 'csv', 'fields': ['run_no', 'gender', 'age_cat', 'finish', 'idp'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_pages_urls, settings=chi_res_settings, year=YEAR_22)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}
INFO: Telnet Password: e0fcda1505b8f39c


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Closing spider (finished)
INFO: Stored csv feed (51087 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2022/Chicago2022_res.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 15246,
 'downloader/request_count': 53,
 'downloader/request_method_count/GET': 53,
 'downloader/response_bytes': 4265411,
 'downloader/response_count': 53,
 'downloader/response_status_count/200': 53,
 'elapsed_time_seconds': 15.6674,
 'feedexport/success_count/FileFeedStorage': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2023, 6, 25, 23, 12, 43, 684618),
 'httpcompression/response_bytes': 89228263,
 'httpcompression/response_count': 53,
 'item_scraped_count': 51087,
 'log_count/INFO': 12,
 'memusage/max': 244629504,
 'memusage/startup': 244629504,
 'response_received_count': 53,
 'scheduler/dequeued': 53,
 'scheduler/dequeued/memory': 53,
 'scheduler/enqueued': 53,
 'scheduler/enqueued/memory': 53,
 'st

In [14]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2022_res.csv")

In [15]:
df_chi_res.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp
0,42629,M,35-39,05:42:42,9TGG9638277361
1,14776,M,40-44,,9TGG9638270D63
2,52661,M,55-59,,9TGG963827957B
3,2073,M,35-39,,9TGG963826DD8D
4,18023,M,30-34,03:44:23,9TGG963827193F


In [16]:
df_chi_res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51087 entries, 0 to 51086
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   run_no   51087 non-null  int64 
 1   gender   51087 non-null  object
 2   age_cat  51048 non-null  object
 3   finish   39358 non-null  object
 4   idp      51087 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.9+ MB


#### Splits Pages

In [8]:
df_chi_res = pd.read_csv(chi_data_path+f"/{CHI_NAME}2022_res.csv")
chi_splits_urls, chi_splits_settings = chicago.gen_splits_scrap_info(YEAR_22, df_chi_res["idp"].to_list(), CHI_SPLITS_FIELDS, 
                                                                     chi_data_path, show_settings=True)

Chicago 2022 total splits pages: 51087
Example URLs: 
 https://results.chicagomarathon.com/2022/?content=detail&fpid=search&pid=search&idp=9TGG9638277361&lang=EN_CAP 
 https://results.chicagomarathon.com/2022/?content=detail&fpid=search&pid=search&idp=9TGG963827510C&lang=EN_CAP
{'FEEDS': {'/Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2022/Chicago2022_splits.csv': {'format': 'csv', 'fields': ['idp', 'race_state', 'last_split', 'k_5', 'k_10', 'k_15', 'k_20', 'k_half', 'k_25', 'k_30', 'k_35', 'k_40', 'k_finish'], 'overwrite': 'True'}}, 'LOG_LEVEL': 'INFO'}


In [None]:
run_spider(chicago_spiders.Chicago1422, urls=chi_splits_urls, settings=chi_splits_settings, splits=True, year=YEAR_22)

INFO: Overridden settings:
{'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
INFO: Telnet Password: 1dc4dab4685b5086
INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirec

INFO: Crawled 8134 pages (at 8134 pages/min), scraped 8118 items (at 8118 items/min)
INFO: Crawled 16372 pages (at 8238 pages/min), scraped 16355 items (at 8237 items/min)
INFO: Crawled 24840 pages (at 8468 pages/min), scraped 24823 items (at 8468 items/min)
INFO: Crawled 33267 pages (at 8427 pages/min), scraped 33251 items (at 8428 items/min)
INFO: Crawled 41830 pages (at 8563 pages/min), scraped 41812 items (at 8561 items/min)
INFO: Crawled 50318 pages (at 8488 pages/min), scraped 50310 items (at 8498 items/min)
INFO: Closing spider (finished)
INFO: Stored csv feed (51087 items) in: /Users/sika/Documents/ARU MASTER /MASTER_2022_2023/Master_Project/Artefact/Marathons_Data/Raw/Chicago/Chicago2022/Chicago2022_splits.csv
INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 15581535,
 'downloader/request_count': 51087,
 'downloader/request_method_count/GET': 51087,
 'downloader/response_bytes': 437912847,
 'downloader/response_count': 51087,
 'downloader/response_status_count/200': 51

In [10]:
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2022_splits.csv")

In [11]:
df_chi_splits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51087 entries, 0 to 51086
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   idp         51087 non-null  object
 1   race_state  51087 non-null  object
 2   last_split  39963 non-null  object
 3   k_5         51087 non-null  object
 4   k_10        51087 non-null  object
 5   k_15        51087 non-null  object
 6   k_20        51087 non-null  object
 7   k_half      51087 non-null  object
 8   k_25        51087 non-null  object
 9   k_30        51087 non-null  object
 10  k_35        51087 non-null  object
 11  k_40        51087 non-null  object
 12  k_finish    51087 non-null  object
dtypes: object(13)
memory usage: 5.1+ MB


#### Full Raw Dataset for Chicago 2022

In [14]:
df_chi_res    = pd.read_csv(chi_data_path+f"/{CHI_NAME}2022_res.csv")
df_chi_splits = pd.read_csv(chi_data_path+f"/{CHI_NAME}2022_splits.csv")
df_chi_full   = pd.merge(df_chi_res, df_chi_splits, on="idp")

In [15]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51087 entries, 0 to 51086
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   run_no      51087 non-null  int64 
 1   gender      51087 non-null  object
 2   age_cat     51048 non-null  object
 3   finish      39358 non-null  object
 4   idp         51087 non-null  object
 5   race_state  51087 non-null  object
 6   last_split  39963 non-null  object
 7   k_5         51087 non-null  object
 8   k_10        51087 non-null  object
 9   k_15        51087 non-null  object
 10  k_20        51087 non-null  object
 11  k_half      51087 non-null  object
 12  k_25        51087 non-null  object
 13  k_30        51087 non-null  object
 14  k_35        51087 non-null  object
 15  k_40        51087 non-null  object
 16  k_finish    51087 non-null  object
dtypes: int64(1), object(16)
memory usage: 6.6+ MB


In [16]:
df_chi_full.sort_values(by="run_no", inplace=True, ignore_index=True)
# Expanding the splits since they contain, time(hh:mm:ss), pace(min/km) , and speed(km/h)
df_chi_full = expand_splits(df_chi_full)
# Drop the splits Lists.
df_chi_full.drop(SPLITS_KEYS, axis=1, inplace=True)

In [17]:
df_chi_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51087 entries, 0 to 51086
Data columns (total 37 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   run_no          51087 non-null  int64 
 1   gender          51087 non-null  object
 2   age_cat         51048 non-null  object
 3   finish          39358 non-null  object
 4   idp             51087 non-null  object
 5   race_state      51087 non-null  object
 6   last_split      39963 non-null  object
 7   k_5_time        51087 non-null  object
 8   k_5_pace        51087 non-null  object
 9   k_5_speed       51087 non-null  object
 10  k_10_time       51087 non-null  object
 11  k_10_pace       51087 non-null  object
 12  k_10_speed      51087 non-null  object
 13  k_15_time       51087 non-null  object
 14  k_15_pace       51087 non-null  object
 15  k_15_speed      51087 non-null  object
 16  k_20_time       51087 non-null  object
 17  k_20_pace       51087 non-null  object
 18  k_20_s

In [18]:
df_chi_full.head()

Unnamed: 0,run_no,gender,age_cat,finish,idp,race_state,last_split,k_5_time,k_5_pace,k_5_speed,...,k_30_speed,k_35_time,k_35_pace,k_35_speed,k_40_time,k_40_pace,k_40_speed,k_finish_time,k_finish_pace,k_finish_speed
0,1,M,25-29,02:04:49,9TGG963827B3CD,Finished,Finish,00:14:42,02:57,20.41,...,20.34,01:43:05,02:55,20.62,01:57:58,02:59,20.16,02:04:49,03:08,19.23
1,2,M,25-29,,9TGG963827B3CE,Started,40K,00:14:44,02:57,20.36,...,21.56,-,-,-,02:02:13,03:13,18.71,-,-,-
2,3,M,30-34,02:07:15,9TGG963827B3CF,Finished,Finish,00:14:42,02:57,20.41,...,20.36,01:43:04,02:55,20.62,01:59:08,03:13,18.67,02:07:15,03:42,16.23
3,4,M,30-34,,9TGG963827B3D0,Not Started,,-,'00:00',-,...,-,-,'00:00',-,-,'00:00',-,-,'00:00',-
4,5,M,30-34,,9TGG963827B3D1,Started,35K,00:14:43,02:57,20.39,...,20.34,01:43:50,03:04,19.61,-,-,-,-,-,-


In [19]:
df_chi_full.to_csv(chi_data_path+f"/{CHI_NAME}2022_full.csv", index=False)

In [20]:
del df_chi_full, df_chi_res, df_chi_splits, chi_data_path