In [1]:
import procyclingstats as pcs
import pandas as pd

In [2]:
from typing import List, Tuple, Union
from procyclingstats.errors import UnexpectedParsingError
from procyclingstats.table_parser import TableParser


def parse(self, fields: Union[List[str], Tuple[str, ...]]) -> None:
    """
    Parse the requested fields and append one dict per table row to
    ``self.table``.

    Every field name is resolved to a same-named method on ``self`` (the
    field ``"class"`` is resolved to ``class_``). That method is called with
    no arguments and must return exactly ``self.table_length`` values, which
    are stored under the field name in the corresponding row dicts.

    :param fields: Names of the fields to parse.
    :raises UnexpectedParsingError: When a field's method returns a number of
        values different from ``self.table_length``.
    """
    rows = [{} for _ in range(self.table_length)]

    for field in fields:
        # the field called "class" is served by the method named `class_`
        method_name = "class_" if field == "class" else field
        values = getattr(self, method_name)()

        # a length mismatch means the values can't be lined up with the rows
        if len(values) != self.table_length:
            raise UnexpectedParsingError(
                f"Field '{field}' wasn't parsed correctly"
            )

        for row, value in zip(rows, values):
            row[field] = value

    # every row is kept; nothing is filtered out here
    self.table.extend(rows)

    # The call below is commented out, so "time" values are stored exactly as
    # the `time` field method returned them:
    # if "time" in fields and self.table:
    # self._make_times_absolute()


# Monkeypatch: replace the library's TableParser.parse with the `parse`
# function defined in this cell.
TableParser.parse = parse

In [None]:
# Build a Rider object for a single rider page; the bare name on the last
# line makes the notebook display it.
rider = pcs.Rider("rider/bruno-surra")
rider

In [None]:
# Load the races table and print its column summary (dtypes, non-null counts).
df_races = pd.read_csv("dataset/races.csv")
df_races.info()

In [5]:
# Unique `_url` values of the rows whose `delta` is negative
bad_urls = df_races.loc[df_races["delta"].lt(0), "_url"].unique()

In [6]:
def time_to_seconds(time):
    """
    Convert a signed ``H:M:S`` string into a signed number of seconds.

    The result is negative when any of the three components carries a minus
    sign. The sign is read from the text rather than from the parsed
    integers, so a negative time whose signed component is zero (for example
    ``"-0:00:05"``) is still returned as negative.

    :param time: String with exactly three ``:``-separated integer parts.
    :return: Total number of seconds as an int, negated for negative times.
    :raises ValueError: If ``time`` does not consist of three integer parts.
    """
    parts = time.split(":")
    # int("-0") == 0 loses the minus sign, so look for it in the raw text
    negative = any(part.strip().startswith("-") for part in parts)
    h, m, s = (abs(int(part)) for part in parts)
    total = h * 3600 + m * 60 + s
    return -total if negative else total

In [None]:
# For every stage listed in `bad_urls`, re-scrape its results and overwrite
# the `delta` of each matching (stage, cyclist) row of `df_races`.
for RACE_URL in bad_urls:
    stage = pcs.Stage(f"race/{RACE_URL}")
    print(stage)
    ranking = stage.results("rider_url", "time", "rank")

    # convert ranking to pandas table, ranking is a list of objects
    df_ranking = pd.DataFrame(ranking)

    df_ranking["time"] = df_ranking["time"].apply(time_to_seconds)

    # the first row's time is treated as the time of the winner
    first_time = df_ranking["time"].loc[0]

    # positive times are kept as they are; every other time gets the first
    # time added to it
    # NOTE(review): a time of exactly 0 also takes the `else` branch and
    # becomes `first_time` — confirm this is intended.
    df_ranking["time"] = df_ranking["time"].apply(
        lambda x: x if x > 0 else first_time + x
    )

    # the first row gets a delta of 0
    df_ranking.loc[0, "time"] = 0

    # keep only the last path segment of the rider URL, which is what gets
    # compared against `df_races.cyclist` below
    df_ranking.rider_url = df_ranking.rider_url.apply(
        lambda x: x.split("/")[-1]
    )

    # loop-invariant: select this stage's rows once instead of once per rider
    race_mask = df_races._url == RACE_URL

    for rider, time in zip(df_ranking["rider_url"], df_ranking["time"]):
        df_races.loc[race_mask & (df_races.cyclist == rider), "delta"] = time

    # show the deltas of this stage after the update
    print(df_races.loc[race_mask, "delta"])

In [None]:
# Check that every non-missing delta is a whole number
print(all(value.is_integer() for value in df_races["delta"].dropna()))

## Teams imputation (scraping)

Fill in the missing `cyclist_team` values by scraping each stage's results and looking up the team of every rider that has none.

In [None]:
# Create the log file (mode "w" discards any previous content) and write a
# header line to it.
with open("output.txt", "w") as f:
    f.write("This is the output file for storing prints.\n")

In [31]:
# Rows of `df_races` that have no team. `.copy()` makes this an independent
# frame, so the `.loc` assignments done on it later don't raise
# SettingWithCopyWarning.
df_bad_teams = df_races[df_races.cyclist_team.isna()].copy()
# Stages (`_url`) that contain at least one row without a team.
# Note: this rebinds `bad_urls`, which earlier held the negative-delta stages.
bad_urls = df_bad_teams._url.unique()

In [None]:
# Progress note (translated from Italian): got to 3300, CONTINUE FROM THERE.
#
# For every stage with missing teams, scrape its results and copy each
# rider's team into `df_bad_teams`. Riders that are not in the scraped
# ranking are reported both on screen and in output.txt.
# The log is opened in append mode so the header written when the file was
# created, and anything logged by an earlier run, is not truncated.
with open("output.txt", "a") as f:
    for idx, url in enumerate(bad_urls):
        if idx % 100 == 0:
            print(f"Processing {idx}/{len(bad_urls)}")
        stage = pcs.Stage(f"race/{url}")

        ranking = stage.results("rider_url", "rank", "team_name")
        df_ranking = pd.DataFrame(ranking)
        # keep only the last path segment of the rider URL, which is what
        # gets compared against the `cyclist` column
        df_ranking.rider_url = df_ranking.rider_url.apply(
            lambda x: x.split("/")[-1]
        )

        # rider -> team lookup built once per stage; if a rider appears more
        # than once in the ranking, the first occurrence wins
        team_by_rider = {}
        for rider_url, team_name in zip(
            df_ranking.rider_url, df_ranking.team_name
        ):
            team_by_rider.setdefault(rider_url, team_name)

        # loop-invariant: select this stage's rows once instead of per rider
        url_mask = df_bad_teams._url == url
        df_url = df_bad_teams[url_mask]

        for rider in df_url.cyclist:
            if rider not in team_by_rider:
                print(stage)
                print(f"Rider {rider} not found in ranking")
                f.write(f"{stage.__str__()}\n")
                f.write(f"Rider {rider} not found in ranking\n")
                continue
            df_bad_teams.loc[
                url_mask & (df_bad_teams.cyclist == rider),
                "cyclist_team",
            ] = team_by_rider[rider]

        # Disabled check that every team of this url has been filled:
        # assert all(df_bad_teams[df_bad_teams._url == url].cyclist_team.notna())



In [33]:
# Save `df_bad_teams` (the rows that had no team, with whatever teams were
# filled in) to CSV, without writing the index column.
df_bad_teams.to_csv("dataset/bad_teams_2.csv", index=False)

In [None]:
# Print the column summary (dtypes, non-null counts) of `df_bad_teams`.
df_bad_teams.info()