In [1]:
%%shell
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
apt-get update  &> /dev/null
apt-get install chromium chromium-driver  &> /dev/null

# Install selenium
pip install selenium  &> /dev/null

Executing: /tmp/apt-key-gpghome.Qg4vEU1rDw/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: public key "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.W3w72YIlqO/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: public key "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1
Executing: /tmp/apt-key-gpghome.9fDkFHNkV5/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: public key "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" imported
gpg: Total number processed: 1
gpg:               imported: 1




In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize
from datetime import datetime
from io import StringIO

from google.colab import drive
drive.mount('/content/drive')

service = Service(executable_path=r'/usr/bin/chromedriver')
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")

options.headless = True

Mounted at /content/drive


In [3]:
filePath ="/content/drive/My Drive/ElderWand/elderWandWTA.csv"
playersPath="/content/drive/My Drive/ElderWand/players/"

In [19]:
def getData(name):
  driver = webdriver.Chrome(service=service, options=options)
  url = "https://www.tennisabstract.com/cgi-bin/wplayer-classic.cgi?p="+name+"&f=ACareerqqC2"
  driver.get(url)
  element = driver.find_element(By.XPATH,'/html/body/div[2]/table/tbody/tr[2]/td[2]')
  element_html =  element.get_attribute('innerHTML')
  driver.quit()
  return element_html

def prepareData(element_html):
  df = pd.read_html(element_html)[0]

  deleteBadTournamets(df)

  data = df.iloc[:, [0,6]]
  data = data.dropna()
  data = data.reset_index(drop=False)
  data.rename(columns={'index': 'id'}, inplace=True)
  data.columns=['id', 'Date', 'Match']
  data[['Winner','Loser']] = data.Match.str.split('d\.', expand=True)
  data['Winner'] = data['Winner'].str.replace(r'\[.*?\]|\(.*?\)', '', regex=True)
  data['Loser'] = data['Loser'].str.replace(r'\[.*?\]|\(.*?\)', '', regex=True)
  data = data.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
  data['Winner'] = data['Winner'].astype('string')
  data['Loser'] = data['Loser'].astype('string')
  data['Date'] = data['Date'].str.replace('‑', '-')
  data['Date'] = pd.to_datetime(data['Date'], format="%d-%b-%Y")
  data = data.sort_values(by=['id'],ascending=False)
  return data

def deleteBadTournamets(data):
  condition = (data['Date']=='21‑Dec‑1973' ) & (data['Tournament'] == 'Bonne Bell Cup' )
  rows = data[condition]
  data.drop(index=data[condition].index.tolist(), inplace=True)

def getRowsFromDateAndMatch(data, date,winner,loser):
  condition = (data['Date']==date ) & (data['Loser'] == loser)&(data['Winner']==winner)
  row_index = data[condition].index[0]
  return data.loc[row_index:]

def getLastWinner(surname,matches,linkName):
  data = getPlayerData(linkName)
  data = data.sort_values(by=['id'],ascending=True)
  result = data[data['Winner']!=surname]
  result = result.iloc[0]
  result['MatchesHeld'] = matches
  return result

def getResult(newData, linkName, data, matchesHeld,surname):
  result = newData[newData['Winner']!=surname]
  if len(result)!=0:
    result = result.iloc[0]
    result['MatchesHeld'] = matchesHeld+ (newData.iloc[0]['id']-result['id'])
    return result
  return getLastWinner(  surname,matchesHeld,linkName)

def getPlayerData(linkName):
  file_path = playersPath + linkName+'.csv'
  if os.path.isfile(file_path):
    data = pd.read_csv(file_path)
    data['Date'] = pd.to_datetime(data['Date'])
  else:
    element_html = getData(linkName)
    data = prepareData(element_html)
    savePlayerData(data, linkName)
  return data

def executeScript(fullName,date,loser,surname):
  linkName = fullName.replace(" ", "")
  data = getPlayerData(linkName)
  newData = getRowsFromDateAndMatch(data,date,surname,loser)
  return getResult(newData,linkName,data, 0,surname)

def savePlayerData(data, name):
  file_path = playersPath +name+'.csv'
  data.to_csv(file_path, index=False)

def saveToFile(df, fullName):
  df = df.drop(columns=['id','Match'])
  df.columns=[ 'Date', 'Overcome by', 'Wand owner', 'Matches held']
  newOrder=['Wand owner', 'Matches held', 'Overcome by', 'Date' ]
  df = df[newOrder]
  df['Wand owner'] = fullName
  if not os.path.isfile(filePath):
      df.to_csv(filePath, index=False)
  else:
      df.to_csv(filePath, mode='a', header=False, index=False)

def processFullName(fullName):
    if(fullName == "Beverly Baker Fleitz"):
      surname = "Baker Fleitz"
    elif(fullName == "Iris Riedel Kuhn"):
      surname = "Riedel Kuhn"
    elif(fullName == "Claudia Kohde Kilsch"):
      surname = "Kohde Kilsch"
    elif(fullName == "Arantxa Sanchez Vicario"):
      surname = "Sanchez Vicario"
    elif(fullName == "Manuela Maleeva Fragniere"):
      surname = "Maleeva Fragniere"
    elif(fullName == "Patricia Hy Boulais"):
      surname = "Hy Boulais"
    elif(fullName == "Brenda Schultz Mccarthy"):
      surname = "Schultz Mccarthy"
    elif(fullName == "Julie Halard Decugis"):
      surname = "Halard Decugis"
    elif(fullName == "Marianne Werdel Witmeyer"):
      surname = "Werdel Witmeyer"
    elif(fullName == "Kimiko Date Krumm"):
      surname = "Date Krumm"
    elif(fullName == "Mariaan De Swardt"):
      surname = "De Swardt"
    elif(fullName == "Ruxandra Dragomir Ilie"):
      surname = "Dragomir Ilie"
    elif(fullName == "Virginia Ruano Pascual"):
      surname = "Ruano Pascual"
    elif(fullName == "Silvia Farina Elia"):
      surname = "Farina Elia"
    elif(fullName == "Jelena Kostanic Tosic"):
      surname = "Kostanic Tosic"
    elif(fullName == "Maria Jose Martinez Sanchez"):
      surname = "Martinez Sanchez"
    elif(fullName == "Carla Suarez Navarro"):
      surname = "Suarez Navarro"
    elif(fullName == "Alison Riske Amritraj"):
      surname = "Riske Amritraj"
    elif(fullName == "Sara Sorribes Tormo"):
      surname = "Sorribes Tormo"
    elif(fullName == "Beatriz Haddad Maia"):
      surname = "Haddad Maia"
    elif(fullName == "Pam Casale Telford"):
      surname = "Casale Telford"
      fullName = "Pam Casale"
    elif(fullName=="Pam Casale"):
      fullName = "Pam Casale Telford"
      surname = "Casale Telford"
    else:
      surname = fullName.split(" ")[-1]
    return fullName,surname

def processBrokenPlayers(winner,loser,fullName,date, matchesHeld):
    result = {
          "id": [555],
          "Date": [date],
          "Match": ["nd"],
          "Winner": [winner],
          "Loser": [loser],
          "MatchesHeld": matchesHeld,
      }
    saveToFile(pd.DataFrame(result),fullName)
    return winner, fullName


In [31]:
#startData
fullName ="Billie Jean King"
loser = 'Rosie Casals'
date = datetime.strptime('28‑Sep‑1971', '%d‑%b‑%Y')
today_date = datetime.now()

if os.path.exists(filePath):
  os.remove(filePath)

while date<today_date:
  defeat = fullName+'    '+ str(date)+'    '+loser
  if(defeat == 'Dianne Fromholtz    1977-11-01 00:00:00    Chris Evert' ):
    fullName='Evonne Goolagong'
    date = datetime.strptime('13‑Mar‑1978', '%d‑%b‑%Y')
    defeat = fullName+'    '+ str(date)+'    '+loser

  if(defeat=='Anne Smith    1982-01-04 00:00:00    Tracy Austin'):
      fullName,loser= processBrokenPlayers('Martina Navratilova','Smith','Anne Smith',date,3)
      defeat = fullName+'    '+ str(date)+'    '+loser

  if(defeat=='Anne Miller    1996-05-20 00:00:00    Kimiko Date Krumm'):
      fullName,loser= processBrokenPlayers('Katarina Studenikova','Miller','Anne Miller',date,2)
      defeat = fullName+'    '+ str(date)+'    '+loser

  if(defeat=='Anne Miller    1998-03-30 00:00:00    Mary Pierce'):
      fullName,loser= processBrokenPlayers('Lisa Raymond','Miller','Anne Miller',date,1)
      defeat = fullName+'    '+ str(date)+'    '+loser

  if(defeat=='Galina Fokina    2002-05-06 00:00:00    Klara Koukalova'):
      fullName,loser= processBrokenPlayers('Amanda Hopmans','Fokina','Galina Fokina',date,1)
      defeat = fullName+'    '+ str(date)+'    '+loser

  if(defeat=='Nadia Petrova    2005-11-07 00:00:00    Maria Sharapova'):
      fullName,loser= processBrokenPlayers('Lindsay Davenport','Petrova','Nadia Petrova',date,1)
      defeat = fullName+'    '+ str(date)+'    '+loser

  if(defeat=='Lindsay Davenport    2005-11-07 00:00:00    Nadia Petrova'):
      fullName,loser= processBrokenPlayers('Mary Pierce','Davenport','Lindsay Davenport',date,1)
      defeat = fullName+'    '+ str(date)+'    '+loser

  if(defeat=='Sofia Kenin    2020-02-07 00:00:00    Anastasija Sevastova'):
      date = datetime.strptime('17‑Feb‑2020', '%d‑%b‑%Y')
      fullName,loser= processBrokenPlayers('Elena Rybakina','Kenin','Sofia Kenin',date,1)
      defeat = fullName+'    '+ str(date)+'    '+loser

  if(defeat=='Maria Sakkari    2021-11-10 00:00:00    Iga Swiatek'):
      fullName,loser= processBrokenPlayers('Anett Kontaveit','Sakkari','Maria Sakkari',date,1)
      defeat = fullName+'    '+ str(date)+'    '+loser

  if(defeat=='Karolina Pliskova    2021-11-10 00:00:00    Garbine Muguruza'):
      date = datetime.strptime('10‑Jan‑2022', '%d‑%b‑%Y')
      fullName,loser= processBrokenPlayers('Daria Kasatkina','Muguruza','Garbine Muguruza',date,2)
      defeat = fullName+'    '+ str(date)+'    '+loser

  if(defeat=='Aryna Sabalenka    2024-11-04 00:00:00    Jasmine Paolini'):
      date = datetime.strptime('04‑Nov‑2024', '%d‑%b‑%Y')
      fullName,loser= processBrokenPlayers("Coco Gauff",'Sabalenka','Aryna Sabalenka',date,1)
      defeat = fullName+'    '+ str(date)+'    '+loser


  fullName, surname = processFullName(fullName)
  defeat = fullName+'    '+ str(date)+'    '+loser
  print(defeat)

  result = executeScript(fullName,date,loser,surname)
  saveToFile(pd.DataFrame(result).T,fullName)
  loser = fullName
  date = result['Date']
  fullName = result['Winner']

  if(loser == "Pam Casale Telford"):
    loser = "Pam Casale"
  if(defeat=="Ashleigh Barty    2022-01-17 00:00:00    Madison Keys"):
    date=datetime.strptime('21‑Feb‑2022', '%d‑%b‑%Y')
    loser="Shelby Rogers"
    fullName ="Coco Gauff"

  if(defeat == "Coco Gauff    2024-11-04 00:00:00    Aryna Sabalenka"):
    break

Billie Jean King    1971-09-28 00:00:00    Rosie Casals
Francoise Durr    1971-12-01 00:00:00    Billie Jean King
Valerie Ziegenfuss    1971-12-07 00:00:00    Francoise Durr
Rosie Casals    1971-12-07 00:00:00    Valerie Ziegenfuss
Kerry Reid    1971-12-07 00:00:00    Rosie Casals
Billie Jean King    1972-01-12 00:00:00    Kerry Reid
Francoise Durr    1972-01-19 00:00:00    Billie Jean King
Rosie Casals    1972-01-19 00:00:00    Francoise Durr
Virginia Wade    1972-01-26 00:00:00    Rosie Casals
Helen Gourlay    1972-02-01 00:00:00    Virginia Wade
Wendy Overton    1972-02-01 00:00:00    Helen Gourlay
Billie Jean King    1972-02-01 00:00:00    Wendy Overton
Chris Evert    1972-02-01 00:00:00    Billie Jean King
Nancy Richey    1972-02-24 00:00:00    Chris Evert
Billie Jean King    1972-03-21 00:00:00    Nancy Richey
Nancy Richey    1972-03-29 00:00:00    Billie Jean King
Billie Jean King    1972-05-03 00:00:00    Nancy Richey
Francoise Durr    1972-08-01 00:00:00    Billie Jean King
Ro

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['MatchesHeld'] = matchesHeld+ (newData.iloc[0]['id']-result['id'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['MatchesHeld'] = matchesHeld+ (newData.iloc[0]['id']-result['id'])


Angelique Kerber    2018-07-02 00:00:00    Jelena Ostapenko
Alize Cornet    2018-08-06 00:00:00    Angelique Kerber
Ashleigh Barty    2018-08-06 00:00:00    Alize Cornet
Simona Halep    2018-08-06 00:00:00    Ashleigh Barty
Kiki Bertens    2018-08-13 00:00:00    Simona Halep
Marketa Vondrousova    2018-08-27 00:00:00    Kiki Bertens
Lesia Tsurenko    2018-08-27 00:00:00    Marketa Vondrousova
Naomi Osaka    2018-08-27 00:00:00    Lesia Tsurenko
Karolina Pliskova    2018-09-17 00:00:00    Naomi Osaka
Qiang Wang    2018-09-24 00:00:00    Karolina Pliskova
Anett Kontaveit    2018-09-24 00:00:00    Qiang Wang
Aryna Sabalenka    2018-09-24 00:00:00    Anett Kontaveit
Qiang Wang    2018-10-01 00:00:00    Aryna Sabalenka
Caroline Wozniacki    2018-10-01 00:00:00    Qiang Wang
Karolina Pliskova    2018-10-22 00:00:00    Caroline Wozniacki
Elina Svitolina    2018-10-22 00:00:00    Karolina Pliskova
Aliaksandra Sasnovich    2018-12-31 00:00:00    Elina Svitolina
Donna Vekic    2018-12-31 00:00:0

In [25]:
# driver = webdriver.Chrome(service=service, options=options)
# url = "https://www.tennisabstract.com/cgi-bin/wplayer-classic.cgi?p=JessicaPegula&f=ACareerqq"
# driver.get(url)
# element = driver.find_element(By.XPATH,'/html/body/div[2]/table/tbody/tr[2]/td[2]')
# element_html =  element.get_attribute('innerHTML')
# driver.quit()
# data = prepareData(element_html)
# savePlayerData(data, "Jessica Pegula")

  df = pd.read_html(element_html)[0]
