In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

import requests
from bs4 import BeautifulSoup

# Jugadores reales

In [None]:

# Quick reachability check against the first page of Transfermarkt's U21 market-value ranking.
url = "https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop?land_id=0&ausrichtung=alle&spielerposition_id=alle&altersklasse=u21&jahrgang=0&kontinent_id=0&plus=1"
# Browser-like User-Agent sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
response = requests.get(url, headers=headers, timeout=30)
response

In [None]:
# Function that scrapes the 20 Transfermarkt pages we take the data from (here, the U21 players)

import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_data_from_page(url):
    """Scrape one Transfermarkt ranking page into a list of player records.

    Args:
        url: Address of the page to download.

    Returns:
        A list with one dict per ``table.items > tbody > tr`` row, with the
        keys "Player", "Age", "Nat.", "Club" and "Market value". Each value is
        the stripped text (or ``title`` attribute) found in the row, or "N/A"
        when the element or its ``title`` attribute is missing.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    # A timeout keeps a stalled connection from blocking the whole scraping loop.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page into zero rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    def text_of(elem):
        # Stripped text of a matched element, or the placeholder when nothing matched.
        return elem.text.strip() if elem is not None else "N/A"

    def title_of(elem):
        # Stripped ``title`` attribute; also covers a matched element without that attribute.
        title = elem.get('title') if elem is not None else None
        return title.strip() if title is not None else "N/A"

    players = []
    for row in soup.select("table.items > tbody > tr"):
        players.append({
            "Player": text_of(row.select_one('td.hauptlink')),
            "Age": text_of(row.select_one('td.zentriert:nth-child(3)')),
            "Nat.": title_of(row.select_one('.zentriert .flaggenrahmen')),
            "Club": title_of(row.select_one('td.zentriert a')),
            "Market value": text_of(row.select_one('.rechts.hauptlink')),
        })

    return players

# Inicializar una lista para almacenar todos los datos
# Accumulator for the records of every scraped page.
all_players = []

# Visit ranking pages 1 through 20 and gather their rows.
for page in range(1, 21):
    url = f"https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop?page={page}&land_id=0&ausrichtung=alle&spielerposition_id=alle&altersklasse=u21&jahrgang=0&kontinent_id=0&plus=1"
    all_players += extract_data_from_page(url)

# Turn the collected records into a pandas DataFrame.
df_sub21 = pd.DataFrame(data=all_players)

In [None]:
# Load the "appearances" table from an existing dataset.

df_apariciones = pd.read_csv(filepath_or_buffer="appearances.csv")

In [None]:
# Join the appearances data onto the U21 table by player name, which adds three
# columns: minutes_played, goals and assists.

stat_columns = ['minutes_played', 'goals', 'assists']

# Keep only appearance rows whose player name occurs in the U21 table.
is_u21_player = df_apariciones['player_name'].isin(df_sub21['Player'])
df_filtered = df_apariciones.loc[is_u21_player, ['player_name'] + stat_columns]

# Total each statistic per player.
df_aggregated = (
    df_filtered
    .groupby('player_name')
    .agg({column: 'sum' for column in stat_columns})
    .reset_index()
)

# Rename 'player_name' so the key matches the U21 DataFrame.
df_aggregated = df_aggregated.rename(columns={'player_name': 'Player'})

# Left-join the totals onto the U21 table using the 'Player' column.
merged_df = df_sub21.merge(df_aggregated, on='Player', how='left')

In [None]:
# Load the players table from an existing dataset.

df_players = pd.read_csv(filepath_or_buffer="players.csv")

In [None]:
# Add the position, foot and height columns to our table, matching rows by player name.

df_players = df_players.rename(columns={'name': 'Player'})
player_attributes = df_players[['Player', 'position', 'foot', 'height_in_cm']]
merged_df = (
    merged_df
    .merge(player_attributes, on='Player', how='left')
    .reset_index(drop=True)
)

In [None]:
# Clean-up step after adding the new columns: count the nulls in each column.

merged_df.isna().sum()

In [None]:
# Drop every row that contains a null and keep the result.
# Rebinding the name (rather than inplace=True) leaves the previous frame untouched.

merged_df = merged_df.dropna()

In [None]:
# Keep the first row for each player name, then renumber the index from zero.
merged_df = (
    merged_df
    .drop_duplicates(subset='Player', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Keep the first 350 rows. The explicit copy makes df_u21 an independent frame, so later
# column assignments neither raise SettingWithCopyWarning nor depend on merged_df.
df_u21 = merged_df.head(350).copy()

In [None]:
# Convert Age to a numeric dtype; values that cannot be parsed become NaN.
# assign() returns a new frame, so nothing is written into a slice of merged_df.
df_u21 = df_u21.assign(Age=pd.to_numeric(df_u21['Age'], errors='coerce'))

In [None]:
# Convert market-value labels such as "€180.00m" or "€900k" into plain euro amounts.
# Pasting zeros after the text (the previous approach) turned "€180.00m" into
# "180.00000000", i.e. 180.0, while "€900k" became 900000, mixing units in one column.
_UNIT_FACTORS = {'k': 1_000, 'm': 1_000_000, 'bn': 1_000_000_000}

_value_parts = (
    df_u21['Market value']
    .astype(str)
    .str.strip()
    .str.extract(r'^€?\s*(?P<amount>\d+(?:\.\d+)?)\s*(?P<unit>bn|m|k)?$')
)
# Labels that do not match the pattern yield NaN, like errors='coerce' did before.
_amount = pd.to_numeric(_value_parts['amount'], errors='coerce')
# A missing unit means the amount is already in euros (this also keeps the cell re-runnable).
_factor = _value_parts['unit'].map(_UNIT_FACTORS).fillna(1).astype(float)

df_u21 = df_u21.assign(**{'Market value': _amount * _factor})

In [None]:
# Inspect the dtype of each column to confirm the conversions took effect.

df_u21.dtypes

In [None]:
# Save the cleaned U21 table to CSV without the index column.
df_u21.to_csv(path_or_buf="real_players_u21.csv", index=False)

# Jugadores FIFA

In [None]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup

import time

In [None]:
# Used for the scraping step. This is the sofifa.com player-listing URL (not Transfermarkt);
# a 200 response means the page can be fetched.

url = "https://sofifa.com/players?type=all&ael=15&aeh=21&col=vl&sort=desc&r=240049&set=true&showCol%5B0%5D=ae&showCol%5B1%5D=hi&showCol%5B2%5D=pf&showCol%5B3%5D=oa&showCol%5B4%5D=pt&showCol%5B5%5D=bp&showCol%5B6%5D=vl&showCol%5B7%5D=wg&showCol%5B8%5D=tt&showCol%5B9%5D=pac&showCol%5B10%5D=sho&showCol%5B11%5D=pas&showCol%5B12%5D=dri&showCol%5B13%5D=def&showCol%5B14%5D=phy&showCol%5B15%5D=tg&offset=0"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
response = requests.get(url, headers=headers, timeout=30)
response

In [None]:
# Empty accumulator for the scraped FIFA players, with one column per scraped field.
df = pd.DataFrame(
    columns=[
        'name',
        'nation',
        'position',
        'age',
        'overall',
        'potential',
        'total_gk',
        'total_stats',
        'pace',
        'shooting',
        'passing',
        'dribbling',
        'defending',
        'physical',
        'team',
        'altura',
        'foot',
        'cost',
    ]
)

In [None]:
def _cost_in_millions(label):
    """Convert a value label such as '€134.5M' or '€900K' to millions of euros.

    The result is a string so the caller keeps producing text fields. Labels
    already in millions ('M') are passed through with the currency sign and
    suffix removed; thousands ('K') and amounts without a suffix are rescaled
    to millions. Returns '' when no number can be read from the label.
    """
    value = label.strip().lstrip('€')
    if not value:
        return ''
    suffix = value[-1].upper()
    if suffix == 'M':
        return value[:-1]
    if suffix == 'K':
        number, divisor = value[:-1], 1_000
    else:
        number, divisor = value, 1_000_000
    try:
        return str(float(number) / divisor)
    except ValueError:
        return ''


def get_info(player):
    """Extract one player's fields from a row of the scraped players table.

    Args:
        player: Parsed table row (a BeautifulSoup tag) exposing ``find`` and
            ``find_all``.

    Returns:
        A list of 18 values in this order: name, nation, position, age,
        overall, potential, total_gk, total_stats, pace, shooting, passing,
        dribbling, defending, physical, team, altura, foot, cost. ``cost`` is
        expressed in millions of euros regardless of the K/M suffix shown on
        the page.

    Raises:
        ValueError: If the row does not contain 10 (or 11) ``<em>`` cells.
    """
    nombre = player.find('a')['data-tippy-content']
    position = player.find('span').text
    age = player.find('td', attrs={'data-col' : 'ae'}).text
    team = player.find_all('td')[5].find('a').text
    nation = player.find('img', class_=['flag loaded','flag'])['title']

    # Collect the <em> cells once instead of re-querying the row for every use.
    ratings = [stats.text for stats in player.find_all('em')]
    if len(ratings) == 11:
        # Rows with 11 cells carry one extra value in third position; discard it.
        del ratings[2]
    if len(ratings) != 10:
        raise ValueError(
            f"expected 10 rating cells, found {len(ratings)} for player {nombre!r}"
        )
    overall, potential, total_gk, total_stats, pace, shooting, passing, dribbling, defending, physical = ratings

    # First whitespace-separated token minus its last two characters
    # (presumably the "cm" unit; TODO confirm against the page).
    altura = player.find('td', attrs={'data-col' : 'hi'}).text.split()[0][:-2]
    foot  = player.find('td', attrs={'data-col' : 'pf'}).text
    # Slicing with [1:-1] dropped the K/M suffix, so thousands and millions were conflated.
    cost = _cost_in_millions(player.find('td', attrs={'data-col' : 'vl'}).text)
    return [nombre, nation, position, age, overall, potential, total_gk, total_stats, pace, shooting, passing, dribbling, defending, physical, team, altura, foot, cost]

In [None]:
# Start a Chrome session driven by Selenium.
driver = webdriver.Chrome()

try:
    driver.get(url)

    try:
        # Click the cookie-consent button if it is already present.
        # find_element does not wait: a banner that renders later is reported as not found.
        consent_button = driver.find_element(By.CSS_SELECTOR, "button.fc-button.fc-cta-consent.fc-primary-button")
        consent_button.click()
        print("Botón de consentimiento de cookies encontrado y clicado.")
    except NoSuchElementException:
        print("No se encontró el botón de consentimiento de cookies. Puede que ya se haya cerrado o no sea necesario.")
    
    page_number = 1
    while True:
        # Fixed pause to give the page time to finish loading.
        time.sleep(5)

        # Parse the HTML of the page currently shown by the browser.
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Build this page's frame in one step from a list of rows (the first <tr> is skipped)
        # instead of growing a DataFrame one .loc assignment at a time.
        page_rows = [get_info(player) for player in soup.find_all('tr')[1:]]
        df_temp = pd.DataFrame(page_rows, columns=['name','nation','position','age','overall','potential','total_gk', 'total_stats', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physical', 'team', 'altura', 'foot', 'cost'])
        # Append per page so rows gathered so far survive a failure on a later page.
        df = pd.concat([df, df_temp], ignore_index=True, axis=0)

        print(f"HTML de la página {page_number} capturado y guardado con éxito.")
        
        # Look for the "Next" link and follow it; its absence ends the loop.
        try:
            next_button = driver.find_element(By.XPATH, "//a[contains(text(), 'Next ')]")
            next_href = next_button.get_attribute('href')
            print(f"Href del botón 'Next' en la página {page_number}: {next_href}")

            # Navigate to the next page through the extracted href.
            driver.get(next_href)
            page_number += 1
        except NoSuchElementException:
            print("No se encontró el botón 'Next', se asume que no hay más páginas.")
            break

finally:
    # Always close the browser, even if scraping failed part-way.
    driver.quit()

In [None]:
# Show the DataFrame accumulated by the scraping loop.
df

In [None]:
# Save the scraped FIFA players to CSV without the index column.
df.to_csv(path_or_buf='fifa_players.csv', index=False)

In [2]:
# Reload the scraped FIFA players from disk.
# NOTE(review): this reads from ../Datos/, while the scraping section writes
# 'fifa_players.csv' to the working directory. Confirm the file was moved there.
df_u21fifa = pd.read_csv("../Datos/fifa_players.csv")

In [5]:
# Drop rows with missing values. Rebinding the name (rather than inplace=True)
# leaves the frame loaded from disk untouched.
df_u21fifa = df_u21fifa.dropna()

In [10]:
# Keep only the first 2000 rows of the table.
df_u21fifa = df_u21fifa.iloc[:2000]

In [11]:
# Display the trimmed FIFA U21 table.
df_u21fifa

Unnamed: 0,name,nation,position,age,overall,potential,total_gk,total_stats,pace,shooting,passing,dribbling,defending,physical,team,altura,foot,cost
0,Jamal Musiala,Germany,CAM,20,86,93,42,2090,85,75,76,91,63,61,FC Bayern München,184,Right,134.5
1,Jude Bellingham,England,CAM,20,88,92,48,2295,80,83,81,87,78,83,Real Madrid,186,Right,128.5
2,Florian Wirtz,Germany,CAM,20,87,92,57,2089,80,76,86,88,50,63,Bayer 04 Leverkusen,177,Right,118.5
3,Pedro González López,Spain,CM,20,86,92,46,2138,78,69,82,88,70,74,FC Barcelona,174,Right,105.0
4,Bukayo Saka,England,RW,21,86,90,50,2196,85,82,81,88,60,70,Arsenal,178,Left,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Thimothée Lo-Tutala,France,GK,20,63,75,314,918,64,62,61,67,21,60,Doncaster Rovers,186,Right,1.0
1996,Manuel Morillo León,Spain,ST,19,62,79,59,1502,76,61,51,65,21,47,Real Betis,182,Right,1.0
1997,Antonino Jastin García López,Portugal,LM,19,62,78,47,1439,77,55,53,65,23,43,Girona,180,Right,1.0
1998,Iker Almena Horcajo,Spain,RM,19,62,80,51,1450,74,54,56,63,25,46,Girona,176,Left,1.0


In [12]:
# Count the missing values remaining in each column.
df_u21fifa.isna().sum()

name           0
nation         0
position       0
age            0
overall        0
potential      0
total_gk       0
total_stats    0
pace           0
shooting       0
passing        0
dribbling      0
defending      0
physical       0
team           0
altura         0
foot           0
cost           0
dtype: int64

In [13]:
# Save the trimmed FIFA U21 table to CSV without the index column.
df_u21fifa.to_csv(path_or_buf="fifa_players_u21.csv", index=False)