# Scraping football data from the TransferMarkt.com website using Python

In [8]:
# Importing the required modules and packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import requests
from bs4 import BeautifulSoup

## Method-1 - Using CSS Selector

In [26]:
# Step-1: Search the data on the TransferMarkt website we want to scrape.

# Step-2: Copy the URL from the website. The number at the end of the URL is the player ID, which is used to fetch all the 
# data related to the player.

# Step-3: Split the URL on every / to get a list of separate elements; the last element is the player ID.

# Profile page of the player to scrape; the final path segment of the URL is the player ID.
url = "https://www.transfermarkt.co.in/erling-haaland/profil/spieler/418560"

# Keep everything after the last '/' in the URL.
player_id = url.rpartition('/')[2]

In [28]:
# Step-4: Send browser-like headers with the request to avoid browser and permission issues. The headers pass our system 
# information, such as the browser version, along with the request.

# Go to WhatismyBrowser.com -> Go to User Agent -> Create a variable named headers and paste the User-Agent value into it, 
# stored as a dictionary entry.

# Request headers sent with every call; the User-Agent value is the browser's own user-agent string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
}

In [5]:
# Step-5: After setting the user agent, we make a request to the web page.
# A timeout is passed so the call raises an error instead of waiting forever if the server never responds.

response = requests.get(url, headers=headers, timeout=10)

In [6]:
# Step-6: Check the status code of the request made (a status_code of 200 means the request was successful)
response.status_code

200

In [9]:
# Step-7: If the status code is 200, parse the response with BeautifulSoup to get the data from the webpage.

# BeautifulSoup turns the HTML page into a soup object, which helps in extracting the data from the webpage.

soup = BeautifulSoup(response.content, features="html.parser")

In [15]:
# Step-8: Fetch data from the webpage by passing a CSS selector to the soup object.
# Here we fetch the player name from the TransferMarkt website.

# Select the headline element first, then keep only the text after its last line break and trim the whitespace.
headline = soup.select_one('h1[class="data-header__headline-wrapper"]')
player_name = headline.text.rpartition('\n')[2].strip()

# Displaying the value of the player name

player_name

'Erling Haaland'

In [20]:
# Step-9: Using the same process, fetch the shirt number of the player.

# Select the shirt-number element, then drop the '#' characters and the surrounding whitespace from its text.
shirt_number_tag = soup.select_one('span[class="data-header__shirt-number"]')
player_number = shirt_number_tag.text.replace('#', '').strip()

# Displaying the value of player_number (still a string at this point)
player_number

# Converting the value into an integer
int(player_number)

9

## Method-2 - Using Regular Expressions (Regex)

In [22]:
# Using regex, we will try to fetch out the contract expiry of the player. 

# Step-1: Go to the website and open View Page Source by right-clicking on the page. In the page source, search for 
# Contract Expires and copy the markup just above and below the actual value.

# Step-2: Go to Regex101 and create a python flavor for that using the available information.
# In the regex, first match the content that identifies what we need, and then create a group around the 
# value associated with that content. 

# In the regex, we first write the label of the value we want to fetch (Contract expires:). Then (.*) skips ahead to 
# (__content\">), which is the end of the opening tag of the span that holds the value. After that, the group (.*?) captures 
# the value itself, and (</span>) at the end stops the capture at the closing tag of that span.

# Step-3: Once we have the Python regex, we use the re package to fetch the value. The soup is converted with str() so that 
# the regex can search it as a string.

# Search the page text for the label, then take the first captured group as the contract expiry value.
contract_match = re.search("Contract expires: .*__content\">(.*?)</span>", str(soup))
player_contract_expiry = contract_match.group(1)
player_contract_expiry

'Jun 30, 2027'

## Method-3 - Using API 

In [23]:
# In this method, we will use the API present for the data we need to fetch. 

# Step-1: Open Inspect (Google Developer Tools) on the website -> Go to the Network tab, where all the API-related data is 
# listed, and find the required API among all the APIs present. 

# Step-2: Once you have found the required API, preview it and cross-check that it is the right one. Then right-click on it, 
# copy it as cURL (bash), convert it into JSON first using an online curl converter, and then convert that into Python code. 
# From that Python code, identify which part needs to go into our code and use it. (Mostly the part that makes the request 
# and stores the response is required.)

# Request the marketValueDevelopment graph endpoint for this player, reusing the same headers as the page request.
# A timeout is passed so the call raises an error instead of waiting forever if the server never responds.
response = requests.get(
    f'https://www.transfermarkt.co.in/ceapi/marketValueDevelopment/graph/{player_id}',
    headers=headers,
    timeout=10,
)

In [24]:
# Check the status code of the API request made (a status_code of 200 means the request was successful)

response.status_code

200

In [25]:
# Displaying the JSON body returned by the API request

response.json()

{'list': [{'x': 1482015600000,
   'y': 16000000,
   'mw': '₹1.6 Cr',
   'datum_mw': 'Dec 18, 2016',
   'verein': 'Bryne FK',
   'age': '16',
   'wappen': 'https://tmssl.akamaized.net/images/wappen/profil/1057.png?lm=1480871779'},
  {'x': 1513983600000,
   'y': 24000000,
   'mw': '₹2.4 Cr',
   'datum_mw': 'Dec 23, 2017',
   'verein': 'Molde FK',
   'age': '17',
   'wappen': 'https://tmssl.akamaized.net/images/wappen/profil/687.png?lm=1409159512'},
  {'x': 1536530400000,
   'y': 160000000,
   'mw': '₹16 Cr',
   'datum_mw': 'Sep 10, 2018',
   'verein': 'Molde FK',
   'age': '18',
   'wappen': ''},
  {'x': 1546124400000,
   'y': 400000000,
   'mw': '₹40 Cr',
   'datum_mw': 'Dec 30, 2018',
   'verein': 'Molde FK',
   'age': '18',
   'wappen': ''},
  {'x': 1559512800000,
   'y': 400000000,
   'mw': '₹40 Cr',
   'datum_mw': 'Jun 3, 2019',
   'verein': 'Red Bull Salzburg',
   'age': '18',
   'wappen': 'https://tmssl.akamaized.net/images/wappen/profil/409_1557781653.png?lm=1557781653'},
  {'x':

In [29]:
# Checking which top-level keys are available in the fetched JSON

response.json().keys()

dict_keys(['list', 'current', 'highest', 'highest_date', 'last_change', 'details_url', 'thread', 'translations'])