In [0]:
!pip install bs4


[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


# About this small project
This project entails extracting data from a Pokémon website using web scraping techniques to gather information such as each Pokémon's name, types, and base statistics. Once extracted, the data is transformed with pandas into a structured format suitable for analysis: the combined type field is split into two columns and the numeric fields are converted to integers.

Next, the transformed data is loaded into a Databricks database table, leveraging Databricks utilities or SQL commands to define the schema and store the dataset securely. This step ensures that the data is easily accessible and queryable within the Databricks environment.

Finally, for visualization purposes, sample visualizations are generated to explore insights such as distributions of Pokémon types, strengths, or evolutionary trends. These visualizations offer a clear, graphical representation of the data, aiding in understanding and decision-making based on Pokémon characteristics.

Overall, this project integrates web scraping, data transformation, database management, and visualization techniques to analyze Pokémon data efficiently within the Databricks platform.

### Create functions to extract data from the Pokémon web URL into a DataFrame, transform it, and load it into a Databricks database table

In [0]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

def extract_pokedex_data(url, timeout=30):
    """
    Extract Pokémon data from the given URL.

    Downloads the page, looks up the HTML table whose id is 'pokedex' and
    turns it into a DataFrame. Every value is the stripped text of its table
    cell; no type conversion happens here.

    Parameters:
    url (str): The URL of the webpage containing the Pokémon data.
    timeout (float or tuple): Seconds to wait for the server before giving up;
                              forwarded to requests.get. Defaults to 30.

    Returns:
    pandas.DataFrame or None: A DataFrame containing the Pokémon data if the
                              table is found; otherwise, None (also returned
                              when the HTTP status code is not 200).

    Raises:
    requests.exceptions.RequestException: If the request itself fails, e.g. a
                                          connection error or a timeout.
    """
    # Without an explicit timeout requests can wait indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    pokedex_table = soup.find('table', {'id': 'pokedex'})

    if pokedex_table is None:
        print(f"Table with id='pokedex' not found on {url}.")
        return None

    table_rows = pokedex_table.find_all('tr')

    # Column names come from the first row only, so a stray <th> elsewhere in
    # the table cannot make the header list longer than the data rows.
    header_cells = table_rows[0].find_all('th') if table_rows else []
    headers = [cell.text.strip() for cell in header_cells]

    data = []
    for row in table_rows[1:]:  # skip header row
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if cells:  # ignore rows that carry no data cells
            data.append(cells)

    return pd.DataFrame(data, columns=headers)
    
def transform_data(pokedex_df):
    """
    Transform the extracted Pokémon data.

    This function takes a DataFrame containing the raw Pokémon data,
    splits the 'Type' column into 'Type1' and 'Type2', converts relevant
    columns to nullable integers, and rearranges the columns. The input
    DataFrame is left untouched, so the function can be re-run on the same
    raw data.

    Parameters:
    pokedex_df (pandas.DataFrame): A DataFrame containing the raw Pokémon data.
                                   It must provide the columns '#', 'Name',
                                   'Type', 'Total', 'HP', 'Attack', 'Defense',
                                   'Sp. Atk', 'Sp. Def' and 'Speed'.

    Returns:
    pandas.DataFrame: A new DataFrame with the columns '#', 'Name', 'Type1',
                      'Type2', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk',
                      'Sp. Def', 'Speed' (in that order).

    Raises:
    ValueError: If pokedex_df is None (e.g. because extraction failed), or if
                a numeric column holds text that cannot be parsed as a number.
    """
    if pokedex_df is None:
        raise ValueError("No Pokémon data to transform: pokedex_df is None.")

    numeric_columns = ('#', 'Total', 'HP', 'Attack', 'Defense',
                       'Sp. Atk', 'Sp. Def', 'Speed')
    output_columns = ['#', 'Name', 'Type1', 'Type2', 'Total', 'HP', 'Attack',
                      'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']

    # Work on a copy so the caller's raw frame keeps its 'Type' column.
    df = pokedex_df.copy()

    # n=1 caps the split at two parts. The membership check covers input in
    # which no row has a second type, where the split yields one column only.
    type_parts = df['Type'].str.split(" ", n=1, expand=True)
    for position, column in enumerate(('Type1', 'Type2')):
        df[column] = type_parts[position] if position in type_parts.columns else None
    df = df.drop(columns=['Type'])

    # Int64 is pandas' nullable integer dtype, so missing values stay missing.
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column]).astype('Int64')

    return df[output_columns]
    
def load_to_existing_table(spark, cleaned_pokedex, table_name):
    """
    Load the transformed Pokémon data into an existing table in Databricks.

    The pandas DataFrame is converted to a Spark DataFrame with an explicit
    schema, registered as a temporary view, and appended to the target table
    with INSERT INTO ... SELECT *. The temporary view is dropped afterwards so
    it does not linger in the Spark session.

    Errors are reported with print() and are not re-raised.

    Parameters:
    spark (SparkSession): Spark session object.
    cleaned_pokedex (pandas.DataFrame): A DataFrame containing the transformed Pokémon data.
    table_name (str): The name of the existing table in Databricks. It is
                      interpolated into the SQL statement as-is, so it must
                      come from a trusted source.

    Returns:
    None
    """
    view_name = "temp_pokedex_view"
    view_registered = False

    try:
        # The Spark DataFrame uses these field names (e.g. SpAtk, SpDef).
        schema = StructType([
            StructField("#", IntegerType(), True),
            StructField("Name", StringType(), True),
            StructField("Type1", StringType(), True),
            StructField("Type2", StringType(), True),
            StructField("Total", IntegerType(), True),
            StructField("HP", IntegerType(), True),
            StructField("Attack", IntegerType(), True),
            StructField("Defense", IntegerType(), True),
            StructField("SpAtk", IntegerType(), True),
            StructField("SpDef", IntegerType(), True),
            StructField("Speed", IntegerType(), True)
        ])

        spark_df = spark.createDataFrame(cleaned_pokedex, schema=schema)

        spark_df.createOrReplaceTempView(view_name)
        view_registered = True

        spark.sql(f"INSERT INTO {table_name} SELECT * FROM {view_name}")

        print(f"Data loaded successfully into existing table '{table_name}' in Databricks.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    finally:
        # Remove the helper view whether or not the insert succeeded.
        if view_registered:
            spark.catalog.dropTempView(view_name)




# Create the table and check that it was created

In [0]:
%sql

-- Target Delta table for the scraped Pokédex rows.
-- The name is schema-qualified to match the loader and the queries below,
-- which all address default.pokedex.
-- Column order matters: the loader fills the table with
-- INSERT INTO ... SELECT *, so the schema built in load_to_existing_table
-- must list its fields in this same order.
CREATE OR REPLACE TABLE default.pokedex (
    `#` INT,
    Name STRING,
    Type1 STRING,
    Type2 STRING,
    Total INT,
    HP INT,
    Attack INT,
    Defense INT,
    `SpAtk` INT,
    `SpDef` INT,
    Speed INT
)
USING DELTA
LOCATION 'dbfs:/user/delta';

In [0]:
%sql
-- Check that the table was created (it holds no rows until the load cell runs).
SELECT * FROM default.pokedex

#,Name,Type1,Type2,Total,HP,Attack,Defense,SpAtk,SpDef,Speed


# Execute the pipeline

In [0]:
if __name__ == "__main__":
    url = "https://pokemondb.net/pokedex/all"
    table_name = "default.pokedex"

    # getOrCreate() returns the already-running session when one exists, so
    # in a notebook this reuses the session the other cells share.
    spark = SparkSession.builder \
        .appName("PokedexDataLoader") \
        .getOrCreate()

    try:
        # Extract Pokémon data; None signals that the page or table was unavailable
        pokedex_df = extract_pokedex_data(url)

        if pokedex_df is None:
            print("No Pokémon data was extracted; nothing was loaded.")
        else:
            # Transform, then load the result into the existing table
            cleaned_pokedex = transform_data(pokedex_df)
            load_to_existing_table(spark, cleaned_pokedex, table_name)

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    # The session is deliberately not stopped here: the %sql cells that follow
    # run on the same Spark session and would fail after spark.stop().

## This query checks whether the Databricks table has been populated with data.

In [0]:
%sql

-- List every loaded row in ascending Pokédex-number order.
SELECT *
FROM default.pokedex
ORDER BY `#` ASC;

#,Name,Type1,Type2,Total,HP,Attack,Defense,SpAtk,SpDef,Speed
1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45
2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60
3,Venusaur,Grass,Poison,525,80,82,83,100,100,80
3,Venusaur Mega Venusaur,Grass,Poison,625,80,100,123,122,120,80
4,Charmander,Fire,,309,39,52,43,60,50,65
5,Charmeleon,Fire,,405,58,64,58,80,65,80
6,Charizard,Fire,Flying,534,78,84,78,109,85,100
6,Charizard Mega Charizard X,Fire,Dragon,634,78,130,111,130,85,100
6,Charizard Mega Charizard Y,Fire,Flying,634,78,104,78,159,115,100
7,Squirtle,Water,,314,44,48,65,50,64,43


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
%sql

-- Mean Attack stat per primary type, listed alphabetically by type.
SELECT
    Type1,
    AVG(Attack) AS avg_atk
FROM default.pokedex
GROUP BY Type1
ORDER BY Type1 ASC;


Type1,avg_atk
Bug,70.91208791208791
Dark,85.48214285714286
Dragon,103.81632653061224
Electric,73.14864864864865
Fairy,71.06451612903226
Fighting,104.96
Fire,84.47368421052632
Flying,81.9
Ghost,72.0
Grass,77.7433628318584


Databricks visualization. Run in Databricks to view.