# Data retrieval from MUL site for Mech Assist application
This code retrieves the whole list of units from http://masterunitlist.info/ and their Alpha Strike parameters.

Note: this code was written with ChatGPT assistance.

In [9]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
import pandas as pd
from tqdm.auto import tqdm

Define the URL 

In [2]:
# Master Unit List filter page restricted to battlemechs (Types=18)
search_url = (
    "http://www.masterunitlist.info"
    "/Unit/Filter?Types=18"
)

Connect and get confirmation of connection

In [23]:
# Download the unit listing page and parse it into `soup` for the cells below.
# The timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(search_url, timeout=30)

# Fail loudly on a bad status: merely printing a message would leave `soup`
# undefined (or stale from an earlier run) and the following cells would then
# break in a confusing way or silently work on old data.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to retrieve the webpage (HTTP {response.status_code}): {search_url}"
    )

soup = BeautifulSoup(response.text, 'html.parser')
print("Access granted!")

Access granted!


Make a list of all mechs

In [35]:
# Collect the link text of every anchor that points at a unit details page.
# The text is kept exactly as it appears on the page (no whitespace trimming).
unit_names = []
for anchor in soup.find_all('a', href=True):
    if anchor['href'].startswith("/Unit/Details/"):
        unit_names.append(anchor.text)

4018


['Cameroon  ',
 'Daemon  ',
 'Hatchetman HCT-3G',
 'Hermes II HER-2X',
 'Phoenix Hawk LAM C']

Create a DataFrame to collect the units' parameters

In [None]:
# Empty frame with one column per unit attribute collected by the loop below
dataset = pd.DataFrame(
    columns=[
        "Name",
        "Model",
        "Role",
        "PV",
        "Type",
        "Size",
        "Move",
        "Short",
        "Medium",
        "Long",
        "Overheat",
        "Armor",
        "Structure",
        "Specials",
        "ImageURL",
    ]
)

Fill `dataset` with units' information:

In [111]:
# Query the MUL QuickList endpoint once per unit name and collect the Alpha
# Strike fields of the matching unit.
#
# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end; appending to a DataFrame inside the loop copies the whole frame on every
# iteration.
unit_records = []
skipped_units = []  # names whose request failed or that had no matching unit
total_units = len(unit_names)

for i, unit_name in enumerate(unit_names):
    if i % 50 == 0:
        percent_complete = (i + 1) / total_units * 100
        print(f"Parsing {i + 1} of {total_units} units - {percent_complete:.2f}% complete")

    # Pass the name through `params` so that requests URL-encodes it; a name
    # pasted straight into the URL breaks the query as soon as it contains
    # characters such as '&', '#' or '+'.
    response = requests.get(
        "https://masterunitlist.azurewebsites.net/Unit/QuickList",
        params={"Name": unit_name},
        timeout=30,
    )

    # Units whose request did not succeed are skipped (and remembered).
    if response.status_code != 200:
        skipped_units.append(unit_name)
        continue

    # Parse the JSON response
    data = response.json()

    # One request may return several units. For example:
    # https://masterunitlist.azurewebsites.net/Unit/QuickList?Name=Phoenix%20Hawk%20PXH-1k
    # Keep only the entries whose name equals the requested one. Both sides are
    # stripped because the link text scraped from the listing page can carry
    # trailing whitespace.
    wanted_name = unit_name.strip()
    matches = [
        item
        for item in (data.get("Units") or [])
        if (item.get("Name") or "").strip() == wanted_name
    ]

    # Without a match there is nothing trustworthy to record. Never fall back
    # to an index left over from a previous unit.
    if not matches:
        skipped_units.append(unit_name)
        continue

    unit = matches[-1]  # if several entries share the name, the last one wins
    role = unit.get("Role") or {}  # tolerate a missing or null Role object

    # Extract the desired information
    unit_records.append({
        "Name": unit.get("Name"),
        "Model": unit.get("Variant"),
        "Role": role.get("Name"),
        "PV": unit.get("BFPointValue", 0),
        "Type": unit.get("BFType", ""),
        "Size": unit.get("BFSize", 0),
        "Move": unit.get("BFMove", ""),
        "Short": unit.get("BFDamageShort", 0),
        "Medium": unit.get("BFDamageMedium", 0),
        "Long": unit.get("BFDamageLong", 0),
        "Overheat": unit.get("BFOverheat", 0),
        "Armor": unit.get("BFArmor", 0),
        "Structure": unit.get("BFStructure", 0),
        "Specials": unit.get("BFAbilities", ""),
        "ImageURL": unit.get("ImageUrl", ""),
    })

# Build the frame in one go, reusing the column layout of the (empty) `dataset`
# frame. dtype=object keeps the columns untyped, matching what row-by-row
# appending to the empty frame produced.
dataset = pd.DataFrame(unit_records, columns=list(dataset.columns), dtype=object)

print("Data has been saved to 'dataset' DataFrame")
if skipped_units:
    print(f"Skipped {len(skipped_units)} units (request failed or no unit with the exact name)")

Parsing 1 of 4018 units - 0.02% complete
Parsing 51 of 4018 units - 1.27% complete
Parsing 101 of 4018 units - 2.51% complete
Parsing 151 of 4018 units - 3.76% complete
Parsing 201 of 4018 units - 5.00% complete
Parsing 251 of 4018 units - 6.25% complete
Parsing 301 of 4018 units - 7.49% complete
Parsing 351 of 4018 units - 8.74% complete
Parsing 401 of 4018 units - 9.98% complete
Parsing 451 of 4018 units - 11.22% complete
Parsing 501 of 4018 units - 12.47% complete
Parsing 551 of 4018 units - 13.71% complete
Parsing 601 of 4018 units - 14.96% complete
Parsing 651 of 4018 units - 16.20% complete
Parsing 701 of 4018 units - 17.45% complete
Parsing 751 of 4018 units - 18.69% complete
Parsing 801 of 4018 units - 19.94% complete
Parsing 851 of 4018 units - 21.18% complete
Parsing 901 of 4018 units - 22.42% complete
Parsing 951 of 4018 units - 23.67% complete
Parsing 1001 of 4018 units - 24.91% complete
Parsing 1051 of 4018 units - 26.16% complete
Parsing 1101 of 4018 units - 27.40% comple

Dataset overview

In [112]:
# Column dtypes and non-null counts of the collected data
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4017 entries, 0 to 4016
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Name       4017 non-null   object
 1   Model      3995 non-null   object
 2   Role       4017 non-null   object
 3   PV         4017 non-null   object
 4   Type       3917 non-null   object
 5   Size       4017 non-null   object
 6   Move       4017 non-null   object
 7   Short      4017 non-null   object
 8   Medium     4017 non-null   object
 9   Long       4017 non-null   object
 10  Overheat   4017 non-null   object
 11  Armor      4017 non-null   object
 12  Structure  4017 non-null   object
 13  Specials   3696 non-null   object
 14  ImageURL   4017 non-null   object
dtypes: object(15)
memory usage: 470.9+ KB


In [115]:
# Number of distinct values in each column
dataset.nunique()

Name         4017
Model        2832
Role            9
PV             71
Type            2
Size            5
Move           91
Short          14
Medium         11
Long            9
Overheat        5
Armor          21
Structure      11
Specials     1572
ImageURL      916
dtype: int64

Clean the dataset into a new DataFrame (so the initial data stays accessible).
All units with 0 PV can be dropped, as they shouldn't be used during a game.

In [134]:
# Keep only the rows whose PV is not 0 and renumber the index from zero;
# `dataset` itself is left untouched.
unitlist = (
    dataset
    .loc[dataset['PV'].ne(0)]
    .reset_index(drop=True)
)
unitlist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3916 entries, 0 to 3915
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Name       3916 non-null   object
 1   Model      3894 non-null   object
 2   Role       3916 non-null   object
 3   PV         3916 non-null   object
 4   Type       3916 non-null   object
 5   Size       3916 non-null   object
 6   Move       3916 non-null   object
 7   Short      3916 non-null   object
 8   Medium     3916 non-null   object
 9   Long       3916 non-null   object
 10  Overheat   3916 non-null   object
 11  Armor      3916 non-null   object
 12  Structure  3916 non-null   object
 13  Specials   3695 non-null   object
 14  ImageURL   3916 non-null   object
dtypes: object(15)
memory usage: 459.0+ KB


Check all NA values and replace them if applicable

In [131]:
# First rows of the units that have no Model value
unitlist[unitlist["Model"].isna()].head()

Unnamed: 0,Name,Model,Role,PV,Type,Size,Move,Short,Medium,Long,Overheat,Armor,Structure,Specials,ImageURL
170,Arbalest,,Missile Boat,26,BM,1,"10""",3,3,2,0,2,2,"CASE,ECM,IF1,LRM1/1/1",https://i.ibb.co/89B7WZX/arbalest-3085.png
273,Koshi (Standard),,Striker,30,BM,1,"14""/12""j",3,3,0,0,2,1,"CASE,JMPW1,LTAG,PRB,RCN,SRM2/2",https://i.ibb.co/4p5Q7X3/koshi-3150.png
482,Pack Hunter II,,Striker,34,BM,1,"14""j",3,3,2,0,3,2,CASE,https://i.ibb.co/9g5VR4J/pack-hunter-ii-3085.png
618,Cougar-XR,,Missile Boat,36,BM,1,"10""/14""j",3,3,2,0,4,2,"CASE,IF1,JMPS1,RFA",https://i.ibb.co/G9JBR33/cougar-xr.png
619,Eyrie,,Striker,35,BM,1,"12""/14""j",4,3,0,0,4,2,"CASE,JMPS1,MEL",https://i.ibb.co/C1Gs4sr/eyrie-3145.png


Model can be None

In [135]:
# First rows of the units that have no Specials value
unitlist[unitlist["Specials"].isna()].head()

Unnamed: 0,Name,Model,Role,PV,Type,Size,Move,Short,Medium,Long,Overheat,Armor,Structure,Specials,ImageURL
13,Prey Seeker PY-SR30,PY-SR30,Scout,17,BM,1,"24""",1,0,0,0,2,1,,https://i.ibb.co/WkLHSR4/prey-seeker-3150.png
27,Cossack C-SK1,C-SK1,Striker,17,BM,1,"12""j",2,2,0,0,2,1,,https://i.ibb.co/jhmq1pX/cossack-3060.png
46,Fireball ALM-7D,ALM-7D,Scout,17,BM,1,"22""",1,0,0,0,2,1,,https://i.ibb.co/p0KNM8N/fireball-3055u.png
47,Fireball ALM-8D,ALM-8D,Scout,20,BM,1,"22""",2,1,0,0,2,1,,https://i.ibb.co/p0KNM8N/fireball-3055u.png
48,Fireball ALM-9D,ALM-9D,Scout,19,BM,1,"22""",1,1,0,0,2,1,,https://i.ibb.co/p0KNM8N/fireball-3055u.png


Specials can be None

Add information for each unit about its era availability

In [209]:
# Define the list of eras
eras = [
    {"Name": "Star League (2571 - 2780)", "ID": "star-league"},
    {"Name": "Early Succession War (2781 - 2900)", "ID": "early-succession-war"},
    {"Name": "Late Succession War - LosTech (2901 - 3019)", "ID": "late-succession-war---lostech"},
    {"Name": "Late Succession War - Renaissance (3020 - 3049)", "ID": "late-succession-war---renaissance"},
    {"Name": "Clan Invasion (3050 - 3061)", "ID": "clan-invasion"},
    {"Name": "Civil War (3062 - 3067)", "ID": "civil-war"},
    {"Name": "Jihad (3068 - 3085)", "ID": "jihad"},
    {"Name": "Early Republic (3086 - 3100)", "ID": "early-republic"},
    {"Name": "Late Republic (3101 - 3130)", "ID": "late-republic"},
    {"Name": "Dark Ages (3131 - 3150)", "ID": "dark-age"},
    {"Name": "ilClan (3151 - 9999)", "ID": "ilclan"}
]

# Create era availabilty dataset to fill later
era_av = pd.DataFrame(columns=[era["Name"] for era in eras])

Unnamed: 0,Star League (2571 - 2780),Early Succession War (2781 - 2900),Late Succession War - LosTech (2901 - 3019),Late Succession War - Renaissance (3020 - 3049),Clan Invasion (3050 - 3061),Civil War (3062 - 3067),Jihad (3068 - 3085),Early Republic (3086 - 3100),Late Republic (3101 - 3130),Dark Ages (3131 - 3150),ilClan (3151 - 9999)


Fill `era_av` with units' information

In [210]:
# Collect, for every unit, the factions listed for it in each era.
# The listing page is fetched again here, so this cell also (re)defines
# `search_url`, `soup` and `unit_urls`.

# Define the URL for all battlemechs
search_url = "http://www.masterunitlist.info/Unit/Filter?Types=18"

# Send an HTTP GET request to the URL (with a timeout so the cell cannot hang)
response = requests.get(search_url, timeout=30)

# Stop here on failure instead of reporting success with an empty result
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to retrieve the webpage (HTTP {response.status_code}): {search_url}"
    )

soup = BeautifulSoup(response.text, 'html.parser')

# Find all unit URLs
unit_urls = [a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith("/Unit/Details/")]

# Rows are accumulated in a list and converted to a DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
era_records = []
failed_unit_urls = []  # detail pages that could not be fetched or parsed

for i, unit_url in enumerate(unit_urls):
    if i % 50 == 0:
        percent_complete = (i + 1) / len(unit_urls) * 100
        print(f"Parsing {i + 1} of {len(unit_urls)} units - {percent_complete:.2f}% complete")

    unit_details_url = f"http://www.masterunitlist.info{unit_url}"
    unit_response = requests.get(unit_details_url, timeout=30)

    # Do not parse error pages as if they were unit pages
    if unit_response.status_code != 200:
        failed_unit_urls.append(unit_url)
        continue

    unit_soup = BeautifulSoup(unit_response.text, 'html.parser')

    # The unit name is read from the first <h2>; skip pages that have none
    heading = unit_soup.find("h2")
    if heading is None:
        failed_unit_urls.append(unit_url)
        continue

    unit_era = {'Name': heading.get_text().strip()}

    # Make a dict for each unit: era name -> comma-separated link texts found
    # inside the element with the era's id, or "Unknown" if there is no such element
    for era in eras:
        faction_era_element = unit_soup.find(id=era["ID"])
        if faction_era_element is not None:
            factions = [a.get_text().strip() for a in faction_era_element.find_all("a")]
            unit_era[era['Name']] = ", ".join(factions)
        else:
            unit_era[era['Name']] = "Unknown"

    era_records.append(unit_era)

# Era columns first and "Name" last, the layout that appending rows to the
# empty `era_av` frame produced. dtype=object keeps the columns untyped.
era_av = pd.DataFrame(
    era_records,
    columns=[era["Name"] for era in eras] + ["Name"],
    dtype=object,
)

print("Data has been saved to 'era_av' dataset")
if failed_unit_urls:
    print(f"Skipped {len(failed_unit_urls)} unit pages (request failed or no <h2> heading)")



Parsing 1 of 4018 units - 0.02% complete
Parsing 51 of 4018 units - 1.27% complete
Parsing 101 of 4018 units - 2.51% complete
Parsing 151 of 4018 units - 3.76% complete
Parsing 201 of 4018 units - 5.00% complete
Parsing 251 of 4018 units - 6.25% complete
Parsing 301 of 4018 units - 7.49% complete
Parsing 351 of 4018 units - 8.74% complete
Parsing 401 of 4018 units - 9.98% complete
Parsing 451 of 4018 units - 11.22% complete
Parsing 501 of 4018 units - 12.47% complete
Parsing 551 of 4018 units - 13.71% complete
Parsing 601 of 4018 units - 14.96% complete
Parsing 651 of 4018 units - 16.20% complete
Parsing 701 of 4018 units - 17.45% complete
Parsing 751 of 4018 units - 18.69% complete
Parsing 801 of 4018 units - 19.94% complete
Parsing 851 of 4018 units - 21.18% complete
Parsing 901 of 4018 units - 22.42% complete
Parsing 951 of 4018 units - 23.67% complete
Parsing 1001 of 4018 units - 24.91% complete
Parsing 1051 of 4018 units - 26.16% complete
Parsing 1101 of 4018 units - 27.40% comple

In [212]:
# First rows of the era-availability data
era_av.head()

Unnamed: 0,Star League (2571 - 2780),Early Succession War (2781 - 2900),Late Succession War - LosTech (2901 - 3019),Late Succession War - Renaissance (3020 - 3049),Clan Invasion (3050 - 3061),Civil War (3062 - 3067),Jihad (3068 - 3085),Early Republic (3086 - 3100),Late Republic (3101 - 3130),Dark Ages (3131 - 3150),ilClan (3151 - 9999),Name
0,Unknown,Unknown,Unknown,Unknown,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Cameroon
1,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Daemon
2,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Extinct,Extinct,Extinct,Extinct,Hatchetman HCT-3G
3,Unknown,Unknown,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Hermes II HER-2X
4,Unknown,Unknown,Unknown,Unknown,"Clan Jade Falcon, Unique",Extinct,Extinct,Extinct,Extinct,Extinct,Extinct,Phoenix Hawk LAM C


Join both datasets to get the final one:

In [215]:
# Attach the era-availability columns to each unit by matching on "Name".
era_columns = [column for column in era_av.columns if column != "Name"]

unitlist = (
    unitlist
    # Drop era columns left over from a previous run of this cell; joining them
    # a second time fails because the column names overlap.
    .drop(columns=era_columns, errors="ignore")
    .join(
        # Keep one row per name on the right-hand side, otherwise the left join
        # would duplicate every unit whose name occurs more than once.
        era_av.drop_duplicates(subset="Name").set_index("Name"),
        on="Name",
        how="left",
    )
)
unitlist.head()

Unnamed: 0,Name,Model,Role,PV,Type,Size,Move,Short,Medium,Long,...,Early Succession War (2781 - 2900),Late Succession War - LosTech (2901 - 3019),Late Succession War - Renaissance (3020 - 3049),Clan Invasion (3050 - 3061),Civil War (3062 - 3067),Jihad (3068 - 3085),Early Republic (3086 - 3100),Late Republic (3101 - 3130),Dark Ages (3131 - 3150),ilClan (3151 - 9999)
0,Celerity CLR-02-X-D,CLR-02-X-D,Scout,15,BM,1,"40""",0,0,0,...,Unknown,Unknown,Unknown,ComStar,ComStar,ComStar,Extinct,Extinct,Extinct,Extinct
1,Celerity CLR-03-O,CLR-03-O,Scout,15,BM,1,"40""",0,0,0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Republic of the Sphere,Mercenary
2,Celerity CLR-03-OA,CLR-03-OA,Scout,16,BM,1,"40""",0,0,0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Republic of the Sphere,Mercenary
3,Celerity CLR-03-OB,CLR-03-OB,Scout,16,BM,1,"40""",0,0,0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Republic of the Sphere,Mercenary
4,Celerity CLR-03-OC,CLR-03-OC,Scout,16,BM,1,"40""",0,0,0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Republic of the Sphere,Mercenary


Write dataset into csv file:

In [220]:
# Write the final unit list to a CSV file whose name carries the export time.
# `timestamp` was not defined anywhere before this cell, so it is created here.
# NOTE(review): the YYYYMMDD_HHMMSS format is an assumption - adjust if another
# naming scheme is expected by the consumer of the file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
path = f"unit_list_{timestamp}.csv"
unitlist.to_csv(path, index=False)