In [19]:
import pandas as pd
import re

In [20]:
#  get entire phone, tablet, watch directory
gsm_df = pd.read_json("../data/GSMArena.json")
print(gsm_df.head().to_string(index=False))

brand                 model                                  url
 Acer              Super ZX           acer_super_zx_5g-13796.php
 Acer Acerone Liquid S272E4 acer_acerone_liquid_s272e4-13757.php
 Acer Acerone Liquid S162E4 acer_acerone_liquid_s162e4-13756.php
 Acer     Chromebook Tab 10      acer_chromebook_tab_10-9139.php
 Acer         Iconia Talk S          acer_iconia_talk_s-8306.php


In [21]:
print(gsm_df.brand.unique())

['Acer' 'alcatel' 'Allview' 'Amazon' 'Amoi' 'Apple' 'Archos' 'Asus'
 'AT&amp;T' 'Benefon' 'BenQ' 'BenQ-Siemens' 'Bird' 'BlackBerry'
 'Blackview' 'BLU' 'Bosch' 'BQ' 'Casio' 'Cat' 'Celkon' 'Chea' 'Coolpad'
 'Cubot' 'Dell' 'Doogee' 'Emporia' 'Energizer' 'Ericsson' 'Eten'
 'Fairphone' 'Fujitsu Siemens' 'Garmin-Asus' 'Gigabyte' 'Gionee' 'Google'
 'Haier' 'HMD' 'Honor' 'HP' 'HTC' 'Huawei' 'i-mate' 'i-mobile' 'Icemobile'
 'Infinix' 'Innostream' 'iNQ' 'Intex' 'itel' 'Jolla' 'Karbonn' 'Kyocera'
 'Lava' 'LeEco' 'Lenovo' 'LG' 'Maxon' 'Maxwest' 'Meizu' 'Micromax'
 'Microsoft' 'Mitac' 'Mitsubishi' 'Modu' 'Motorola' 'MWg' 'NEC' 'Neonode'
 'NIU' 'Nokia' 'Nothing' 'Nvidia' 'O2' 'OnePlus' 'Oppo' 'Orange' 'Oscal'
 'Oukitel' 'Palm' 'Panasonic' 'Pantech' 'Parla' 'Philips' 'Plum' 'Posh'
 'Prestigio' 'QMobile' 'Qtek' 'Razer' 'Realme' 'Sagem' 'Samsung' 'Sendo'
 'Sewon' 'Sharp' 'Siemens' 'Sonim' 'Sony' 'Sony Ericsson' 'Spice'
 'T-Mobile' 'TCL' 'Tecno' 'Tel.Me.' 'Telit' 'Thuraya' 'Toshiba' 'Ulefone'
 'Umidigi'

# Focus
According to
[visualcapitalist](https://www.visualcapitalist.com/charted-americas-preferred-smartphone-brands/),
[internationalbrandequity](https://www.internationalbrandequity.com/best-mobile-phone-brands/), and
[manufacturingdigital](https://manufacturingdigital.com/top10/top-10-phone-manufacturers)
the most popular manifacturers were:
- Apple
- Samsung
- Motorola
- Google
- HTC
- Huawei
- Nokia
- OnePlus
- ZTE
- vivo
- Xiaomi
- Oppo
- Lenovo
- Sony
- Realme
- LG

Additional of Interest include:
- Honor
- Asus
- BlackBerry
- TCL

The Nothing Phone is also of interest, but due to the vague nature of the name and common usage,
it is omitted.


In [22]:
brand_subset = [
    "Apple", "Samsung", "Motorola","Google", "HTC",
    "Huawei", "Nokia", "OnePlus", "ZTE", "vivo",
    "Xiaomi", "Oppo", "Lenovo", "Sony", "Realme", "LG", 
    "Honor", "Asus", "BlackBerry", "TCL"
]

In [23]:
gsm_df = gsm_df.loc[gsm_df.brand.isin(brand_subset)]
gsm_df

Unnamed: 0,brand,model,url
745,Apple,iPad Air 13 (2025),apple_ipad_air_13_(2025)-13704.php
746,Apple,iPad Air 11 (2025),apple_ipad_air_11_(2025)-13703.php
747,Apple,iPad (2025),apple_ipad_(2025)-13702.php
748,Apple,iPhone 16e,apple_iphone_16e-13395.php
749,Apple,iPad mini (2024),apple_ipad_mini_(2024)-13437.php
...,...,...,...
12650,ZTE,F101,zte_f101-3101.php
12651,ZTE,F100,zte_f100-3100.php
12652,ZTE,Coral200 Sollar,zte_coral200_sollar-3213.php
12653,ZTE,Blade V20,zte_blade_v20-9913.php


In [24]:
print(gsm_df.groupby("brand")["model"].count())

brand
Apple         133
Asus          207
BlackBerry     92
Google         34
HTC           295
Honor         270
Huawei        487
LG            667
Lenovo        250
Motorola      650
Nokia         591
OnePlus        94
Oppo          350
Realme        256
Samsung       133
Sony          161
TCL            89
Xiaomi        462
ZTE           415
vivo          544
Name: model, dtype: int64


# Helper functions

In [25]:
def normalize_whitespace(x: str):
    return re.sub(r"\s+", " ", x).strip()

# Apple

In [26]:
apple_df = gsm_df[gsm_df["brand"] == "Apple"] \
    .copy() \
    .sort_values("model")

apple_filter = apple_df.model.apply(
    lambda x: re.search(r"(?i)(ipad|watch|CDMA)", x) is None
)
apple_df = apple_df.loc[apple_filter]

# remove year grouping - often not included in postings
apple_df.model = apple_df.model.apply(
    lambda x: re.sub(r"\(.*\)", "", x)
)

# after inspection, all the names are iphones, remove that when creating regex
apple_df.model = apple_df.model.apply(
    lambda x: re.sub(r"(?i)iphone", "", x)
)

# standardize the dataset
apple_df.model = apple_df.model.apply(normalize_whitespace)
apple_df = apple_df.loc[apple_df.model != ""]
apple_df = apple_df.drop_duplicates("model")

# craft regex
apple_df["pattern"] = r"\b(apple|iphone)\b"

print(apple_df.head().to_string(index=False))

brand      model                              url            pattern
Apple         11         apple_iphone_11-9848.php \b(apple|iphone)\b
Apple     11 Pro     apple_iphone_11_pro-9847.php \b(apple|iphone)\b
Apple 11 Pro Max apple_iphone_11_pro_max-9846.php \b(apple|iphone)\b
Apple         12        apple_iphone_12-10509.php \b(apple|iphone)\b
Apple     12 Pro    apple_iphone_12_pro-10508.php \b(apple|iphone)\b


# Samsung

In [27]:
sam_df = gsm_df[gsm_df["brand"] == "Samsung"] \
    .copy() \
    .sort_values("model")

sam_filter = sam_df.model.apply(
    lambda x: re.search(r"(?i)(watch|tab|sgh)", x) is None
)
sam_df = sam_df.loc[sam_filter]

# remove bracket information and band
sam_df.model = sam_df.model.apply(
    lambda x: re.sub(r"(?i)(4g|5g|\(.*\))", "", x)
)

# all smartphones are called galaxy
sam_df = sam_df.loc[
    sam_df.model.str.contains("galaxy", case=False)
]
sam_df.model = sam_df.model.str.replace("Galaxy", "")

# standardize the dataset
sam_df.model = sam_df.model.apply(normalize_whitespace)
sam_df = sam_df.drop_duplicates(["model"], keep="first")

# regex
sam_df["pattern"] = r"\b(samsung|galaxy)\b"

print(sam_df.head().to_string(index=False))

  brand    model                               url              pattern
Samsung      A04      samsung_galaxy_a04-11817.php \b(samsung|galaxy)\b
Samsung A04 Core samsung_galaxy_a04_core-11616.php \b(samsung|galaxy)\b
Samsung     A04e     samsung_galaxy_a04e-11945.php \b(samsung|galaxy)\b
Samsung     A04s     samsung_galaxy_a04s-11803.php \b(samsung|galaxy)\b
Samsung      A05      samsung_galaxy_a05-12583.php \b(samsung|galaxy)\b


# Google

In [28]:
google_df = gsm_df[gsm_df["brand"] == "Google"]\
    .copy()\
    .sort_values("model")

# filter out tablets and watches
google_filter = google_df.model.apply(
    lambda x: re.search(r"(?i)(watch|tablet)", x) is None
)
google_df = google_df.loc[google_filter]

# remove weird named tablet

# remove pixel
google_df.model = google_df.model.str.replace("Pixel", "")

google_df.model = google_df.model.apply(normalize_whitespace)

google_df.model = google_df.model.apply(lambda x: x if x != "" else "1")

google_df = google_df.loc[~google_df["model"].isin(["C"])]

google_df["pattern"] = r"\b(google|pixel)\b"

print(google_df.head().to_string(index=False))

 brand model                        url            pattern
Google     1      google_pixel-8346.php \b(google|pixel)\b
Google     2    google_pixel_2-8733.php \b(google|pixel)\b
Google  2 XL google_pixel_2_xl-8720.php \b(google|pixel)\b
Google     3    google_pixel_3-9256.php \b(google|pixel)\b
Google  3 XL google_pixel_3_xl-9257.php \b(google|pixel)\b


# Motorola

In [29]:
moto_df = gsm_df.loc[gsm_df["brand"] == "Motorola"] \
    .copy() \
    .sort_values("model")

# filter in or out specific groups
moto_keep = moto_df.model.apply(
    lambda x: re.search(r"\b(DROID|Edge|Moto|Nexus|One|Razr|ThinkPhone)\b", x) is not None
)
moto_discard = moto_df.model.apply(
    lambda x: re.search(r"(?i)(watch|pad|dual|tab|360|uw|xyboard)", x) is None
)
moto_df = moto_df.loc[moto_keep & moto_discard]

# replace 4G|5G|sub-groups
moto_df.model = moto_df.model.apply(
    lambda x: re.sub(r"(?i)(4G|5G|\(.*\))", "", x)
)
moto_df.model = moto_df.model.apply(
    lambda x: re.sub(r"20\d{2}", "", x)
)

moto_df.model = moto_df.model.str.replace("Moto", "")

# standardize
moto_df.model = moto_df.model.apply(normalize_whitespace)
moto_df = moto_df.drop_duplicates("model", keep="first")

moto_df["pattern"] = r"\b(motorola|moto)\b"

print(moto_df.head().to_string(index=False))

   brand              model                                  url             pattern
Motorola            DROID 2            motorola_droid_2-3475.php \b(motorola|moto)\b
Motorola     DROID 2 Global     motorola_droid_2_global-3636.php \b(motorola|moto)\b
Motorola            DROID 3            motorola_droid_3-4036.php \b(motorola|moto)\b
Motorola      DROID 4 XT894      motorola_droid_4_xt894-4418.php \b(motorola|moto)\b
Motorola DROID BIONIC XT865 motorola_droid_bionic_xt865-4523.php \b(motorola|moto)\b


# LG

In [30]:
lg_df = gsm_df[gsm_df["brand"] == "LG"] \
    .copy() \
    .sort_values("model")

# filter out tablets, watches, optimus, variants
lg_discard = lg_df.model.apply(
    lambda x: re.search(r"(?i)(pad|watch|optimus|uw|dual|lte)", x) is None
)
lg_discard2 = lg_df.model.apply(
    lambda x: re.search(r"\b\w\d{3,4}\b", x) is None
)
lg_df = lg_df.loc[lg_discard & lg_discard2]

# filter out thinQ, 5g, braces
lg_df.model = lg_df.model.apply(
    lambda x: re.sub(r"(?i)(\(.*\)|5g|thinQ)", "", x)
)

# extract series and models
lg_series = lg_df.model.apply(
    lambda x: re.search(r"\b(G|K|Q|V)\d*\b", x) is None
)
lg_models = lg_df.model.apply(
    lambda x: re.search(r"(?i)\b(nexus|stylo|stylus|velvet)\b", x) is None
)
lg_df = lg_df.loc[~lg_series | ~lg_models]

# standardize datasets
lg_df.model = lg_df.model.apply(normalize_whitespace)
lg_df = lg_df.drop_duplicates("model", keep="first")

lg_df["pattern"] = r"\blg\b"

print(lg_df.head().to_string(index=False))


brand      model                    url pattern
   LG     G Flex     lg_g_flex-5806.php  \blg\b
   LG    G Flex2    lg_g_flex2-6916.php  \blg\b
   LG    G Pro 2    lg_g_pro_2-6052.php  \blg\b
   LG G Pro Lite lg_g_pro_lite-5772.php  \blg\b
   LG    G Stylo    lg_g_stylo-7245.php  \blg\b


# OnePlus

In [31]:
one_df = gsm_df[gsm_df["brand"] == "OnePlus"] \
    .copy() \
    .sort_values("model")

# remove variants, watches, and tablets
one_filter = one_df.model.apply(
    lambda x: re.search(r"(?i)(watch|pad|uw)", x) is None
)
one_df = one_df.loc[one_filter]

# remove variant tags
one_df.model = one_df.model.apply(
    lambda x: re.sub(r"(?i)(5g|mclaren|racing|150w|\(.*\))", "", x)
)

# standardize
one_df.model = one_df.model.apply(normalize_whitespace)
one_df = one_df.drop_duplicates(subset=["model"])

one_df["pattern"] = r"\bone\s*plus\b"

print(one_df.head().to_string())

        brand   model                       url         pattern
7510  OnePlus  10 Pro  oneplus_10_pro-11234.php  \bone\s*plus\b
7507  OnePlus     10R     oneplus_10r-11445.php  \bone\s*plus\b
7502  OnePlus     10T     oneplus_10t-11622.php  \bone\s*plus\b
7497  OnePlus      11      oneplus_11-11893.php  \bone\s*plus\b
7495  OnePlus     11R     oneplus_11r-11915.php  \bone\s*plus\b


## Inspection
1. Based on the results
    - most search results return apple, samsung, google, motorola, lg, oneplus
2. Apple
    - people often write apple or iphone
        - for brand check have an alias for iphone as well
        - remove from model check
3. Samsung
    - people often write samsung or galaxy
        - alias samsung with galaxy
        - remove from model check
4. Google
    - google or pixel
5. Motorola
    - motorola or moto
6. Lg
    - pretty standard
7. Oneplus
    - alias as one plus

## General Notes
- many names are contain +, sub with plus
    - check names against plus not +

## Procedure
1. append title and description
2. replace + with plus
3. normalize the text a-z,0-9
4. match against brand and alias
7. if no brand is matched do a fuzzy match against brand acccept if over 90% certain
5. for each brand match check against models subset if available
6. if no models are found do a fuzzy match against the models
7. if nothing is found set as other


# TABLES
1. brand table
    - brand, pattern
2. model table
    - brand, model, url, pattern

In [32]:
model_df = pd.concat(
    [apple_df, sam_df, google_df, lg_df, one_df, moto_df]
)

In [33]:
# Brand table
brand_df = pd.DataFrame({"brand":[
    "HTC", "Huawei", "Nokia", "ZTE", "vivo", "Xiaomi", "Oppo",
    "Lenovo", "Sony", "Realme", "Honor", "Asus", "BlackBerry", "TCL"
]})

brand_df["pattern"] = brand_df.brand.apply(lambda x: r"\b" + x.lower() + r"\b")

brand_df = pd.concat([
    brand_df,
    model_df[["brand", "pattern"]].drop_duplicates(keep='first')
])

print(brand_df)

           brand               pattern
0            HTC               \bhtc\b
1         Huawei            \bhuawei\b
2          Nokia             \bnokia\b
3            ZTE               \bzte\b
4           vivo              \bvivo\b
5         Xiaomi            \bxiaomi\b
6           Oppo              \boppo\b
7         Lenovo            \blenovo\b
8           Sony              \bsony\b
9         Realme            \brealme\b
10         Honor             \bhonor\b
11          Asus              \basus\b
12    BlackBerry        \bblackberry\b
13           TCL               \btcl\b
806        Apple    \b(apple|iphone)\b
9258     Samsung  \b(samsung|galaxy)\b
2780      Google    \b(google|pixel)\b
5052          LG                \blg\b
7510     OnePlus        \bone\s*plus\b
6422    Motorola   \b(motorola|moto)\b


In [34]:
model_df.model = model_df.model.str.replace("+", " plus")
model_df.model = model_df.model.str.lower()
model_df.model = model_df.model.apply(normalize_whitespace)

model_df.pattern = model_df.model.apply(
    lambda x: r"\b" + re.sub(r"\s+", r"\\s+", x) + r"\b"
)

model_df["length"] = model_df.model.apply(lambda x: len(x))

print(model_df.head().to_string())

     brand       model                               url             pattern  length
806  Apple          11          apple_iphone_11-9848.php              \b11\b       2
805  Apple      11 pro      apple_iphone_11_pro-9847.php        \b11\s+pro\b       6
804  Apple  11 pro max  apple_iphone_11_pro_max-9846.php  \b11\s+pro\s+max\b      10
793  Apple          12         apple_iphone_12-10509.php              \b12\b       2
792  Apple      12 pro     apple_iphone_12_pro-10508.php        \b12\s+pro\b       6


In [35]:
model_df.to_csv("../data/cell_models.csv", index=False)
brand_df.to_csv("../data/cell_brand.csv", index=False)