Generate 2 .txt files for Male and Female names from https://vardai.vlkk.lt/

A second column provides the name's popularity from https://vardai.vlkk.lt/statistika/{name}

In [1]:
import unicodedata


preserve_characters = set("ąčęėįšųūž")

def normalize_name(name):
    name = name.lower()

    clean_name = ''.join(
        char if char in preserve_characters else
        unicodedata.normalize('NFD', char)[0] if unicodedata.category(char) != 'Mn' else ''
        for char in name
    )
    
    return clean_name.capitalize()

def full_normalized_name(name):
    name = name.lower()

    clean_name = ''.join(
        char for char in unicodedata.normalize('NFD', name) if (unicodedata.category(char) != 'Mn')
    )
    return clean_name.capitalize()

In [7]:
print(normalize_name("Šarauskas"))
print(normalize_name("ŽEMÝTĖ"))
print(normalize_name("PÉteris"))

print(full_normalized_name("Šarauskas"))
print(full_normalized_name("ŽEMÝTĖ"))
print(full_normalized_name("Pẽtrass"))

Šarauskas
Žemytė
Peteris
Sarauskas
Zemyte
Petrass


In [11]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd
import time

In [12]:
names_male = []
names_female = []
for key in ['a', 'b', 'c', 'c-2', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
            'm', 'n', 'o', 'p', 'r', 's', 's-2', 't', 'u', 'v', 'z', 'z-2']:
    url = f'https://vardai.vlkk.lt/sarasas/{key}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    links_male = soup.find_all('a', class_='names_list__links names_list__links--man')
    links_female = soup.find_all('a', class_='names_list__links names_list__links--woman')
    for name in links_male:
        normalized_name= normalize_name(name.text)
        names_male.append((normalized_name,'N/A'))
    for name in links_female:
        normalized_name= normalize_name(name.text)
        names_female.append((normalized_name,'N/A'))
    
np.savetxt('data/vardai_vyrai.txt', names_male, fmt='%s', header='name popularity', comments='', newline='\n',encoding="UTF-16")
np.savetxt('data/vardai_moterys.txt', names_female, fmt='%s', header='name popularity', comments='', newline='\n',encoding="UTF-16")

In [None]:
names_men = pd.read_csv('data/vardai_vyrai.txt',encoding='UTF-16',delimiter=' ')
names_women = pd.read_csv('data/vardai_moterys.txt',encoding='UTF-16',delimiter=' ')
print(names_women)

          name  popularity
0          Abė         NaN
1     Abigailė         NaN
2        Abija         NaN
3       Abrilė         NaN
4       Achila         NaN
...        ...         ...
4230   Žydrunė         NaN
4231   Židrūnė         NaN
4232   Žiginta         NaN
4233   Žimantė         NaN
4234    Ževile         NaN

[4235 rows x 2 columns]


In [14]:
response = requests.get("https://vardai.vlkk.lt/statistika/Petras")
soup = BeautifulSoup(response.text,'html.parser')
pop_number = re.search(r"\d+",soup.find('span',class_='date').text)
print(pop_number)

<re.Match object; span=(9, 12), match='510'>


In [44]:
def get_popularity_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    html_area = soup.find('span', class_='date')     
    if html_area and html_area.text:
        pop_number = re.search(r"\d+", html_area.text)
        if pop_number:
            return pop_number.group(0)
        else:
            print(f"No popularity number found for {url}")
            return 0
    else:
        return None

In [None]:
def update_popularity(df,tag):
    df_size = df.shape[0]
    for index, row in df.iterrows():
        if pd.isna(row['popularity']):
            name = row['name']
            print(f"{index+1}/{df_size}: Fetching data for {name}...")
            # Stat page can be:
            # {name}, {name}_vyro, {name}_moters
            # {name}_neteiktinas, {name}_neteiktinas_vyro, {name}_neteiktinas_moters
            # {name}_vengtinas, {name}_vengtinas_vyro, {name}_vengtinas_moters
            # Majority of names are just {name}
            
            # Try case 1: {name}
            popularity_number = get_popularity_from_url(f"https://vardai.vlkk.lt/statistika/{full_normalized_name(name)}")
            if popularity_number:
                df.at[index,'popularity'] = popularity_number
            else:
                # Try case 2: {name} + tag
                special_popularity_number = get_popularity_from_url(f"https://vardai.vlkk.lt/statistika/{full_normalized_name(name)}"+tag)
                if special_popularity_number:
                    df.at[index,'popularity'] = special_popularity_number
                else:
                    # Case 3: {name}_neteiktinas or {name}_vengtinas
                    # Name is to be avoided, so we don't care about its stats
                    print(f"No stats found for {name}, defaulting to 0")
                    df.at[index,'popularity'] = 0
            # Rate limiting so website doesn't block us.
            time.sleep(0.1)
    return df

In [49]:
names_men = pd.read_csv('data/vardai_vyrai.txt',encoding='UTF-16',delimiter=' ')
names_women = pd.read_csv('data/vardai_moterys.txt',encoding='UTF-16',delimiter=' ')


In [50]:
df = update_popularity(names_men,'_vyro')
df.to_csv('data/pop_vardai_vyrai.txt', sep=' ', encoding='UTF-16', index=False)

1/7700: Fetching data for Abas...


  df.at[index,'popularity'] = popularity_number


2/7700: Fetching data for Abdijus...
3/7700: Fetching data for Abdonas...
4/7700: Fetching data for Abdula...
5/7700: Fetching data for Abelis...
6/7700: Fetching data for Abis...
7/7700: Fetching data for Abraomas...
8/7700: Fetching data for Abromas...
9/7700: Fetching data for Achilas...
10/7700: Fetching data for Achmedas...
11/7700: Fetching data for Adalbertas...
12/7700: Fetching data for Adamas...
13/7700: Fetching data for Adanas...
14/7700: Fetching data for Adas...
15/7700: Fetching data for Adauktas...
16/7700: Fetching data for Adeodatas...
17/7700: Fetching data for Adis...
18/7700: Fetching data for Adolfas...
19/7700: Fetching data for Adolfinas...
20/7700: Fetching data for Adolis...
21/7700: Fetching data for Adolius...
22/7700: Fetching data for Adomas...
23/7700: Fetching data for Adonis...
24/7700: Fetching data for Adrianas...
25/7700: Fetching data for Adrijonas...
26/7700: Fetching data for Adrijus...
27/7700: Fetching data for Adris...
28/7700: Fetching data fo

In [51]:
df = update_popularity(names_women,'_moters')
df.to_csv('data/pop_vardai_moterys.txt', sep=' ', encoding='UTF-16', index=False)

1/8470: Fetching data for Abė...


  df.at[index,'popularity'] = popularity_number


2/8470: Fetching data for Abigailė...
3/8470: Fetching data for Abija...
4/8470: Fetching data for Abrilė...
5/8470: Fetching data for Achila...
6/8470: Fetching data for Achmeda...
7/8470: Fetching data for Ada...
8/8470: Fetching data for Adalberta...
9/8470: Fetching data for Adalija...
10/8470: Fetching data for Adela...
11/8470: Fetching data for Adelaida...
12/8470: Fetching data for Adelė...
13/8470: Fetching data for Adelija...
14/8470: Fetching data for Adelina...
15/8470: Fetching data for Adė...
16/8470: Fetching data for Adilija...
17/8470: Fetching data for Adolė...
18/8470: Fetching data for Adolfa...
19/8470: Fetching data for Adolfina...
20/8470: Fetching data for Adona...
21/8470: Fetching data for Adonė...
22/8470: Fetching data for Adrė...
23/8470: Fetching data for Adriana...
24/8470: Fetching data for Adrija...
25/8470: Fetching data for Afanasija...
26/8470: Fetching data for Afija...
27/8470: Fetching data for Afrodita...
28/8470: Fetching data for Afroditė...
29