# Babies Names

**names** = original df

**names_per_year** = the df used for analysis

## - Dataframes created

**names_per_year** = list of 2500+ given names and its number per year since 1900 

**rare** = where names = _PRENOMS_RARES

**no_year** = where year=XXXX

**rare2** = where names where given less than 80 times per year

**still_given** = names at least given once the past 10 years

**still_given_pop** = names at least given once the past 10 years + a positive popularity (increasing)


## - Features to create

**Given_total** = the number of times the name was given

**Given_since_2013** = total count since 2013

**Given_yn** = if the name appears at least once since 2013

**Popularity** = % evolution since 2013

**Decade** = the decade the name was the most popular (1900-1910=1900s ; 1910-1920=1910s ; ... ; 2010-2022= since 2010)

**Lenght** = length of the names

**Neutral** = find same names given for gender=F and gender=M

**Hyphen** = names with hyphen (1=yes ; 0=no)

**Count** = how many years the name was given at least once



In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
#!pip install unidecode
from unidecode import unidecode

## Import and check the dataset

In [4]:
names = pd.read_csv('nat2022.csv', delimiter=";")

In [5]:
names

Unnamed: 0,sexe,preusuel,annais,nombre
0,1,_PRENOMS_RARES,1900,1249
1,1,_PRENOMS_RARES,1901,1342
2,1,_PRENOMS_RARES,1902,1330
3,1,_PRENOMS_RARES,1903,1286
4,1,_PRENOMS_RARES,1904,1430
...,...,...,...,...
703002,2,ZYNEB,2019,7
703003,2,ZYNEB,2020,8
703004,2,ZYNEB,2021,6
703005,2,ZYNEB,2022,4


In [6]:
names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 703007 entries, 0 to 703006
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   sexe      703007 non-null  int64 
 1   preusuel  703005 non-null  object
 2   annais    703007 non-null  object
 3   nombre    703007 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 21.5+ MB


In [7]:
names.sexe.value_counts()

sexe
2    381726
1    321281
Name: count, dtype: int64

In [8]:
names

Unnamed: 0,sexe,preusuel,annais,nombre
0,1,_PRENOMS_RARES,1900,1249
1,1,_PRENOMS_RARES,1901,1342
2,1,_PRENOMS_RARES,1902,1330
3,1,_PRENOMS_RARES,1903,1286
4,1,_PRENOMS_RARES,1904,1430
...,...,...,...,...
703002,2,ZYNEB,2019,7
703003,2,ZYNEB,2020,8
703004,2,ZYNEB,2021,6
703005,2,ZYNEB,2022,4


## Cleaning

In [9]:
names['sexe'].replace(1,'M',inplace=True)
names['sexe'].replace(2,'F',inplace=True)

In [10]:
names.rename(columns={"preusuel": "names", "annais": "year"}, inplace=True)
#names = names.drop(df[df['_PRENOMS_RARES']], axis=0)
#names = names.drop(df[df['annais']=='XXXX'], axis=0)

In [11]:
rare = names[names['names']=='_PRENOMS_RARES']
#rare

In [12]:
no_year = names[names['year']=='XXXX']
#no_year

In [13]:
names.drop(names.loc[names['names']=='_PRENOMS_RARES'].index, inplace=True)
names.drop(names.loc[names['year']=='XXXX'].index, inplace=True)

In [14]:
rare2 = names[names['nombre']<80]
#rare2 = name given less then 80 times

In [15]:
names.drop(names.loc[names['nombre']<80].index, inplace=True)

In [16]:
names.shape

(80633, 4)

In [17]:
names.names.value_counts(dropna=False)

names
CAMILLE      237
MARIE        186
CLAUDE       159
DOMINIQUE    152
CLÉMENT      123
            ... 
HAZEL          1
ANNELIESE      1
SILA           1
HINA           1
STEEVY         1
Name: count, Length: 2915, dtype: int64

In [18]:
# we have 2915 distinct names

## Building a new dataframe for the analysis

In [19]:
names_per_year = names.pivot_table(index=['names', 'sexe'], columns = ['year'], values = ['nombre']).reset_index()

In [20]:
names_per_year.head()

Unnamed: 0_level_0,names,sexe,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre
year,Unnamed: 1_level_1,Unnamed: 2_level_1,1900,1901,1902,1903,1904,1905,1906,1907,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AALIYAH,F,,,,,,,,,...,145.0,196.0,188.0,193.0,199.0,190.0,229.0,258.0,276.0,292.0
1,AARON,M,,,,,,,,,...,1907.0,2054.0,2156.0,2372.0,2383.0,2250.0,2447.0,2316.0,2497.0,2287.0
2,ABBY,F,,,,,,,,,...,114.0,163.0,115.0,144.0,155.0,188.0,195.0,212.0,268.0,237.0
3,ABDALLAH,M,,,,,,,,,...,260.0,273.0,312.0,269.0,254.0,243.0,193.0,191.0,147.0,155.0
4,ABDEL,M,,,,,,,,,...,,80.0,,,,,,,,


In [21]:
names_per_year=names_per_year.fillna(0)

In [22]:
#to check
names[names['names']=='ABBY']

Unnamed: 0,sexe,names,year,nombre
321600,F,ABBY,2007,83
321601,F,ABBY,2008,92
321602,F,ABBY,2009,105
321603,F,ABBY,2010,116
321604,F,ABBY,2011,113
321605,F,ABBY,2012,156
321606,F,ABBY,2013,114
321607,F,ABBY,2014,163
321608,F,ABBY,2015,115
321609,F,ABBY,2016,144


In [23]:
names_per_year.columns = ['_'.join(str(s).strip() for s in col if s) for col in names_per_year.columns]

In [24]:
names_per_year.rename(columns=lambda x: x.replace('nombre_', ''), inplace=True)

In [25]:
names_per_year

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,145.0,196.0,188.0,193.0,199.0,190.0,229.0,258.0,276.0,292.0
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1907.0,2054.0,2156.0,2372.0,2383.0,2250.0,2447.0,2316.0,2497.0,2287.0
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,114.0,163.0,115.0,144.0,155.0,188.0,195.0,212.0,268.0,237.0
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,260.0,273.0,312.0,269.0,254.0,243.0,193.0,191.0,147.0,155.0
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2949,ÉRIC,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2950,ÉTHAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,196.0,200.0,193.0,217.0,175.0,193.0,167.0,170.0,115.0,145.0
2951,ÉVA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,161.0,185.0,166.0,206.0,185.0,216.0,218.0,215.0,203.0,197.0
2952,ÉVAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,89.0,80.0,0.0,86.0,86.0,80.0,100.0,113.0,81.0,0.0


In [26]:
names_per_year.shape

(2954, 125)

In [27]:
names_per_year.sexe.value_counts()

sexe
F    1665
M    1289
Name: count, dtype: int64

In [28]:
# create a column called given = 1=Yes or 0=No
# No if the name has not been given the last 10 years

In [29]:
names_per_year.columns.values

array(['names', 'sexe', '1900', '1901', '1902', '1903', '1904', '1905',
       '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913',
       '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921',
       '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929',
       '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937',
       '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945',
       '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953',
       '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961',
       '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969',
       '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985',
       '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
     

In [30]:
names_per_year['given_since2013'] = names_per_year[['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021',
       '2022']].sum(axis=1)
names_per_year['given_since2018'] = names_per_year[['2018', '2019', '2020', '2021','2022']].sum(axis=1)

In [31]:
names_per_year['given_2013_yn'] = np.where(names_per_year['given_since2013'] == 0, 0, 1)
names_per_year['given_2018_yn'] = np.where(names_per_year['given_since2018'] == 0, 0, 1)

In [32]:
names_per_year.head(10)

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,...,2017,2018,2019,2020,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,199.0,190.0,229.0,258.0,276.0,292.0,2166.0,1245.0,1,1
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2383.0,2250.0,2447.0,2316.0,2497.0,2287.0,22669.0,11797.0,1,1
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,155.0,188.0,195.0,212.0,268.0,237.0,1791.0,1100.0,1,1
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,243.0,193.0,191.0,147.0,155.0,2297.0,929.0,1,1
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,80.0,0.0,1,0
5,ABDELKADER,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
6,ABDELKRIM,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
7,ABDELLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,162.0,0.0,1,0
8,ABDOUL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,85.0,92.0,95.0,97.0,369.0,369.0,1,1
9,ABDOULAYE,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,147.0,183.0,149.0,181.0,177.0,200.0,1566.0,890.0,1,1


In [33]:
#create a popularity column = a % of evolution the past 10 years

In [34]:
names_per_year['popularity_10y'] = (names_per_year['2022']-names_per_year[ '2013'])/names_per_year['2013']*100

In [35]:
names_per_year['popularity_5y'] = (names_per_year['2022']-names_per_year[ '2018'])/names_per_year['2018']*100

In [36]:
names_per_year

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,...,2019,2020,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,229.0,258.0,276.0,292.0,2166.0,1245.0,1,1,101.379310,53.684211
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2447.0,2316.0,2497.0,2287.0,22669.0,11797.0,1,1,19.926586,1.644444
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,195.0,212.0,268.0,237.0,1791.0,1100.0,1,1,107.894737,26.063830
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,193.0,191.0,147.0,155.0,2297.0,929.0,1,1,-40.384615,-36.213992
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,80.0,0.0,1,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2949,ÉRIC,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,,
2950,ÉTHAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,167.0,170.0,115.0,145.0,1771.0,790.0,1,1,-26.020408,-24.870466
2951,ÉVA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,218.0,215.0,203.0,197.0,1952.0,1049.0,1,1,22.360248,-8.796296
2952,ÉVAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,100.0,113.0,81.0,0.0,715.0,374.0,1,1,-100.000000,-100.000000


In [37]:
#names_per_year.loc[names_per_year['given_yn']==0]

In [38]:
names_per_year.popularity_10y = names_per_year.popularity_10y.fillna('x')
names_per_year.popularity_5y = names_per_year.popularity_5y.fillna('x')

In [39]:
pd.set_option('display.max_rows', 10)

In [40]:
#names_per_year.popularity.sort_values()

In [41]:
# The highest popularity before inf is 1172 so let's take 9999 to define infinite
# -9999 means 

In [42]:
def clean_pop10(row):
    # Check if 'popularity' column contains 'x'
    if row['popularity_10y'] == 'x':
        if row['given_2013_yn'] == 1:
            return  -100
        
        else: 
            return -9999
    elif row['popularity_10y']> 1000000:
        return 9999
    else: 
        return row['popularity_10y']

In [43]:
def clean_pop5(row):
    # Check if 'popularity' column contains 'x'
    if row['popularity_5y'] == 'x':
        if row['given_2018_yn'] == 1:
            return  -100
        
        else: 
            return -9999
    elif row['popularity_5y']> 1000000:
        return 9999
    else: 
        return row['popularity_5y']

In [44]:
names_per_year.popularity_10y = names_per_year.apply(clean_pop10, axis=1)
names_per_year.popularity_5y = names_per_year.apply(clean_pop5, axis=1)

In [45]:
pd.set_option('display.max_columns', 30)

In [46]:
names_per_year

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,196.0,188.0,193.0,199.0,190.0,229.0,258.0,276.0,292.0,2166.0,1245.0,1,1,101.379310,53.684211
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2054.0,2156.0,2372.0,2383.0,2250.0,2447.0,2316.0,2497.0,2287.0,22669.0,11797.0,1,1,19.926586,1.644444
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,163.0,115.0,144.0,155.0,188.0,195.0,212.0,268.0,237.0,1791.0,1100.0,1,1,107.894737,26.063830
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,273.0,312.0,269.0,254.0,243.0,193.0,191.0,147.0,155.0,2297.0,929.0,1,1,-40.384615,-36.213992
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0,0.0,1,0,-100.000000,-9999.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2949,ÉRIC,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000
2950,ÉTHAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,200.0,193.0,217.0,175.0,193.0,167.0,170.0,115.0,145.0,1771.0,790.0,1,1,-26.020408,-24.870466
2951,ÉVA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,185.0,166.0,206.0,185.0,216.0,218.0,215.0,203.0,197.0,1952.0,1049.0,1,1,22.360248,-8.796296
2952,ÉVAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80.0,0.0,86.0,86.0,80.0,100.0,113.0,81.0,0.0,715.0,374.0,1,1,-100.000000,-100.000000


In [None]:
# create a column lenght

In [47]:
names_per_year['lenght'] = names_per_year['names'].str.len()

In [48]:
names_per_year.head()

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2015,2016,2017,2018,2019,2020,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,188.0,193.0,199.0,190.0,229.0,258.0,276.0,292.0,2166.0,1245.0,1,1,101.37931,53.684211,7
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2156.0,2372.0,2383.0,2250.0,2447.0,2316.0,2497.0,2287.0,22669.0,11797.0,1,1,19.926586,1.644444,5
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,115.0,144.0,155.0,188.0,195.0,212.0,268.0,237.0,1791.0,1100.0,1,1,107.894737,26.06383,4
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,312.0,269.0,254.0,243.0,193.0,191.0,147.0,155.0,2297.0,929.0,1,1,-40.384615,-36.213992,8
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0,0.0,1,0,-100.0,-9999.0,5


In [49]:
# create a column hyphen

In [50]:
names_per_year['hyphen'] = np.where(names_per_year['names'].str.contains('-'), 1,0)

In [51]:
names_per_year.head()

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2016,2017,2018,2019,2020,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,193.0,199.0,190.0,229.0,258.0,276.0,292.0,2166.0,1245.0,1,1,101.37931,53.684211,7,0
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2372.0,2383.0,2250.0,2447.0,2316.0,2497.0,2287.0,22669.0,11797.0,1,1,19.926586,1.644444,5,0
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,144.0,155.0,188.0,195.0,212.0,268.0,237.0,1791.0,1100.0,1,1,107.894737,26.06383,4,0
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,269.0,254.0,243.0,193.0,191.0,147.0,155.0,2297.0,929.0,1,1,-40.384615,-36.213992,8,0
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0,0.0,1,0,-100.0,-9999.0,5,0


In [52]:
names_per_year.loc[names_per_year['hyphen']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2016,2017,2018,2019,2020,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen
221,ANNA-MARIA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.0,-9999.0,10,1
227,ANNE-CATHERINE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.0,-9999.0,14,1
228,ANNE-CHARLOTTE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.0,-9999.0,14,1
229,ANNE-CLAIRE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.0,-9999.0,11,1
230,ANNE-CÉCILE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.0,-9999.0,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2348,PIERRE-LOUIS,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.0,-9999.0,12,1
2349,PIERRE-MARIE,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.0,-9999.0,12,1
2350,PIERRE-OLIVIER,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.0,-9999.0,14,1
2351,PIERRE-YVES,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.0,-9999.0,11,1


In [53]:
# create a gender neutral column

In [54]:
both_gender = names.pivot_table(index=['names'], columns = ['sexe'], aggfunc="sum" )

In [55]:
both_gender.columns = ['_'.join(str(s).strip() for s in col if s) for col in both_gender.columns]

In [56]:
both_gender.rename(columns=lambda x: x.replace('nombre_', ''), inplace=True)

In [57]:
both_gender = both_gender.drop(['year_F','year_M'], axis=1)

In [58]:
both_gender.fillna(0)

Unnamed: 0_level_0,F,M
names,Unnamed: 1_level_1,Unnamed: 2_level_1
AALIYAH,2714.0,0.0
AARON,0.0,33494.0
ABBY,2456.0,0.0
ABDALLAH,0.0,3915.0
ABDEL,0.0,1430.0
...,...,...
ÉRIC,0.0,1738.0
ÉTHAN,0.0,2111.0
ÉVA,2246.0,0.0
ÉVAN,0.0,795.0


In [59]:
both_gender['F'] = np.where(both_gender['F']>0, 1,0)
both_gender['M'] = np.where(both_gender['M']>0, 1,0)
both_gender['neutral'] = both_gender['F'] + both_gender['M']

In [60]:
both_gender.value_counts()

F  M  neutral
1  0  1          1626
0  1  1          1250
1  1  2            39
Name: count, dtype: int64

In [61]:
both_gender = both_gender.drop(['F','M'], axis=1)

In [62]:
both_gender.neutral = both_gender['neutral'].replace(1, 0)
both_gender.neutral = both_gender['neutral'].replace(2, 1)

In [63]:
both_gender.value_counts()

neutral
0          2876
1            39
Name: count, dtype: int64

In [64]:
both_gender.head()

Unnamed: 0_level_0,neutral
names,Unnamed: 1_level_1
AALIYAH,0
AARON,0
ABBY,0
ABDALLAH,0
ABDEL,0


In [65]:
names_per_year = pd.merge(names_per_year, both_gender, how='left', on=["names"])

In [66]:
names_per_year.shape

(2954, 134)

In [67]:
names_per_year[names_per_year['neutral']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2017,2018,2019,2020,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral
114,ALIX,F,85.0,97.0,104.0,112.0,100.0,96.0,96.0,94.0,82.0,89.0,0.0,0.0,0.0,...,857.0,888.0,1164.0,1401.0,1349.0,1182.0,10116.0,5984.0,1,1,68.376068,33.108108,4,0,1
115,ALIX,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,191.0,190.0,183.0,179.0,211.0,178.0,1813.0,941.0,1,1,11.949686,-6.315789,4,0,1
176,ANAEL,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,5,0,1
177,ANAEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,5,0,1
191,ANDREA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,140.0,134.0,118.0,97.0,105.0,123.0,1264.0,577.0,1,1,10.810811,-8.208955,6,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,YAEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,4,0,1
2839,YANNICK,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,7,0,1
2840,YANNICK,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,7,0,1
2851,YAËL,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,96.0,96.0,110.0,115.0,87.0,587.0,504.0,1,1,4.819277,-9.375000,4,0,1


In [68]:
# given_total : how many times in total since 1900

In [69]:
pd.set_option('display.max_rows', 10)

In [70]:
names_per_year.columns.values

array(['names', 'sexe', '1900', '1901', '1902', '1903', '1904', '1905',
       '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913',
       '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921',
       '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929',
       '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937',
       '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945',
       '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953',
       '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961',
       '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969',
       '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985',
       '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
     

In [71]:
names_per_year['given_total'] = names_per_year[['1900', '1901', '1902', '1903', '1904', '1905',
       '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913',
       '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921',
       '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929',
       '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937',
       '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945',
       '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953',
       '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961',
       '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969',
       '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985',
       '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022']].sum(axis=1)

In [72]:
names_per_year

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2018,2019,2020,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,190.0,229.0,258.0,276.0,292.0,2166.0,1245.0,1,1,101.379310,53.684211,7,0,0,2714.0
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2250.0,2447.0,2316.0,2497.0,2287.0,22669.0,11797.0,1,1,19.926586,1.644444,5,0,0,33494.0
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,188.0,195.0,212.0,268.0,237.0,1791.0,1100.0,1,1,107.894737,26.063830,4,0,0,2456.0
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,243.0,193.0,191.0,147.0,155.0,2297.0,929.0,1,1,-40.384615,-36.213992,8,0,0,3915.0
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,80.0,0.0,1,0,-100.000000,-9999.000000,5,0,0,1430.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2949,ÉRIC,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,4,0,0,1738.0
2950,ÉTHAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,193.0,167.0,170.0,115.0,145.0,1771.0,790.0,1,1,-26.020408,-24.870466,5,0,0,2111.0
2951,ÉVA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,216.0,218.0,215.0,203.0,197.0,1952.0,1049.0,1,1,22.360248,-8.796296,3,0,0,2246.0
2952,ÉVAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80.0,100.0,113.0,81.0,0.0,715.0,374.0,1,1,-100.000000,-100.000000,4,0,0,795.0


In [73]:
#column count: total years the name was given at least once

In [74]:
years = names_per_year[['1900', '1901', '1902', '1903', '1904', '1905',
       '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913',
       '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921',
       '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929',
       '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937',
       '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945',
       '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953',
       '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961',
       '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969',
       '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985',
       '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022']]

In [75]:
#years

In [76]:
count=[]
for col in years:
    count = np.count_nonzero(years, axis=1)
   

In [77]:
count = pd.DataFrame(count)
count

Unnamed: 0,0
0,14
1,25
2,16
3,21
4,16
...,...
2949,15
2950,12
2951,12
2952,9


In [78]:
names_per_year = pd.concat([names_per_year, count], axis=1)

In [79]:
names_per_year.rename(columns={0: "count"}, inplace=True)

In [80]:
names_per_year.head()

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2019,2020,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,229.0,258.0,276.0,292.0,2166.0,1245.0,1,1,101.37931,53.684211,7,0,0,2714.0,14
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2447.0,2316.0,2497.0,2287.0,22669.0,11797.0,1,1,19.926586,1.644444,5,0,0,33494.0,25
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,195.0,212.0,268.0,237.0,1791.0,1100.0,1,1,107.894737,26.06383,4,0,0,2456.0,16
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,193.0,191.0,147.0,155.0,2297.0,929.0,1,1,-40.384615,-36.213992,8,0,0,3915.0,21
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,80.0,0.0,1,0,-100.0,-9999.0,5,0,0,1430.0,16


In [81]:
#column top_names = create categories from the column

In [82]:
labels = ['Very Rare','Rare','Uncommon', 'Common', 'Popular', 'Very Popular']
names_per_year['category'] = pd.qcut(names_per_year['given_total'],6,labels=labels)

In [83]:
names_per_year['category'].value_counts()

category
Very Rare       493
Very Popular    493
Rare            492
Uncommon        492
Common          492
Popular         492
Name: count, dtype: int64

In [84]:
names_per_year.head()

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2020,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,258.0,276.0,292.0,2166.0,1245.0,1,1,101.37931,53.684211,7,0,0,2714.0,14,Uncommon
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2316.0,2497.0,2287.0,22669.0,11797.0,1,1,19.926586,1.644444,5,0,0,33494.0,25,Very Popular
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,212.0,268.0,237.0,1791.0,1100.0,1,1,107.894737,26.06383,4,0,0,2456.0,16,Uncommon
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,191.0,147.0,155.0,2297.0,929.0,1,1,-40.384615,-36.213992,8,0,0,3915.0,21,Common
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,80.0,0.0,1,0,-100.0,-9999.0,5,0,0,1430.0,16,Uncommon


In [85]:
#column decade = the decade where the name was the most popular

In [86]:
def get_decade(year):
    return str(year)[:3] + '0s'

# Extracting decade from column names
decades = {col: get_decade(col) for col in years.columns}

# Grouping columns by decade and summing their values
df_decades = years.groupby(decades, axis=1).sum()

# If you want to preserve the column names as the decade ranges
# df_decades.columns = [f'{decade}-{int(decade[:3])+9}' for decade in df_decades.columns]

# If you want to preserve the column names as simply the decade
df_decades.columns = df_decades.columns.str[:4] + 's'

# Displaying the resulting DataFrame
display(df_decades)

Unnamed: 0,1900s,1910s,1920s,1930s,1940s,1950s,1960s,1970s,1980s,1990s,2000s,2010s,2020s
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,92.0,1796.0,826.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,177.0,5547.0,20670.0,7100.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,280.0,1459.0,717.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1042.0,2380.0,493.0
4,0.0,0.0,0.0,0.0,0.0,0.0,252.0,363.0,648.0,0.0,0.0,167.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2949,0.0,0.0,0.0,0.0,0.0,0.0,1220.0,518.0,0.0,0.0,0.0,0.0,0.0
2950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1681.0,430.0
2951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1631.0,615.0
2952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,601.0,194.0


In [87]:
df_decades['decade_popularity'] = df_decades.idxmax(axis=1)

In [88]:
df_decades

Unnamed: 0,1900s,1910s,1920s,1930s,1940s,1950s,1960s,1970s,1980s,1990s,2000s,2010s,2020s,decade_popularity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,92.0,1796.0,826.0,2010s
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,177.0,5547.0,20670.0,7100.0,2010s
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,280.0,1459.0,717.0,2010s
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1042.0,2380.0,493.0,2010s
4,0.0,0.0,0.0,0.0,0.0,0.0,252.0,363.0,648.0,0.0,0.0,167.0,0.0,1980s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2949,0.0,0.0,0.0,0.0,0.0,0.0,1220.0,518.0,0.0,0.0,0.0,0.0,0.0,1960s
2950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1681.0,430.0,2010s
2951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1631.0,615.0,2010s
2952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,601.0,194.0,2010s


In [89]:
df_decades2 = df_decades['decade_popularity']

In [90]:
names_per_year = pd.concat([names_per_year, df_decades2], axis=1)

In [91]:
names_per_year.head()

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2021,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,276.0,292.0,2166.0,1245.0,1,1,101.37931,53.684211,7,0,0,2714.0,14,Uncommon,2010s
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2497.0,2287.0,22669.0,11797.0,1,1,19.926586,1.644444,5,0,0,33494.0,25,Very Popular,2010s
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,268.0,237.0,1791.0,1100.0,1,1,107.894737,26.06383,4,0,0,2456.0,16,Uncommon,2010s
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,147.0,155.0,2297.0,929.0,1,1,-40.384615,-36.213992,8,0,0,3915.0,21,Common,2010s
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,80.0,0.0,1,0,-100.0,-9999.0,5,0,0,1430.0,16,Uncommon,1980s


In [92]:
# column top10 : the number of years the name was in the top 10

In [93]:
find_top10 = names.pivot_table(index=['names'], columns = ['year'], values = ['nombre']).reset_index()

In [94]:
find_top10 = find_top10.fillna(0)

In [95]:
find_top10.head(60)

Unnamed: 0_level_0,names,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre,nombre
year,Unnamed: 1_level_1,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,AALIYAH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,92.0,136.0,145.0,175.0,145.0,196.0,188.0,193.0,199.0,190.0,229.0,258.0,276.0,292.0
1,AARON,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1381.0,1355.0,1606.0,1766.0,1729.0,1907.0,2054.0,2156.0,2372.0,2383.0,2250.0,2447.0,2316.0,2497.0,2287.0
2,ABBY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,92.0,105.0,116.0,113.0,156.0,114.0,163.0,115.0,144.0,155.0,188.0,195.0,212.0,268.0,237.0
3,ABDALLAH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,168.0,165.0,184.0,187.0,205.0,260.0,273.0,312.0,269.0,254.0,243.0,193.0,191.0,147.0,155.0
4,ABDEL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,87.0,0.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55,AIDA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,AIDAN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,143.0,124.0,132.0,106.0,0.0,0.0,102.0,82.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0
57,AIDEN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,143.0,155.0,142.0,134.0,109.0,88.0,88.0,96.0,84.0
58,AIME,563.0,629.0,648.0,654.0,691.0,733.0,787.0,829.0,901.0,873.0,924.0,850.0,982.0,859.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
find_top10.columns = ['_'.join(str(s).strip() for s in col if s) for col in find_top10.columns]

In [103]:
find_top10

Unnamed: 0,names,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,top10_count
0,AALIYAH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,92.0,136.0,145.0,175.0,145.0,196.0,188.0,193.0,199.0,190.0,229.0,258.0,276.0,292.0,0
1,AARON,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1355.0,1606.0,1766.0,1729.0,1907.0,2054.0,2156.0,2372.0,2383.0,2250.0,2447.0,2316.0,2497.0,2287.0,0
2,ABBY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,105.0,116.0,113.0,156.0,114.0,163.0,115.0,144.0,155.0,188.0,195.0,212.0,268.0,237.0,0
3,ABDALLAH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,165.0,184.0,187.0,205.0,260.0,273.0,312.0,269.0,254.0,243.0,193.0,191.0,147.0,155.0,0
4,ABDEL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,87.0,0.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2910,ÉRIC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2911,ÉTHAN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,137.0,203.0,196.0,200.0,193.0,217.0,175.0,193.0,167.0,170.0,115.0,145.0,0
2912,ÉVA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,149.0,145.0,161.0,185.0,166.0,206.0,185.0,216.0,218.0,215.0,203.0,197.0,0
2913,ÉVAN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,80.0,89.0,80.0,0.0,86.0,86.0,80.0,100.0,113.0,81.0,0.0,0


In [98]:
find_top10.rename(columns=lambda x: x.replace('nombre_', ''), inplace=True)

In [104]:
# Calculate top 10 counts for each name
top10_counts = find_top10.iloc[:, 1:].apply(lambda row: sorted(row, reverse=True)[:10], axis=1)

# Count how many of those top counts are within 1990-2000
top10_count = top10_counts.apply(lambda x: sum(year >= 1900 and year <= 2022 for year in x))

# Add the 'Top3_count' column to the DataFrame
find_top10['top10_count'] = top10_count

In [105]:
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 30)

In [106]:
find_top10.loc[find_top10['top10_count']!= 0]

Unnamed: 0,names,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,top10_count
1,AARON,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1355.0,1606.0,1766.0,1729.0,1907.0,2054.0,2156.0,2372.0,2383.0,2250.0,2447.0,2316.0,2497.0,2287.0,1
31,ADELINE,224.0,218.0,204.0,227.0,227.0,215.0,179.0,182.0,198.0,168.0,174.0,158.0,156.0,166.0,...,157.0,115.0,106.0,93.0,81.0,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
48,AGATHE,111.0,100.0,123.0,110.0,114.0,104.0,94.0,100.0,102.0,113.0,91.0,83.0,114.0,0.0,...,1206.0,1193.0,1310.0,1371.0,1380.0,1448.0,1573.0,1622.0,1787.0,1959.0,1912.0,1925.0,2020.0,1958.0,5
100,ALFRED,1553.0,1716.0,1731.0,1722.0,1717.0,1775.0,1852.0,1899.0,1915.0,1904.0,1994.0,1760.0,1893.0,1834.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
111,ALISON,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,103.0,104.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2662,THIBAULT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,907.0,940.0,857.0,735.0,783.0,845.0,786.0,695.0,612.0,581.0,482.0,337.0,372.0,275.0,2
2679,TIAGO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,754.0,851.0,866.0,1090.0,1162.0,1403.0,1430.0,1294.0,1331.0,1580.0,2704.0,1972.0,1658.0,1383.0,1
2762,VIVIANE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2802,YANNICK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


## Web Scraping

In [107]:
# nature names

In [108]:
from bs4 import BeautifulSoup
import requests

In [109]:
url = "https://www.pampers.fr/grossesse/prenoms-de-bebe/article/prenoms-de-fleurs"

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")

# Find the div containing the flower names
flower_div = soup.select('#to-main-content > div > div > div > div > div > div > div > div > p  ')

In [110]:
#flower_div

In [111]:
names = []
data = []

for i, element in enumerate(flower_div):
        names = element.select('b')
      
        print(f"{i + 1}. {names}")
        
        data.append([names])

flower_names = pd.DataFrame(data, columns=['names'])

1. []
2. [<b>Acacia.</b>]
3. [<b>Alysse.</b>]
4. [<b>Amande.</b>]
5. [<b>Amaryllis.</b>]
6. [<b>Anémone.</b>]
7. [<b>Astrée</b>]
8. [<b>Azalée</b>]
9. [<b>Cassia</b>]
10. [<b>Clémentine</b>]
11. [<b>Cynthia</b>]
12. [<b>Dahlia</b>]
13. [<b>Églantine</b>]
14. [<b>Flora</b>]
15. [<b>Flore</b>]
16. [<b>Fleur</b>]
17. [<b>Fleurette</b>]
18. [<b>Garance</b>]
19. [<b>Gentiane</b>]
20. [<b>Hortense</b>]
21. [<b>Hyacinthe</b>]
22. [<b>Jasmine</b>]
23. [<b>Lila</b>]
24. [<b>Lily</b>]
25. [<b>Liliane</b>]
26. [<b>Lys</b>]
27. [<b>Marguerite</b>]
28. [<b>Marjolaine</b>]
29. [<b>Muguette</b>]
30. [<b>Narcisse</b>]
31. [<b>Olive</b>]
32. [<b>Pâquerette</b>]
33. [<b>Pimprenelle</b>]
34. [<b>Pomme</b>]
35. [<b>Praline</b>]
36. [<b>Samantha</b>]
37. [<b>Saule</b>]
38. [<b>Suzanne</b>]
39. [<b>Véronique</b>]
40. [<b>Yolande</b>]
41. [<b>Rose</b>]
42. [<b>Rosalie</b>]
43. [<b>Rosanne</b>]
44. [<b>Roselyne</b>]
45. [<b>Rosemarie</b>]
46. [<b>Rosemonde</b>]
47. [<b>Rosette</b>]
48. [<b>Rosie</b>]
49. [<b>

In [112]:
flower_names = []

for paragraph in flower_div:
    
    bold_tags = paragraph.find_all('b')
    for bold_tag in bold_tags:
        flower_name = bold_tag.text.strip()
        # Remove the dot if it exists at the end of the flower name
        flower_name = flower_name.rstrip('.')
        flower_names.append(flower_name)
    
# Create a DataFrame with the flower names
flower_names = pd.DataFrame(flower_names, columns=['names'])

In [113]:
flower_names

Unnamed: 0,names
0,Acacia
1,Alysse
2,Amande
3,Amaryllis
4,Anémone
...,...
79,Tulip
80,Willow
81,Zahara
82,Zaynab


In [114]:
flower_names= flower_names.drop([83], axis=0)

In [115]:
flower_names['names'] = flower_names['names'].str.upper()

In [116]:
flower_names.head()

Unnamed: 0,names
0,ACACIA
1,ALYSSE
2,AMANDE
3,AMARYLLIS
4,ANÉMONE


In [117]:
def dealing_accent(df):
    df2 = df.copy()
    stringList = df.names
    
    for i in range(len(stringList)):
        stringList[i] = unidecode(stringList[i])

    df3 = pd.concat([df2, df], axis=0)
    df3 = df3.drop_duplicates(subset=['names'])
    
    return df3

In [118]:
flower_names_result = dealing_accent(flower_names)
flower_names_result.shape

(94, 1)

In [119]:
flower_names_result

Unnamed: 0,names
0,ACACIA
1,ALYSSE
2,AMANDE
3,AMARYLLIS
4,ANÉMONE
...,...
37,VERONIQUE
52,PENSEE
56,BEGONIA
57,CAMELIA


In [120]:
def adding_to_df(df1, df2, col):
    df1[col] = 1
    df2 = pd.merge(df2, df1, how='left', on=["names"])
    df2[col] = df2[col].fillna(0)
    
    return df2

In [121]:
names_per_year = adding_to_df(flower_names_result, names_per_year, 'flower_names')


In [122]:
names_per_year

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,292.0,2166.0,1245.0,1,1,101.379310,53.684211,7,0,0,2714.0,14,Uncommon,2010s,0.0
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2287.0,22669.0,11797.0,1,1,19.926586,1.644444,5,0,0,33494.0,25,Very Popular,2010s,0.0
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,237.0,1791.0,1100.0,1,1,107.894737,26.063830,4,0,0,2456.0,16,Uncommon,2010s,0.0
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,155.0,2297.0,929.0,1,1,-40.384615,-36.213992,8,0,0,3915.0,21,Common,2010s,0.0
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,80.0,0.0,1,0,-100.000000,-9999.000000,5,0,0,1430.0,16,Uncommon,1980s,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2949,ÉRIC,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,4,0,0,1738.0,15,Uncommon,1960s,0.0
2950,ÉTHAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,145.0,1771.0,790.0,1,1,-26.020408,-24.870466,5,0,0,2111.0,12,Uncommon,2010s,0.0
2951,ÉVA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,197.0,1952.0,1049.0,1,1,22.360248,-8.796296,3,0,0,2246.0,12,Uncommon,2010s,0.0
2952,ÉVAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,715.0,374.0,1,1,-100.000000,-100.000000,4,0,0,795.0,9,Rare,2010s,0.0


In [123]:
names_per_year.head()

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,292.0,2166.0,1245.0,1,1,101.37931,53.684211,7,0,0,2714.0,14,Uncommon,2010s,0.0
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2287.0,22669.0,11797.0,1,1,19.926586,1.644444,5,0,0,33494.0,25,Very Popular,2010s,0.0
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,237.0,1791.0,1100.0,1,1,107.894737,26.06383,4,0,0,2456.0,16,Uncommon,2010s,0.0
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,155.0,2297.0,929.0,1,1,-40.384615,-36.213992,8,0,0,3915.0,21,Common,2010s,0.0
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,80.0,0.0,1,0,-100.0,-9999.0,5,0,0,1430.0,16,Uncommon,1980s,0.0


In [124]:
names_per_year.loc[names_per_year['flower_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,2022,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names
419,CAMELIA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,382.0,0.0,1,0,-100.000000,-9999.000000,7,0,0,861.0,6,Rare,2010s,1.0
426,CAMÉLIA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,393.0,3510.0,1924.0,1,1,30.132450,14.912281,7,0,0,8311.0,30,Popular,2010s,1.0
467,CERISE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,6,0,0,343.0,3,Very Rare,2000s,1.0
531,CLEMENTINE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,10,0,0,519.0,3,Rare,2010s,1.0
543,CLÉMENTINE,F,529.0,571.0,524.0,530.0,507.0,511.0,492.0,447.0,452.0,445.0,434.0,380.0,401.0,...,169.0,3309.0,1210.0,1,1,-63.811563,-45.483871,10,0,0,36046.0,80,Very Popular,1990s,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2793,VIOLETTE,F,0.0,0.0,0.0,0.0,83.0,80.0,94.0,108.0,156.0,135.0,107.0,158.0,165.0,...,223.0,2452.0,1095.0,1,1,-39.729730,11.500000,8,0,0,22151.0,102,Popular,2010s,1.0
2801,VÉRONIQUE,F,0.0,92.0,0.0,86.0,80.0,81.0,83.0,85.0,0.0,89.0,81.0,0.0,80.0,...,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,9,0,0,233459.0,69,Very Popular,1960s,1.0
2818,WILLOW,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,102.0,195.0,195.0,1,1,9999.000000,9999.000000,6,0,0,195.0,2,Very Rare,2020s,1.0
2865,YOLANDE,F,0.0,90.0,87.0,87.0,133.0,137.0,112.0,165.0,189.0,198.0,278.0,277.0,296.0,...,0.0,0.0,0.0,0,0,-9999.000000,-9999.000000,7,0,0,53049.0,78,Very Popular,1950s,1.0


In [125]:
# origin names
#1 arabic names

In [126]:
arab_names = []
for i in range(1,5):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:56/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
            # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        arab_names.append(name)

# Create a DataFrame with the names
arabic_names = pd.DataFrame(arab_names, columns=['names'])
print(arabic_names.shape)
arabic_names.head()

Status code: 200
Status code: 200
Status code: 200
Status code: 200
(544, 1)


Unnamed: 0,names
0,INÈS
1,KAYLA
2,NORA
3,AHMED
4,MANEL


In [127]:
arabic_names_result = dealing_accent(arabic_names)
arabic_names_result.shape

(545, 1)

In [128]:
names_per_year = adding_to_df(arabic_names_result, names_per_year, 'arabic_names')

In [129]:
pd.set_option('display.max_rows', 100)

In [130]:
# just add the name I am sure they are arabic but were not in the scraping list
names_per_year.loc[names_per_year['names'] == 'AALIYAH', ['arabic_names']] = 1
names_per_year.loc[names_per_year['names'].str.startswith('ABD'), ['arabic_names']] = 1

In [131]:
names_per_year.loc[names_per_year['arabic_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names,arabic_names
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2166.0,1245.0,1,1,101.379310,53.684211,7,0,0,2714.0,14,Uncommon,2010s,0.0,1.0
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2297.0,929.0,1,1,-40.384615,-36.213992,8,0,0,3915.0,21,Common,2010s,0.0,1.0
4,ABDEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,80.0,0.0,1,0,-100.000000,-9999.000000,5,0,0,1430.0,16,Uncommon,1980s,0.0,1.0
5,ABDELKADER,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0,-9999.000000,-9999.000000,10,0,0,8824.0,49,Popular,1970s,0.0,1.0
6,ABDELKRIM,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0,-9999.000000,-9999.000000,9,0,0,853.0,10,Rare,1960s,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2896,ZAKARIA,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5293.0,2521.0,1,1,3.921569,2.119461,7,0,0,12811.0,39,Popular,2010s,0.0,1.0
2902,ZAYNAB,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1078.0,643.0,1,1,9999.000000,28.571429,6,0,0,1078.0,9,Rare,2010s,1.0,1.0
2912,ZINEB,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,629.0,84.0,1,1,-100.000000,-100.000000,5,0,0,1459.0,14,Uncommon,2010s,0.0,1.0
2913,ZINEDINE,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0,-9999.000000,-9999.000000,8,0,0,1365.0,12,Uncommon,2000s,0.0,1.0


In [132]:
# origin names
#2 greek names

In [133]:
greek_names = []
for i in range(1,7):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:30/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
            # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        greek_names.append(name)

# Create a DataFrame with the names
greek_names = pd.DataFrame(greek_names, columns=['names'])
print(greek_names.shape)
greek_names.head()

Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
(817, 1)


Unnamed: 0,names
0,EUGENE
1,MÉLANIE
2,DOROTHY
3,ANGELINA
4,DORIS


In [134]:
greek_names_result = dealing_accent(greek_names)
greek_names_result.shape

(829, 1)

In [135]:
names_per_year = adding_to_df(greek_names_result, names_per_year, 'greek_names')

In [136]:
names_per_year.loc[names_per_year['greek_names'] == 1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names,arabic_names,greek_names
48,AGATHE,F,111.0,100.0,123.0,110.0,114.0,104.0,94.0,100.0,102.0,113.0,91.0,83.0,114.0,...,9774.0,1,1,41.884058,-0.051046,6,0,0,54441.0,74,Very Popular,2010s,0.0,0.0,1.0
50,AGNÈS,F,422.0,433.0,464.0,448.0,494.0,497.0,472.0,502.0,547.0,552.0,538.0,572.0,568.0,...,458.0,1,1,9999.000000,-2.298851,5,0,0,112470.0,116,Very Popular,1960s,0.0,0.0,1.0
62,AISSA,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,-9999.000000,-9999.000000,5,0,0,284.0,3,Very Rare,2000s,0.0,0.0,1.0
84,ALESSANDRO,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,593.0,1,1,-28.143713,0.000000,10,0,0,2913.0,21,Common,2010s,0.0,0.0,1.0
87,ALEX,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1341.0,1,1,-55.605381,-40.718563,4,0,0,30342.0,83,Very Popular,2000s,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2767,ULYSSE,M,82.0,94.0,81.0,93.0,97.0,85.0,91.0,84.0,88.0,91.0,92.0,84.0,86.0,...,2301.0,1,1,50.825083,18.087855,6,0,0,9352.0,46,Popular,2010s,0.0,0.0,1.0
2801,VÉRONIQUE,F,0.0,92.0,0.0,86.0,80.0,81.0,83.0,85.0,0.0,89.0,81.0,0.0,80.0,...,0.0,0,0,-9999.000000,-9999.000000,9,0,0,233459.0,69,Very Popular,1960s,1.0,0.0,1.0
2874,YOURI,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,83.0,1,1,-100.000000,-100.000000,5,0,0,886.0,9,Rare,1990s,0.0,0.0,1.0
2915,ZOE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1,0,-100.000000,-9999.000000,3,0,0,4603.0,8,Common,2010s,0.0,0.0,1.0


In [137]:
# origin names
#3 american names

In [138]:
american_names = []
for i in range(1,12):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:52/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
            # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        american_names.append(name)

# Create a DataFrame with the names
american_names = pd.DataFrame(american_names, columns=['names'])
print(american_names.shape)
american_names.head()

Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
(1474, 1)


Unnamed: 0,names
0,ALEXIS
1,CLARENCE
2,ERIK
3,JAMIE
4,NORMAN


In [139]:
american_names_result = dealing_accent(american_names)
american_names_result.shape

(1351, 1)

In [140]:
american_names_result.names.sort_values()

1360     AALIYAH
107        AARON
1389       ABBIE
1416        ABBY
754     ABDULLAH
          ...   
1283        ZION
475          ZOE
746         ZOEY
475          ZOÉ
1336        ÉDEN
Name: names, Length: 1351, dtype: object

In [141]:
names_per_year = adding_to_df(american_names_result, names_per_year, 'american_names')

In [142]:
names_per_year.loc[names_per_year['american_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names,arabic_names,greek_names,american_names
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,101.379310,53.684211,7,0,0,2714.0,14,Uncommon,2010s,0.0,1.0,0.0,1.0
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,19.926586,1.644444,5,0,0,33494.0,25,Very Popular,2010s,0.0,0.0,0.0,1.0
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,107.894737,26.063830,4,0,0,2456.0,16,Uncommon,2010s,0.0,0.0,0.0,1.0
10,ABDULLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,-100.000000,-100.000000,8,0,0,162.0,2,Very Rare,2010s,0.0,1.0,0.0,1.0
11,ABEL,M,428.0,506.0,519.0,499.0,476.0,522.0,510.0,538.0,522.0,572.0,524.0,490.0,503.0,...,1,1,54.503464,20.976492,4,0,0,25221.0,76,Popular,1900s,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2893,ZACHARY,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,-100.000000,-100.000000,7,0,0,1685.0,17,Uncommon,2010s,0.0,0.0,0.0,1.0
2894,ZACK,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,77.310924,45.517241,4,0,0,1760.0,12,Uncommon,2010s,0.0,0.0,0.0,1.0
2915,ZOE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,-100.000000,-9999.000000,3,0,0,4603.0,8,Common,2010s,0.0,0.0,1.0,1.0
2918,ZOÉ,F,131.0,146.0,138.0,158.0,124.0,130.0,113.0,105.0,109.0,123.0,104.0,95.0,81.0,...,1,1,-50.423177,-24.416873,3,0,0,63345.0,53,Very Popular,2000s,0.0,0.0,1.0,1.0


In [143]:
# origin names
#4 latin names

In [144]:
latin_names = []
for i in range(1,6):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:61/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
            # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        latin_names.append(name)

# Create a DataFrame with the names
latin_names = pd.DataFrame(latin_names, columns=['names'])
print(latin_names.shape)
latin_names.head()

Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
(606, 1)


Unnamed: 0,names
0,JEAN-CLAUDE
1,CHRISTIAN
2,DIDIER
3,OLIVIER
4,LEONA


In [145]:
latin_names_result = dealing_accent(latin_names)
latin_names_result.shape

(613, 1)

In [146]:
names_per_year = adding_to_df(latin_names_result, names_per_year, 'latin_names')

In [147]:
names_per_year.loc[names_per_year['latin_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names
62,AISSA,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,-9999.000000,-9999.000000,5,0,0,284.0,3,Very Rare,2000s,0.0,0.0,1.0,0.0,1.0
71,ALAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,-50.988142,-31.491713,4,0,0,20326.0,50,Popular,1990s,0.0,0.0,0.0,1.0,1.0
72,ALANA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,25.842697,-13.846154,5,0,0,2045.0,17,Uncommon,2010s,0.0,0.0,0.0,0.0,1.0
108,ALIETTE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,-9999.000000,-9999.000000,7,0,0,435.0,5,Rare,1940s,0.0,0.0,0.0,0.0,1.0
122,ALLAIN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,-9999.000000,-9999.000000,6,0,0,1580.0,14,Uncommon,1940s,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2788,VIKTOR,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,-100.000000,-100.000000,6,0,0,608.0,7,Rare,2010s,0.0,0.0,0.0,0.0,1.0
2789,VINCENT,M,271.0,261.0,280.0,266.0,307.0,342.0,345.0,287.0,291.0,325.0,316.0,340.0,333.0,...,1,-100.000000,-100.000000,7,0,0,232335.0,121,Very Popular,1980s,0.0,0.0,0.0,1.0,1.0
2794,VIRGIL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,-9999.000000,-9999.000000,6,0,0,86.0,1,Very Rare,2000s,0.0,0.0,0.0,0.0,1.0
2795,VIRGILE,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,-100.000000,-9999.000000,7,0,0,6101.0,41,Common,2000s,0.0,0.0,0.0,0.0,1.0


In [148]:
# origin names
#5 french names

In [149]:
french_names = []
for i in range(1,5):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:38/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        french_names.append(name)

# Create a DataFrame with the names
french_names = pd.DataFrame(french_names, columns=['names'])
print(french_names.shape)
french_names.head()

Status code: 200
Status code: 200
Status code: 200
Status code: 200
(470, 1)


Unnamed: 0,names
0,IDA
1,SIMONE
2,CHRISTIAN
3,LEO
4,YVONNE


In [150]:
french_names_result = dealing_accent(french_names)
french_names_result.shape

(491, 1)

In [151]:
names_per_year = adding_to_df(french_names_result, names_per_year, 'french_names')

In [152]:
names_per_year.loc[names_per_year['french_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names,french_names
11,ABEL,M,428.0,506.0,519.0,499.0,476.0,522.0,510.0,538.0,522.0,572.0,524.0,490.0,503.0,...,54.503464,20.976492,4,0,0,25221.0,76,Popular,1900s,0.0,0.0,0.0,1.0,0.0,1.0
28,ADELAIDE,F,194.0,188.0,172.0,177.0,166.0,152.0,170.0,159.0,146.0,143.0,146.0,147.0,142.0,...,-9999.000000,-9999.000000,8,0,0,7299.0,48,Common,1980s,0.0,0.0,0.0,0.0,0.0,1.0
29,ADELE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.000000,-9999.000000,5,0,0,1492.0,7,Uncommon,2010s,0.0,0.0,0.0,0.0,0.0,1.0
31,ADELINE,F,224.0,218.0,204.0,227.0,227.0,215.0,179.0,182.0,198.0,168.0,174.0,158.0,156.0,...,-100.000000,-9999.000000,7,0,0,52483.0,85,Very Popular,1980s,0.0,0.0,0.0,1.0,0.0,1.0
42,ADRIEN,M,848.0,926.0,972.0,987.0,1018.0,981.0,1034.0,1034.0,1058.0,1048.0,1088.0,952.0,1108.0,...,-53.345900,-37.022901,6,0,0,137757.0,116,Very Popular,1990s,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2888,YVETTE,F,0.0,98.0,156.0,179.0,241.0,320.0,360.0,427.0,498.0,611.0,722.0,881.0,1097.0,...,-9999.000000,-9999.000000,6,0,0,202783.0,73,Very Popular,1930s,0.0,0.0,0.0,1.0,0.0,1.0
2889,YVON,M,0.0,0.0,0.0,0.0,0.0,85.0,94.0,93.0,143.0,136.0,135.0,150.0,191.0,...,-9999.000000,-9999.000000,4,0,0,38992.0,70,Very Popular,1950s,0.0,0.0,0.0,0.0,0.0,1.0
2890,YVONNE,F,5096.0,5715.0,5930.0,6085.0,6771.0,6929.0,6973.0,7142.0,7428.0,7359.0,7663.0,7407.0,7848.0,...,-9999.000000,-9999.000000,6,0,0,256118.0,69,Very Popular,1920s,0.0,0.0,0.0,0.0,0.0,1.0
2915,ZOE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.000000,-9999.000000,3,0,0,4603.0,8,Common,2010s,0.0,0.0,1.0,1.0,0.0,1.0


In [153]:
# origin names
#6 italian names

In [154]:
italian_names = []
for i in range(1,2):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:19/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        italian_names.append(name)

# Create a DataFrame with the names
italian_names = pd.DataFrame(italian_names, columns=['names'])
print(italian_names.shape)
italian_names.head()

Status code: 200
(129, 1)


Unnamed: 0,names
0,STELLA
1,VIRGINIA
2,ANDREA
3,ANGELINA
4,ANTONIO


In [155]:
italian_names_result = dealing_accent(italian_names)
italian_names_result.shape

(129, 1)

In [156]:
pd.set_option('display.max_rows', 10)

In [157]:
names_per_year = adding_to_df(italian_names_result, names_per_year, 'italian_names')


In [158]:
names_per_year.loc[names_per_year['italian_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names
21,ADA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9999.000000,3,0,0,84.0,1,Very Rare,2020s,0.0,0.0,0.0,0.0,0.0,0.0,1.0
40,ADRIANO,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.000000,7,0,0,935.0,10,Rare,2010s,0.0,0.0,0.0,0.0,0.0,0.0,1.0
86,ALESSIO,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,110.902256,7,0,0,7610.0,19,Popular,2010s,0.0,0.0,0.0,0.0,0.0,0.0,1.0
106,ALIDA,F,0.0,0.0,0.0,96.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-9999.000000,5,0,0,180.0,2,Very Rare,1900s,0.0,0.0,0.0,0.0,0.0,0.0,1.0
127,ALONZO,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-9999.000000,6,0,0,80.0,1,Very Rare,2010s,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2726,TIMEO,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.000000,5,0,0,16228.0,17,Popular,2000s,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2772,VALENTINA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,120.000000,9,0,0,3771.0,11,Common,2010s,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2774,VALENTINO,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.370787,9,0,0,793.0,9,Rare,2010s,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2790,VINCENZO,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-9999.000000,8,0,0,168.0,2,Very Rare,1960s,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [159]:
# origin names
#7 celt names

In [160]:
celt_names = []
for i in range(1,5):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:54/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        celt_names.append(name)

# Create a DataFrame with the names
celt_names = pd.DataFrame(celt_names, columns=['names'])
print(celt_names.shape)
celt_names.head()

Status code: 200
Status code: 200
Status code: 200
Status code: 200
(433, 1)


Unnamed: 0,names
0,AUDREY
1,KELLY
2,NOLAN
3,JENNA
4,CLYDE


In [161]:
celt_names_result = dealing_accent(celt_names)
celt_names_result.shape

(433, 1)

In [162]:
names_per_year = adding_to_df(celt_names_result, names_per_year, 'celt_names')

In [163]:
names_per_year.loc[names_per_year['celt_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names
13,ABIGAELLE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9,0,0,539.0,5,Rare,2000s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
47,AEDAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5,0,0,338.0,3,Very Rare,2010s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
56,AIDAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5,0,0,1007.0,9,Rare,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
69,ALAIN,M,97.0,118.0,125.0,138.0,151.0,152.0,145.0,167.0,162.0,160.0,211.0,188.0,195.0,...,5,0,0,506203.0,105,Very Popular,1950s,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
71,ALAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,0,0,20326.0,50,Popular,1990s,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2749,TRISTAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7,0,0,41495.0,54,Very Popular,2000s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2750,TRYSTAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7,0,0,363.0,4,Very Rare,2000s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2839,YANNICK,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7,0,1,4265.0,29,Common,1960s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2840,YANNICK,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7,0,1,85533.0,68,Very Popular,1970s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [164]:
# origin names
#8 english names

In [165]:
english_names = []
for i in range(1,7):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:69/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        english_names.append(name)

# Create a DataFrame with the names
english_names = pd.DataFrame(english_names, columns=['names'])
print(english_names.shape)
english_names.head()

Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
(697, 1)


Unnamed: 0,names
0,AVERY
1,ISAAC
2,RILEY
3,ELLA
4,STEVEN


In [166]:
english_names_result = dealing_accent(english_names)
english_names_result.shape

(564, 1)

In [167]:
names_per_year = adding_to_df(english_names_result, names_per_year, 'english_names')

In [168]:
names_per_year.loc[names_per_year['english_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,hyphen,neutral,given_total,count,category,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names,english_names
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,33494.0,25,Very Popular,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,2456.0,16,Uncommon,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
14,ABIGAIL,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,286.0,3,Very Rare,2000s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
18,ABRAHAM,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,91.0,1,Very Rare,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
28,ADELAIDE,F,194.0,188.0,172.0,177.0,166.0,152.0,170.0,159.0,146.0,143.0,146.0,147.0,142.0,...,0,0,7299.0,48,Common,1980s,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2893,ZACHARY,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1685.0,17,Uncommon,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2894,ZACK,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1760.0,12,Uncommon,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2901,ZAYN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,2570.0,7,Uncommon,2020s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2915,ZOE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,4603.0,8,Common,2010s,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0


In [169]:
# origin names
#9 germanic names

In [170]:
germanic_names = []
for i in range(1,3):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:80/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        germanic_names.append(name)

# Create a DataFrame with the names
germanic_names = pd.DataFrame(germanic_names, columns=['names'])
print(germanic_names.shape)
germanic_names.head()

Status code: 200
Status code: 200
(263, 1)


Unnamed: 0,names
0,OSCAR
1,ERIC
2,AUDREY
3,ERNEST
4,LEONARD


In [171]:
germanic_names_result = dealing_accent(germanic_names)
germanic_names_result.shape

(268, 1)

In [172]:
names_per_year = adding_to_df(germanic_names_result, names_per_year, 'germanic_names')

In [173]:
names_per_year.loc[names_per_year['germanic_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,neutral,given_total,count,category,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names,english_names,germanic_names
28,ADELAIDE,F,194.0,188.0,172.0,177.0,166.0,152.0,170.0,159.0,146.0,143.0,146.0,147.0,142.0,...,0,7299.0,48,Common,1980s,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
29,ADELE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1492.0,7,Uncommon,2010s,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
31,ADELINE,F,224.0,218.0,204.0,227.0,227.0,215.0,179.0,182.0,198.0,168.0,174.0,158.0,156.0,...,0,52483.0,85,Very Popular,1980s,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
35,ADOLPHE,M,495.0,575.0,560.0,545.0,540.0,560.0,554.0,588.0,593.0,584.0,584.0,506.0,511.0,...,0,15335.0,43,Popular,1900s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
44,ADÈLE,F,661.0,625.0,680.0,645.0,583.0,621.0,562.0,579.0,637.0,514.0,584.0,476.0,556.0,...,0,44086.0,82,Very Popular,2010s,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2766,ULRICH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1161.0,13,Rare,1980s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2777,VALERY,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,3358.0,17,Common,1970s,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2815,WILFRIED,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,12177.0,32,Popular,1980s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2816,WILLIAM,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,89.0,84.0,94.0,0.0,83.0,94.0,...,0,77753.0,112,Very Popular,1990s,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0


In [174]:
# origin names
#10 turkish names

In [175]:
turkish_names = []
for i in range(1,6):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:90/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        turkish_names.append(name)

# Create a DataFrame with the names
turkish_names = pd.DataFrame(turkish_names, columns=['names'])
print(turkish_names.shape)
turkish_names.head()

Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
(687, 1)


Unnamed: 0,names
0,AVA
1,SARA
2,ADAM
3,LENA
4,NEIL


In [176]:
turkish_names_result = dealing_accent(turkish_names)
turkish_names_result.shape

(687, 1)

In [177]:
names_per_year = adding_to_df(turkish_names_result, names_per_year, 'turkish_names')

In [178]:
names_per_year.loc[names_per_year['turkish_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,given_total,count,category,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names,english_names,germanic_names,turkish_names
10,ABDULLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,162.0,2,Very Rare,2010s,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
21,ADA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,84.0,1,Very Rare,2020s,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
23,ADAM,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,74815.0,38,Very Popular,2010s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
26,ADAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,260.0,3,Very Rare,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
32,ADEM,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,11129.0,24,Popular,2010s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2609,SOULEYMANE,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4001.0,17,Common,2010s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2669,TARIK,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2325.0,17,Uncommon,1980s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2843,YASIN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2034.0,19,Uncommon,2000s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2882,YUSUF,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3296.0,20,Common,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [179]:
# origin names
#11 biblic names

In [180]:
biblic_names = []
for i in range(1,2):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:35/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        biblic_names.append(name)

# Create a DataFrame with the names
biblic_names = pd.DataFrame(biblic_names, columns=['names'])
print(biblic_names.shape)
biblic_names.head()

Status code: 200
(87, 1)


Unnamed: 0,names
0,FELIX
1,GABRIEL
2,GABRIELLE
3,NICOLAS
4,PAUL


In [181]:
biblic_names_result = dealing_accent(biblic_names)
biblic_names_result.shape

(91, 1)

In [182]:
names_per_year = adding_to_df(biblic_names_result, names_per_year, 'biblic_names')

In [183]:
names_per_year.loc[names_per_year['biblic_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,count,category,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names,english_names,germanic_names,turkish_names,biblic_names
11,ABEL,M,428.0,506.0,519.0,499.0,476.0,522.0,510.0,538.0,522.0,572.0,524.0,490.0,503.0,...,76,Popular,1900s,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
13,ABIGAELLE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5,Rare,2000s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
18,ABRAHAM,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,Very Rare,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
23,ADAM,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38,Very Popular,2010s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
41,ADRIEL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5,Rare,2020s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2512,SARA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,52,Very Popular,2000s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2513,SARAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,61,Very Popular,1990s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2633,SUZANNE,F,4954.0,5328.0,5620.0,5775.0,6059.0,6283.0,6740.0,6813.0,7053.0,7135.0,7543.0,7194.0,7466.0,...,123,Very Popular,1920s,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2694,THEOPHILE,M,309.0,290.0,290.0,318.0,325.0,312.0,291.0,299.0,342.0,277.0,323.0,270.0,287.0,...,61,Popular,1900s,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [184]:
# origin names
#12 viking names

In [185]:
viking_names = []
for i in range(1,2):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:76/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        viking_names.append(name)

# Create a DataFrame with the names
viking_names = pd.DataFrame(viking_names, columns=['names'])
print(viking_names.shape)
viking_names.head()

Status code: 200
(98, 1)


Unnamed: 0,names
0,ERIKA
1,OSCAR
2,ERIC
3,KARINE
4,ERIK


In [186]:
viking_names_result = dealing_accent(viking_names)
viking_names_result.shape

(99, 1)

In [187]:
names_per_year = adding_to_df(viking_names_result, names_per_year, 'viking_names')

In [188]:
names_per_year.loc[names_per_year['viking_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,category,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names,english_names,germanic_names,turkish_names,biblic_names,viking_names
289,ASTRID,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Popular,1980s,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
399,BRENDA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Common,1990s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
726,ELSA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Very Popular,2000s,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
727,ELVIS,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Rare,1980s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
778,ERIC,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Very Popular,1960s,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220,NILS,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Common,2000s,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2297,OSCAR,M,148.0,154.0,146.0,153.0,167.0,163.0,178.0,162.0,174.0,165.0,189.0,167.0,155.0,...,Popular,2010s,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2601,SOREN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Common,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2638,SWANN,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Uncommon,2010s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [189]:
# origin names
#13 mythology names

In [190]:
mythology_names = []
for i in range(1,2):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:81/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        mythology_names.append(name)

# Create a DataFrame with the names
mythology_names = pd.DataFrame(mythology_names, columns=['names'])
print(mythology_names.shape)
mythology_names.head()

Status code: 200
(53, 1)


Unnamed: 0,names
0,DIANE
1,CONSTANCE
2,LAETITIA
3,ASIA
4,ATHENA


In [191]:
mythology_names_result = dealing_accent(mythology_names)
mythology_names_result.shape

(54, 1)

In [192]:
names_per_year = adding_to_df(mythology_names_result, names_per_year, 'mythology_names')

In [193]:
names_per_year.loc[names_per_year['mythology_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names,english_names,germanic_names,turkish_names,biblic_names,viking_names,mythology_names
20,ACHILLE,M,205.0,256.0,251.0,234.0,226.0,234.0,261.0,215.0,265.0,243.0,268.0,215.0,227.0,...,2010s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
307,AURORE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1980s,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
529,CLEMENCE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2010s,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
541,CLÉMENCE,F,638.0,699.0,669.0,645.0,648.0,610.0,624.0,594.0,590.0,542.0,533.0,449.0,478.0,...,2000s,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
554,CONSTANCE,F,210.0,240.0,225.0,203.0,241.0,194.0,243.0,220.0,192.0,160.0,166.0,194.0,167.0,...,2010s,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1440,LAETITIA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1980s,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1671,LUNA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1762,MAIA,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2000s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2524,SELENE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2000s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [194]:
# origin names
#13 fiction names => here I have to add 3 scraping (Disney, fairy, heros)

In [195]:
disney_names = []
for i in range(1,2):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:77/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        disney_names.append(name)

# Create a DataFrame with the names
disney_names = pd.DataFrame(disney_names, columns=['names'])
print(disney_names.shape)
disney_names.head()

Status code: 200
(109, 1)


Unnamed: 0,names
0,GEORGES
1,HANNAH
2,JANE
3,JASMINE
4,HENRY


In [196]:
fairy_names = []
for i in range(1,2):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:36/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        fairy_names.append(name)

# Create a DataFrame with the names
fairy_names = pd.DataFrame(fairy_names, columns=['names'])
print(fairy_names.shape)
fairy_names.head()

Status code: 200
(62, 1)


Unnamed: 0,names
0,CINDY
1,ALICE
2,DAISY
3,ARTHUR
4,KAYLA


In [197]:
heros_names = []
for i in range(1,3):
    # assemble the url:
    start_at= str(i)
    url = "https://unprenom.fr/tous/categories:37/page-"+str(i)+"/"

    # download html with a get request:
    response = requests.get(url)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the div containing the names
    name_elements = soup.select('body > div > div > div > div > div > ul >a> li > div  ')

    # Extract names from each name element
    for element in name_elements:
        # Find the name within the element
        name = element.find('h5', class_="product-name product_title").text.strip()
        heros_names.append(name)

# Create a DataFrame with the names
heros_names = pd.DataFrame(heros_names, columns=['names'])
print(heros_names.shape)
heros_names.head()

Status code: 200
Status code: 200
(141, 1)


Unnamed: 0,names
0,CHLOÉ
1,TERRY
2,MELINDA
3,HUGO
4,THEODORE


In [198]:
fiction_names = pd.concat([heros_names, disney_names, fairy_names], axis=0, ignore_index=True)

In [199]:
fiction_names.shape

(312, 1)

In [200]:
fiction_names

Unnamed: 0,names
0,CHLOÉ
1,TERRY
2,MELINDA
3,HUGO
4,THEODORE
...,...
307,TARZAN
308,AARICIA
309,PIMPRENELLE
310,SASHKA


In [201]:
fiction_names_result = dealing_accent(fiction_names)
fiction_names_result.shape

(278, 1)

In [202]:
names_per_year = adding_to_df(fiction_names_result, names_per_year, 'fiction_names')

In [203]:
names_per_year.loc[names_per_year['fiction_names']==1]

Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names,english_names,germanic_names,turkish_names,biblic_names,viking_names,mythology_names,fiction_names
20,ACHILLE,M,205.0,256.0,251.0,234.0,226.0,234.0,261.0,215.0,265.0,243.0,268.0,215.0,227.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
28,ADELAIDE,F,194.0,188.0,172.0,177.0,166.0,152.0,170.0,159.0,146.0,143.0,146.0,147.0,142.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
38,ADRIAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50,AGNÈS,F,422.0,433.0,464.0,448.0,494.0,497.0,472.0,502.0,547.0,552.0,538.0,572.0,568.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
76,ALBERT,M,3357.0,3832.0,4086.0,4145.0,4306.0,4410.0,4383.0,4649.0,4699.0,4864.0,5022.0,4621.0,4832.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2745,TONY,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2749,TRISTAN,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2798,VIVIANE,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2812,WENDY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


## Df to store

In [204]:
# I want this column to stay numerical.
# -9999 means the name is not given

In [206]:
still_given = names_per_year[names_per_year['given_2018_yn']==1]

In [207]:
still_given_pop = still_given[still_given['popularity_5y']>0]

In [208]:
print(still_given.shape)
print(still_given_pop.shape)
display(still_given.head(5))
still_given_pop.head(5)

(1444, 153)
(587, 153)


Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names,english_names,germanic_names,turkish_names,biblic_names,viking_names,mythology_names,fiction_names
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ABDALLAH,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,ABDOUL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,names,sexe,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,...,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names,english_names,germanic_names,turkish_names,biblic_names,viking_names,mythology_names,fiction_names
0,AALIYAH,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AARON,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ABBY,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,ABDOUL,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,ABDOULAYE,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Export CSV

In [209]:
names_per_year.to_csv('C:/Users/tella/OneDrive/Dokumente/Ironhack/W9 Project/names_per_year.csv')

In [210]:
still_given.to_csv('C:/Users/tella/OneDrive/Dokumente/Ironhack/W9 Project/still_given.csv')
still_given_pop.to_csv('C:/Users/tella/OneDrive/Dokumente/Ironhack/W9 Project/still_given_pop.csv')

In [211]:
#data to analye: create the df names
# drop the columns years


In [213]:
names = names_per_year.drop(['1900', '1901', '1902', '1903', '1904', '1905',
       '1906', '1907', '1908', '1909', '1910', '1911', '1912', '1913',
       '1914', '1915', '1916', '1917', '1918', '1919', '1920', '1921',
       '1922', '1923', '1924', '1925', '1926', '1927', '1928', '1929',
       '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937',
       '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945',
       '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953',
       '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961',
       '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969',
       '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985',
       '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022'], axis=1)

In [217]:
print(names.shape)
names.head()

(2954, 30)


Unnamed: 0,names,sexe,given_since2013,given_since2018,given_2013_yn,given_2018_yn,popularity_10y,popularity_5y,lenght,hyphen,neutral,given_total,count,category,decade_popularity,flower_names,arabic_names,greek_names,american_names,latin_names,french_names,italian_names,celt_names,english_names,germanic_names,turkish_names,biblic_names,viking_names,mythology_names,fiction_names
0,AALIYAH,F,2166.0,1245.0,1,1,101.37931,53.684211,7,0,0,2714.0,14,Uncommon,2010s,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AARON,M,22669.0,11797.0,1,1,19.926586,1.644444,5,0,0,33494.0,25,Very Popular,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ABBY,F,1791.0,1100.0,1,1,107.894737,26.06383,4,0,0,2456.0,16,Uncommon,2010s,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ABDALLAH,M,2297.0,929.0,1,1,-40.384615,-36.213992,8,0,0,3915.0,21,Common,2010s,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABDEL,M,80.0,0.0,1,0,-100.0,-9999.0,5,0,0,1430.0,16,Uncommon,1980s,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [218]:
names.to_csv('C:/Users/tella/OneDrive/Dokumente/Ironhack/W9 Project/names.csv')