In [2]:
import numpy as np
import pandas as pd
import wget
import os
import zipfile

## Downloading and unzipping data

In [3]:
if not os.path.exists("names.zip"):
    url = 'https://www.ssa.gov/oact/babynames/names.zip'
    wget.download(url)
    with zipfile.ZipFile('names.zip', 'r') as zip_ref:
        zip_ref.extractall('txt_files/')

## Preparing the data

In [4]:
allyears_names = pd.concat(pd.read_csv(f'txt_files/yob{year}.txt', names=['name','sex','number']).assign(year=year) for year in range(1880, 2021))
allyears_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2020863 entries, 0 to 31270
Data columns (total 4 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   name    object
 1   sex     object
 2   number  int64 
 3   year    int64 
dtypes: int64(2), object(2)
memory usage: 77.1+ MB


In [5]:
allyears_names.head()

Unnamed: 0,name,sex,number,year
0,Mary,F,7065,1880
1,Anna,F,2604,1880
2,Emma,F,2003,1880
3,Elizabeth,F,1939,1880
4,Minnie,F,1746,1880


In [6]:
allyears_names.isna().sum() # checking for NaN values

name      0
sex       0
number    0
year      0
dtype: int64

In [7]:
allyears_grouped = allyears_names.drop('year',axis=1).groupby(['sex', 'name']).sum()
allyears_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,number
sex,name,Unnamed: 2_level_1
F,Aabha,46
F,Aabidah,5
F,Aabriella,51
F,Aada,13
F,Aadaya,9


In [8]:
male, female = allyears_grouped.loc['M'], allyears_grouped.loc['F']

## searching for names that are no more than 2 times more common for other sex

In [9]:
ratios = (male/female)
unisex = ratios[(ratios < 2) & (ratios > 0.5)].dropna().index
unisex

Index(['Aalijah', 'Aamari', 'Aari', 'Aaris', 'Aaryn', 'Aavyn', 'Abey', 'Abrar',
       'Abriel', 'Adair',
       ...
       'Zi', 'Ziel', 'Zihan', 'Zikora', 'Zixuan', 'Ziyan', 'Zoel', 'Zohar',
       'Zyian', 'Zyrie'],
      dtype='object', name='name', length=1685)

## top 10 most common unisex names

In [10]:
common = (male.loc[unisex] + female.loc[unisex]).sort_values(ascending=False, by='number').head(10)
common

Unnamed: 0_level_0,number
name,Unnamed: 1_level_1
Jessie,278828
Riley,214993
Casey,188567
Jackie,169458
Peyton,127644
Jaime,119014
Kerry,98322
Kendall,95946
Jody,87088
Frankie,75616
