In [1]:
import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

> # Endings are not always bad.

![](https://i.pinimg.com/736x/f1/d6/ff/f1d6ffe71ba1ce8a384c180f570dd455--the-book-thief-notebook-covers.jpg)
# About Dataset
The dataset contains structured information on the life, work, and death of more than 1 million deceased famous people.

# Paper abstract (ICWSM proceedings)
We developed a five-step method and inferred birth and death years, binary gender, and occupation from community-submitted data to all language versions of the Wikipedia project. The dataset is the largest on notable deceased people and includes individuals from a variety of social groups, including but not limited to 107k females, 124 non-binary people, and 90k researchers, who are spread across more than 300 contemporary or historical regions. The final product provides new insights into the demographics of mortality in relation to gender and profession in history. The technical method demonstrates the usability of the latest text mining approaches to accurately clean historical data and reduce the missing values.

# Import Necessary Libraries

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio

# Load Dataset

In [3]:
# Load the age dataset CSV into the DataFrame used throughout the notebook
dataset_path = '../input/age-dataset/AgeDataset-V1.csv'
df = pd.read_csv(dataset_path)

# Explore Data Set

In [4]:
# Preview ten random rows. The seed is fixed so that a fresh
# "Restart Kernel -> Run All" always displays the same rows.
df.sample(10, random_state=42)

## Plotting to see Nulls

In [5]:
# Heatmap of the boolean "is null" mask of the data, drawn without a colour bar
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False)

In [6]:
# Percentage of missing values in each column
null_counts = df.isnull().sum()
percent_missing = null_counts * 100 / len(df)
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)
missing_value_df

# Data Preprocessing

In [7]:
# Show the dtype of every column (read_csv was called without explicit dtypes)
df.dtypes

In [8]:
# Print each distinct value of the Gender column on its own line
gender_labels = df['Gender'].unique()
for gender_label in gender_labels:
    print(gender_label)

In [9]:
# Collapse the two transgender labels into a single 'Transgender' category.
# Note: this rewrites the Gender column of df in place.
transgender_labels = ['Transgender Female', 'Transgender Male']
is_transgender = df['Gender'].isin(transgender_labels)
df.loc[is_transgender, 'Gender'] = 'Transgender'

In [10]:
data = df
# Number of rows per gender, sorted from most to least frequent
gender_df = (
    data.groupby('Gender')
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

In [11]:
# Display the per-gender row counts (sorted by Count, largest first)
gender_df

In [12]:
data = df
# Number of rows per country, sorted from most to least frequent
country_df = (
    data.groupby('Country')
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

In [13]:
# Ten countries with the highest row counts (country_df is sorted by Count, descending)
country_df.head(10)

In [14]:
data = df
# Number of rows per occupation, sorted from most to least frequent
occupation_df = (
    data.groupby('Occupation')
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

In [15]:
# Ten occupations with the highest row counts (occupation_df is sorted by Count, descending)
occupation_df.head(10)

In [16]:
data = df
# Number of rows per manner of death, sorted from most to least frequent
Manner_of_death = (
    data.groupby('Manner of death')
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

In [17]:
# Ten manners of death with the highest row counts (sorted by Count, descending)
Manner_of_death.head(10)

# EDA

## Top 15 Countries Distribution Chart

In [18]:
# Pie chart restricted to the 15 countries with the highest counts
country_dfa = country_df.head(15)

fig = px.pie(
    country_dfa,
    names='Country',
    values='Count',
    title='Top 15 Countries Distribution Chart',
    template='plotly_dark',
)
fig.show()

## Gender Distribution Chart

In [19]:
# Only the three most frequent gender categories are plotted;
# any remaining categories in gender_df are left out of the chart.
gender_dfa = gender_df.head(3)
fig = px.pie(
    gender_dfa,
    names='Gender',
    values='Count',
    title='Gender Distribution Chart',
    template='plotly_dark',
)
fig.show()

## Top 15 Occupations Among Famous People

In [20]:
# Take the 15 most common occupations, then re-sort them ascending for plotting
top_occupations = occupation_df.head(15)
occupation_dfa = top_occupations.sort_values('Count', ascending=True)

fig = px.bar(
    occupation_dfa,
    x='Count',
    y='Occupation',
    color='Count',
    title='Top 15 Occupations Among Famous People',
    template='plotly_dark',
    height=800,
)
fig.show()

In [21]:
# Preview the first five rows of df
df.head(5)

In [22]:
# Take the 10 most frequent manners of death, then re-sort them ascending for plotting
top_manners = Manner_of_death.head(10)
Manner_of_death_df = top_manners.sort_values('Count', ascending=True)

fig = px.bar(
    Manner_of_death_df,
    x='Count',
    y='Manner of death',
    color='Count',
    title='Top 10 Causes of Death',
    template='plotly_dark',
    height=800,
)
fig.show()

In [23]:
from wordcloud import WordCloud

# The 25 most frequent manners of death (Manner_of_death is sorted by
# descending Count), re-sorted ascending.
Manner_of_death_dfa = Manner_of_death.head(25).sort_values('Count', ascending = True)

# Map each manner of death to its row count so the word sizes reflect how
# often it occurs. Calling value_counts() on the 'Manner of death' column here
# would be wrong: the frame is already aggregated, so each manner appears
# exactly once and every word would receive the same frequency of 1.
data = Manner_of_death_dfa.set_index('Manner of death')['Count'].to_dict()
wc = WordCloud(width =1920,
        height =1080, background_color= 'black').generate_from_frequencies(data)

# Render the word cloud image without axes
plt.imshow(wc)
plt.axis('off')
plt.show()

![](https://images.unsplash.com/photo-1533601017-dc61895e03c0?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1170&q=80)