In [None]:
# Standard library
import copy
import os

# Third-party: imported from their own packages instead of relying on the names
# that seaborn.utils happens to import internally (not a public re-export).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Register the sibling "Presentation" folder on the module search path once,
# so modules stored there can be imported by later cells.
my_pkg = "..//Presentation"
imp_mods = os.sys.path
already_registered = my_pkg in imp_mods
if not already_registered:
    imp_mods.append(my_pkg)
    print("Imported successfully")

In [None]:
from census_methods import CensusDataset as cd 

In [None]:
# Show the entries of the current working directory (where the CSV below is read from).
os.listdir()

In [None]:
# Load the census table from a CSV in the working directory.
# NOTE(review): the file name suggests marital status and religion were already cleaned — confirm.
census_df = pd.read_csv('final_Marital_Status_and_Religion_cleaned.csv')

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
census_df.info()

In [None]:
# Preview the first rows of the loaded frame.
census_df.head()

## Descriptive Statistics

In [None]:
# Summary statistics of the numeric columns, one row per column.
census_df.describe().T

In [None]:
# Summary statistics of the object (string) columns, one row per column.
census_df.describe(include='object').T

___

___

## AGE DISTRIBUTIONS

In [None]:
# Age histogram with one overlaid layer per gender.
sns.histplot(
    x='Age',
    hue='Gender',
    multiple='layer',
    data=census_df,
)

#### Maximum Age

In [None]:
# Oldest recorded age overall and within each gender.
age_col = census_df['Age']
gender_col = census_df['Gender']
for who, ages in (
    ("Overall", age_col),
    ("Male", age_col[gender_col == 'M']),
    ("Female", age_col[gender_col == 'F']),
):
    print(f"{who} maximum Age: {ages.max()}")

#### Minimum Age

In [None]:
# Youngest recorded age overall and within each gender.
# The labels now say "minimum" to match the .min() values being printed.
print(f"Overall minimum Age: {census_df['Age'].min()}")
print(f"Male minimum Age: {census_df.loc[census_df['Gender'] == 'M', 'Age'].min()}")
print(f"Female minimum Age: {census_df.loc[census_df['Gender'] == 'F', 'Age'].min()}")

#### Average Age

In [None]:
# Mean age, rounded to the nearest whole year, overall and per gender.
all_ages = census_df['Age']
genders = census_df['Gender']
for group, ages in (
    ("entire", all_ages),
    ("male", all_ages[genders == 'M']),
    ("female", all_ages[genders == 'F']),
):
    print(f"The average age of the {group} population is {np.int_(np.round(ages.mean(), 0))} years")

In [None]:
# Edges of the 5-year age brackets, starting at 0.
# The brackets are built left-closed ([a, b)) by pd.cut(..., right=False), so the
# last edge must be strictly greater than the maximum age; the extra +1 keeps a
# maximum that is an exact multiple of 5 from falling outside every bracket.
age_boundaries = list(range(0, census_df['Age'].max() + 5 + 1, 5))

In [None]:
# Show the 5-year bracket edges.
print(age_boundaries)

In [None]:
# Assign every resident to a left-closed 5-year bracket, e.g. [0, 5), [5, 10), ...
age_brackets = pd.cut(
    census_df['Age'],
    bins=age_boundaries,
    right=False,
    include_lowest=True,
)
age_brackets

In [None]:
# Residents per 5-year bracket, ordered from the youngest bracket upward.
age_group_freq = (
    age_brackets
    .value_counts()
    .sort_index()
)
age_group_freq

In [None]:
# Bracket counts as a percentage of all rows in census_df, to 2 decimals.
age_group_freq_perc = (100 * age_group_freq / census_df.shape[0]).round(2)
age_group_freq_perc

In [None]:
# Split the brackets into two colour groups for plotting:
# fewer than `threshold` residents versus `threshold` or more.
threshold = 400
is_small_bracket = age_group_freq < threshold
below_400 = age_group_freq.loc[is_small_bracket]
above_400 = age_group_freq.loc[~is_small_bracket]

In [None]:
# Brackets with fewer than `threshold` residents.
below_400

In [None]:
# Brackets with at least `threshold` residents.
above_400

In [None]:
# Percentage share of every bracket holding at least 400 residents
# (described in this notebook as the population aged 5 to 59 years).
in_large_bracket = age_group_freq >= 400
majority_population = age_group_freq_perc.loc[in_large_bracket]
majority_population

In [None]:
# Percentage of the population aged 0 to 34 years: the first seven left-closed
# 5-year brackets, [0, 5) through [30, 35).
df = age_group_freq_perc.reset_index()
# Slice the Series by position: the previous `.loc[:7]` was label-inclusive and
# therefore also summed an eighth bracket, [35, 40).
age_group_freq_perc.iloc[:7].sum()

In [None]:
# White grid background for the figures that follow, plus a two-step dark-blue palette.
sns.set_style('whitegrid')
pallete = sns.dark_palette('darkblue', 2)

In [None]:
# Horizontal bar chart of residents per 5-year bracket.
# Grey bars: brackets below the 400 threshold; dark blue: 400 and above.
fig, ax = plt.subplots(figsize=(6, 4), dpi=200)

below_400_plot = sns.barplot(y=below_400.index, x=below_400, color='grey', ax=ax)
above_400_plot = sns.barplot(y=above_400.index, x=above_400, color='darkblue', ax=ax)

ax.set_xlim(0, 700)

# Annotate each bar with "count --> percent%". Iterating the Series gives the
# values in row order, which avoids `series[i]` integer lookups on an
# interval-labelled index (ambiguous between label and position).
for row, (count, perc) in enumerate(zip(age_group_freq, age_group_freq_perc)):
    ax.text(count + 1, row, f"{count} --> {perc}%",
            fontsize=5, weight='bold')

ax.set_ylabel("Age Brackets")
ax.set_xlabel("Count")
ax.tick_params(labelsize=6)
ax.set_title("General Age Distribution with 5-year intervals")

# plot summary
ax.text(300, 18, "{}%".format(majority_population.sum()) +
        " of the population is aged between 5 to 59 years.",
        color='blue',
        bbox={'edgecolor': 'red', 'facecolor': 'white', 'alpha': 0.5},
        fontsize=5)

plt.show()

***General age distribution plot above shows that 77.49% of the population is aged between 5 to 59 years.***

In [None]:
# Pass the current figure and a target file name to CensusDataset.fig_writer (imported as `cd`).
# NOTE(review): presumably this writes the image to disk — confirm in census_methods.
fname = 'age_distr_5yr_intervals.png'
cd.fig_writer(fname, fig)

#### Age Distribution for Males vs Females

In [None]:
# Age columns split by gender code ('F' / 'M'); original row labels are kept.
gender_codes = census_df['Gender']
female_ages = census_df.loc[gender_codes == 'F', 'Age']
male_ages = census_df.loc[gender_codes == 'M', 'Age']

In [None]:
# Residents per 5-year bracket for each gender, oldest bracket first.
female_brackets = age_brackets.loc[female_ages.index]
male_brackets = age_brackets.loc[male_ages.index]
fem_age_freq = female_brackets.value_counts().sort_index(ascending=False)
mal_age_freq = male_brackets.value_counts().sort_index(ascending=False)

In [None]:
# Female counts per bracket (oldest bracket first).
fem_age_freq

In [None]:
# Male counts per bracket (oldest bracket first).
mal_age_freq

#### MALE/FEMALE RATIO
Difference between male and female population per age bracket

#### Number of males per 1000 females:
= 1000 * (male population / female population)

In [None]:
# Males per 1000 females, shown as a whole number.
# The ratio is rounded once by the `.0f` format; rounding to 2 decimals first and
# then formatting could push a value such as 999.4999 up to 1000.
print(f"For every 1000 females, there are {1000 * len(male_ages) / len(female_ages):.0f} males")

In [None]:
# Females per 1000 males in each 5-year bracket, rounded to a whole number.
# Brackets with no males give inf (or NaN for 0/0); both are removed because
# dropna() alone leaves infinities in place and they cannot be drawn as bars.
fem_per_male = (
    np.round(1000 * fem_age_freq / mal_age_freq, 0)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
fem_per_male

In [None]:
# Male count minus female count per 5-year bracket (a difference, despite the name):
# positive where males outnumber females,
# negative where females outnumber males
male_fem_ratio = mal_age_freq - fem_age_freq
male_fem_ratio

In [None]:
# Two side-by-side panels placed manually on one figure.
fig = plt.figure(figsize=(8, 6), dpi=200)
l = fig.add_axes([0, 0, 0.6, 1])
r = fig.add_axes([0.8, 0, 0.6, 1])

# Left panel: male-minus-female head count per age bracket.
sns.barplot(x=male_fem_ratio, y=male_fem_ratio.index, color='darkblue', ax=l)
l.set_xlim(-90, 90)
l.set_ylabel("Age Brackets", size=10)
l.set_xlabel("Count", size=10)
l.set_title("Difference Between Male and Female Population")

# Side labels: the positive side is male, the negative side is female.
for x_pos, side_label in ((50, 'Male'), (-80, 'Female')):
    l.text(x_pos, 20, side_label, color='white',
           bbox={'facecolor': 'black'})

# plot summary
note = ("Negative axis means more females\n"
        "Positive axis means more males")
l.text(10, 8, note,
       color='blue',
       bbox={'edgecolor': 'red', 'facecolor': 'white', 'alpha': 0.5},
       fontsize=10, fontweight='bold')

# Right panel: females per 1000 males per age bracket.
sns.barplot(x=fem_per_male, y=fem_per_male.index, color='darkblue', ax=r)
r.set_ylabel("Age Brackets", size=10)
r.set_xlabel("Female Count per 1000 males", size=10)
r.set_title("Number of Females Per 1000 Male Residents Per Age Bracket")

plt.show()

In [None]:
# Pass the two-panel gender figure and a target file name to CensusDataset.fig_writer.
# NOTE(review): presumably this writes the image to disk — confirm in census_methods.
fname = 'gender_population_diff.png'
cd.fig_writer(fname, fig)

In [None]:
# Population pyramid: male counts extend to the right, female counts are negated
# so they extend to the left of the same axis.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(6, 4), dpi=200)

male_plot = sns.barplot(x=mal_age_freq,
                        y=mal_age_freq.index,
                        order=mal_age_freq.index,
                        color='darkblue',
                        lw=0,
                        ax=ax)
fem_plot = sns.barplot(x=-1*fem_age_freq,
                       y=fem_age_freq.index,
                       order=fem_age_freq.index,
                       color='darkgreen',
                       lw=0,
                       ax=ax)

# Label each bar with its count. Iterating the Series gives values in row order,
# which avoids `series[i]` integer lookups on an interval-labelled index.
for row, (n_male, n_female) in enumerate(zip(mal_age_freq, fem_age_freq)):
    ax.text(n_male + 5, row, n_male, fontweight='bold',
            fontsize=5)
    ax.text(-1 * n_female - 20, row + 0.1, n_female, fontweight='bold',
            fontsize=5)

# Pin the tick positions before relabelling them, so the nine labels always
# line up with -400..400 regardless of the locator matplotlib would pick.
ax.set_xlim(-400, 400)
tick_positions = list(range(-400, 401, 100))
ax.set_xticks(tick_positions)
ax.set_xticklabels([abs(tick) for tick in tick_positions])
ax.set(xlabel='Population', ylabel='Age Brackets')

ax.text(250, 1, 'Male', color='white',
        bbox={'facecolor': 'darkblue'})
ax.text(-250, 1, 'Female', color='white',
        bbox={'facecolor': 'darkgreen'})

ax.tick_params(axis='y', labelsize=5)

ax.set_title("Population Pyramid Comparing Male/Female Age Distribution", fontsize=10);

In [None]:
# Pass the pyramid figure and a target file name to CensusDataset.fig_writer.
# NOTE(review): presumably this writes the image to disk — confirm in census_methods.
fname = 'age_pyramid_5yr_intervals.png'
cd.fig_writer(fname, fig)

In [None]:
# NOTE(review): this cell is an exact repeat of the previous cell (same file name,
# same figure), so the figure is handed to fig_writer twice — consider deleting it.
fname = 'age_pyramid_5yr_intervals.png'
cd.fig_writer(fname, fig)

#### Average age of male

#### Male age 40 and below

In [None]:
# Married males aged 40 or younger, and how many there are.
is_married_male = (census_df['Gender'] == 'M') & (census_df['Marital Status'] == 'Married')
is_forty_or_less = census_df['Age'] <= 40
husbands_forty_below = census_df.loc[is_married_male & is_forty_or_less]
husbands_forty_below.shape[0]

In [None]:
# All married males, regardless of age, and how many there are.
male_rows = census_df['Gender'] == 'M'
married_rows = census_df['Marital Status'] == 'Married'
husbands = census_df.loc[male_rows & married_rows]
husbands.shape[0]

In [None]:
# Percentage of married males who are aged 40 or younger.
100 * len(husbands_forty_below) / len(husbands)

Moving from one age bracket to the next, the male population

#### Female age 40 and below

In [None]:
# Married females aged 40 or younger, and how many there are.
is_married_female = (census_df['Gender'] == 'F') & (census_df['Marital Status'] == 'Married')
is_at_most_forty = census_df['Age'] <= 40
wives_forty_below = census_df.loc[is_married_female & is_at_most_forty]
wives_forty_below.shape[0]

In [None]:
# All married females, regardless of age, and how many there are.
# The gender filter is 'F': it previously selected 'M', which made `wives`
# identical to `husbands` and skewed the percentage computed from it.
wives = census_df.loc[(census_df['Gender'] == 'F') &
                      (census_df['Marital Status'] == 'Married')]
wives.shape[0]

In [None]:
# Percentage: count of wives_forty_below relative to count of wives.
100 * len(wives_forty_below) / len(wives)

#### 10 year interval

In [None]:
# Edges of the 10-year age brackets, starting at 0.
# The brackets are built left-closed ([a, b)) by pd.cut(..., right=False), so the
# last edge must be strictly greater than the maximum age; the extra +1 keeps a
# maximum that is an exact multiple of 10 from falling outside every bracket.
tenyr_age_boundaries = list(range(0, census_df['Age'].max() + 10 + 1, 10))
tenyr_age_boundaries

In [None]:
# Assign every resident to a left-closed 10-year bracket.
tenyr_age_brac = pd.cut(
    census_df['Age'],
    bins=tenyr_age_boundaries,
    include_lowest=True,
    right=False,
)
tenyr_age_brac

In [None]:
# Residents per 10-year bracket, youngest bracket first.
tenyr_age_freq = (
    tenyr_age_brac
    .value_counts()
    .sort_index()
)
tenyr_age_freq

In [None]:
# 10-year bracket counts as a percentage of all bracketed residents, to 2 decimals.
tenyr_age_perc = (100 * tenyr_age_freq / tenyr_age_freq.sum()).round(2)
tenyr_age_perc

In [None]:
# Percentage of the population below 50: the first five 10-year brackets.
# Sliced by position on the Series (independent of reset_index column names) and
# rounded like perc_pop_below_40 so float noise does not leak into printed text.
perc_pop_below_50 = np.round(tenyr_age_perc.iloc[:5].sum(), 2)

In [None]:
# Percentage of the population below 40: the first four 10-year brackets.
# Sliced by position on the Series, so it does not depend on the column names
# that reset_index() produces (these changed in pandas 2.0).
perc_pop_below_40 = np.round(tenyr_age_perc.iloc[:4].sum(), 2)
perc_pop_below_40

In [None]:
# Report both cumulative shares in a single two-line message.
below_age_summary = (
    f"{perc_pop_below_50}% of the population are below 50 years\n"
    f"{perc_pop_below_40}% of the population are below 40 years"
)
print(below_age_summary)

In [None]:
# 10-year bracket counts as a two-column frame: 'index' holds the bracket and
# 'Age' holds the count. The names are set explicitly because later cells read
# df['index'] and df['Age'], and a bare reset_index() names the columns
# differently from pandas 2.0 onward.
df = tenyr_age_freq.rename_axis('index').reset_index(name='Age')
df

In [None]:
# 'Age' column for row labels 0 through 4 (label slices include the end label).
df.loc[:4, 'Age']

In [None]:
# 'Age' column for row labels 5 and onward.
df.loc[5:, 'Age']

In [None]:
# Vertical bar chart of residents per 10-year bracket.
# Dark blue: the first five rows of df (below 50); grey: the remaining rows.
fig, ax = plt.subplots(figsize=(6, 4), dpi=200)

below_50_ax = sns.barplot(x=df.loc[:4, 'index'],
                          y=df.loc[:4, 'Age'],
                          color='darkblue',
                          ax=ax)

fifty_above_ax = sns.barplot(x=df.loc[5:, 'index'],
                             y=df.loc[5:, 'Age'],
                             color='grey',
                             ax=ax)

ax.set_ylim(0, 1400)

# Write the count above each bar. Iterating the Series gives values in row
# order, which avoids `series[i]` integer lookups on an interval-labelled index.
for col, count in enumerate(tenyr_age_freq):
    ax.text(col - 0.25, count + 1, count,
            fontsize=5, weight='bold')

ax.set_xlabel("Age Brackets")
ax.set_ylabel("Population")
ax.tick_params(labelsize=6)
ax.set_title("General Age Distribution with 10-year intervals")

# plot summary
note = (f"{perc_pop_below_40}% of the population are below 40 years\n"
        f"{perc_pop_below_50}% of the population are below 50 years")
ax.text(5, 1200, note,
        color='blue',
        bbox={'edgecolor': 'red', 'facecolor': 'white', 'alpha': 0.5},
        fontsize=8, fontweight='bold')

plt.show()

In [None]:
# Pass the 10-year distribution figure and a target file name to CensusDataset.fig_writer.
# NOTE(review): the file name says "pyramid" but the figure above is a plain bar chart — confirm the name.
fname = 'age_pyramid_10yr_intervals.png'
cd.fig_writer(fname, fig)

In [None]:
# Residents per left-closed 20-year bracket, using fixed edges from 0 to 120.
twenty_year_edges = [0, 20, 40, 60, 80, 100, 120]
twentyyr_gen_age_freq = (
    pd.cut(census_df['Age'], bins=twenty_year_edges, right=False, include_lowest=True)
    .value_counts()
    .sort_index()
)
twentyyr_gen_age_freq

In [None]:
# 20-year bracket counts as a percentage of all bracketed residents, to 2 decimals.
(100 * twentyyr_gen_age_freq / twentyyr_gen_age_freq.sum()).round(2)

#### Average Age per House

In [None]:
# Mean age of the residents at each address, cast to integer, in a one-column
# frame named 'Average Age' and indexed by address.
ave_age_per_addr = (
    census_df
    .groupby('Address')['Age']
    .mean()
    .astype(np.int_)
    .to_frame('Average Age')
)
ave_age_per_addr

#### Age Distribution of Household Average Ages

In [None]:
# Number of addresses whose average age falls in each left-closed 5-year bracket.
household_brackets = pd.cut(
    ave_age_per_addr['Average Age'],
    bins=age_boundaries,
    right=False,
    include_lowest=True,
)
ave_age_per_addr_freq = household_brackets.value_counts().sort_index()
ave_age_per_addr_freq

In [None]:
# Share (%) of addresses per average-age bracket, as a two-column frame:
# 'Age Bracket' holds the bracket and '%Age Count' the percentage (2 decimals).
# The columns are named directly instead of string-replacing whatever names
# reset_index() produced, because those names differ from pandas 2.0 onward.
ave_age_per_addr_overall_perc = (
    np.round(100 * ave_age_per_addr_freq / ave_age_per_addr_freq.sum(), 2)
    .rename_axis('Age Bracket')
    .reset_index(name='%Age Count')
)
ave_age_per_addr_overall_perc

In [None]:
# Brackets that account for more than 9% of addresses.
exceeds_nine_percent = ave_age_per_addr_overall_perc['%Age Count'] > 9
above_9_perc = ave_age_per_addr_overall_perc.loc[exceeds_nine_percent]
above_9_perc

In [None]:
# Every remaining bracket: those not above 9% of addresses.
over_nine_percent = ave_age_per_addr_overall_perc['%Age Count'] > 9
below_9_perc = ave_age_per_addr_overall_perc.loc[~over_nine_percent]
below_9_perc

In [None]:
# Row labels of the brackets at or below 9%; used as bar x-positions in the next plot.
below_9_perc['%Age Count'].index

In [None]:
# Row labels of the brackets above 9%; used as bar x-positions in the next plot.
above_9_perc['%Age Count'].index

In [None]:
# Bar chart of the share of addresses per average-age bracket.
# Green bars: brackets above 9%; black bars: all other brackets.
sns.set_style('darkgrid')
fig = plt.figure(figsize=(6, 4), dpi=200)

for subset, bar_color in ((above_9_perc, 'green'), (below_9_perc, 'black')):
    shares = subset['%Age Count']
    plt.bar(x=shares.index, height=shares, color=bar_color)

plt.xticks(ticks=ave_age_per_addr_overall_perc.index,
           labels=ave_age_per_addr_overall_perc['Age Bracket'],
           rotation=90, size=5)
plt.xlabel("Age Bracket", fontsize=8)
plt.ylabel("%Age Count", fontsize=8)

# Label only the non-zero bars with their percentage.
for pos in range(ave_age_per_addr_overall_perc.shape[0]):
    share = ave_age_per_addr_overall_perc.loc[pos, '%Age Count']
    if not share:
        continue
    plt.text(pos - 0.5, share + 0.5, f"{share}%", size=4, weight='bold')

summary = (f"The average ages of {np.round(above_9_perc['%Age Count'].sum(), 2)}% of Houses \n"
           f"range from 20 to 44 years old")
plt.text(12, 8, summary, size=8, color='blue',
         bbox={'edgecolor': 'red', 'facecolor': 'none'})

plt.title("Average Age Distribution per House");

In [None]:
# Pass the household average-age figure and a target file name to CensusDataset.fig_writer.
# NOTE(review): presumably this writes the image to disk — confirm in census_methods.
fname = 'average_count_per_house.png'
cd.fig_writer(fname, fig)

#### Retired residents

In [None]:
# AGE of retired resident
census_df.loc[census_df['Is Retired'] == 1, 'Age'].shape[0]/census_df.shape[0]

In [None]:
# Combined figure: 5-year distribution on the left, 10-year distribution on the right.
fig = plt.figure(figsize=(10, 8), dpi=200)
l, r = fig.add_axes([0, 0, 0.8, 1]), fig.add_axes([0.95, 0, 0.8, 1])

# Left panel: grey for brackets under 400 residents, dark blue for 400 and above.
below_400_plot = sns.barplot(y=below_400.index, x=below_400, color='grey', ax=l)
above_400_plot = sns.barplot(y=above_400.index, x=above_400, color='darkblue', ax=l)

# Right panel: dark blue for the first five rows of df (below 50), grey for the rest.
below_50_ax = sns.barplot(x=df.loc[:4, 'index'],
                          y=df.loc[:4, 'Age'],
                          color='darkblue',
                          ax=r)

fifty_above_ax = sns.barplot(x=df.loc[5:, 'index'],
                             y=df.loc[5:, 'Age'],
                             color='grey',
                             ax=r)

l.set_xlim(0, 800)
r.set_ylim(0, 1400)

# Bar annotations. Iterating the Series gives values in row order, which avoids
# `series[i]` integer lookups on an interval-labelled index.
for row, (count, perc) in enumerate(zip(age_group_freq, age_group_freq_perc)):
    l.text(count + 1, row, f"{count} ({perc}%)",
           fontsize=14, weight='bold')

for col, count in enumerate(tenyr_age_freq):
    r.text(col - 0.25, count + 1, count,
           fontsize=14, weight='bold')

l.set_ylabel("Age Brackets", size=15)
l.set_xlabel("Count", size=15)
r.set_xlabel("Age Brackets", size=15)
r.set_ylabel("Population", size=15)

l.tick_params(labelsize=10)
r.tick_params(labelsize=10)

r.set_title("General Age Distribution with 10-year intervals", size=18)
l.set_title("General Age Distribution with 5-year intervals", size=18)

# plot summary
l.text(200, 21, "{}%".format(majority_population.sum()) +
       " of the population is aged between 5 to 59 years.",
       color='blue',
       bbox={'edgecolor': 'red', 'facecolor': 'white', 'alpha': 0.5},
       fontsize=16, weight='bold')

note = (f"{perc_pop_below_40}% of the population are below 40 years\n"
        f"{perc_pop_below_50}% of the population are below 50 years")
r.text(4, 1300, note,
       color='blue',
       bbox={'edgecolor': 'red', 'facecolor': 'white', 'alpha': 0.5},
       fontsize=15, fontweight='bold')

plt.show()

In [None]:
# Pass the combined 5-year / 10-year figure and a target file name to CensusDataset.fig_writer.
# NOTE(review): presumably this writes the image to disk — confirm in census_methods.
fname = 'combo_gen_age_distr.png'
cd.fig_writer(fname, fig)