In [None]:
import copy
import seaborn as sns
from seaborn.utils import np, plt, pd, os

In [None]:
# Put the sibling "Presentation" directory on the module search path
# (presumably where census_methods, imported in the next cell, lives — confirm).
my_pkg = "..//Presentation"
imp_mods = os.sys.path  # the live sys.path list; appending below mutates it in place
if my_pkg not in imp_mods:
    imp_mods.append(my_pkg)
    # NOTE(review): nothing is imported here; the message only signals that the path was appended.
    print("Imported successfully")

In [None]:
from census_methods import CensusDataset as cd 

In [None]:
# List the entries of the current working directory (presumably to locate the CSV loaded next).
os.listdir()

In [None]:
# Load the census table from a CSV addressed relative to the working directory.
census_df = pd.read_csv('final_Marital_Status_and_Religion_cleaned.csv')

In [None]:
# Column names, dtypes, non-null counts and memory usage.
census_df.info()

In [None]:
# Preview the first five rows.
census_df.head()

## Descriptive Statistics

In [None]:
# Summary statistics of the numeric columns, transposed to one row per column.
census_df.describe().transpose()

In [None]:
# Summary (count, unique, top, freq) of the object-dtype columns, one row per column.
census_df.describe(include='object').transpose()

___

___

## RESIDENCY DETAILS:

#### Number of Streets

In [None]:
# Count the distinct values in the 'Street' column.
len(census_df['Street'].unique())

#### Total Number of Houses

In [None]:
# Every distinct 'Address' value is counted as one house.
houses_in_the_city = len(census_df['Address'].unique())
houses_in_the_city

#### Number of Houses per resident
Houses per 1000 residents = (Total number of houses / Total population) * 1000<br>
Number of houses available per 1000 residents

In [None]:
# Houses available per 1000 residents: houses divided by residents, scaled by 1000.
houses_per_1000 = 1000 * houses_in_the_city / len(census_df)
print("There are {:.0f} houses for each 1000 resident".format(houses_per_1000))

#### House Occupancy Level
Occupancy level = Total Population/Number of Houses

In [None]:
# Mean number of residents per distinct address.
occupancy_lvl = len(census_df) / houses_in_the_city
print("Houses are occupied at rate of {:.0f} occupants per house".format(occupancy_lvl))

#### Number of Occupants per House Address

In [None]:
# Number of census rows (residents) recorded at each address, largest first.
house_occupancy = census_df['Address'].value_counts()
house_occupancy

In [None]:
# Addresses with at most one recorded resident.
one_occup_house = house_occupancy[house_occupancy.le(1)]
one_occup_house

In [None]:
# Addresses with exactly two recorded residents.
# The earlier `<= 2` filter also matched single-occupant houses, which inflated the
# "only two occupants" count and percentage reported from this variable.
two_occup_house = house_occupancy.loc[house_occupancy == 2]
two_occup_house

In [None]:
# Report how many addresses have a single occupant and their share of all addresses.
n_single, n_houses = one_occup_house.shape[0], house_occupancy.shape[0]
print(f"There {n_single} houses having only one occupant.\n"
      f"{100 * n_single/n_houses:.2f}% of total number of houses")

In [None]:
# Report the size of two_occup_house and its share of all addresses.
n_double, n_houses = two_occup_house.shape[0], house_occupancy.shape[0]
print(f"There {n_double} houses having only two occupants.\n"
      f"{100 * n_double/n_houses:.2f}% of total number of houses")

In [None]:
# Addresses with exactly two recorded residents.
two_occup_house = house_occupancy[house_occupancy.eq(2)]
two_occup_house

In [None]:
# Houses with more than one occupant = all houses minus the single-occupant ones.
# The message used to say "only one occupant", contradicting the number it printed.
multi_occup_count = house_occupancy.shape[0] - one_occup_house.shape[0]
print(f"There are {multi_occup_count} houses having more than one occupant.\n" +
      f"{100 * multi_occup_count/house_occupancy.shape[0]:.2f}% of total number of houses")

In [None]:
# Total residents across the addresses in two_occup_house (sum of the per-address counts).
two_occup_house.sum()

In [None]:
# Addresses with exactly two recorded residents (repeats a selection made in an earlier cell).
two_occup_house = house_occupancy.loc[house_occupancy == 2]
two_occup_house

In [None]:
# Total residents living in houses with exactly 3 occupants.
house_occupancy.loc[house_occupancy == 3].sum()

In [None]:
# Total residents living in houses with exactly 4 occupants.
house_occupancy.loc[house_occupancy == 4].sum()

In [None]:
# Total residents living in houses with exactly 5 occupants.
house_occupancy.loc[house_occupancy == 5].sum()

In [None]:
# Total residents living in houses with exactly 6 occupants.
house_occupancy.loc[house_occupancy == 6].sum()

In [None]:
# Total residents living in houses with exactly 7 occupants.
house_occupancy.loc[house_occupancy == 7].sum()

In [None]:
# Total residents living in houses with exactly 8 occupants.
house_occupancy.loc[house_occupancy == 8].sum()

In [None]:
# Total residents living in houses with exactly 9 occupants.
house_occupancy.loc[house_occupancy == 9].sum()

In [None]:
# Total residents living in houses with more than 5 occupants.
house_occupancy.loc[house_occupancy > 5 ].sum()

In [None]:
# Number of houses at each occupancy level
# (index: occupants per house, value: how many addresses have that many), ordered by level.
house_occupancy_freq = house_occupancy.value_counts().sort_index()
# house_occupancy_freq.columns = house_occupancy_freq.columns.str.replace("Address", "House Count (%)").str.replace("index", "Occupant Count")
house_occupancy_freq

In [None]:
# Residents in houses with more than 5 occupants:
# for each such level, house count x occupants per house, then summed.
crowded_levels = house_occupancy_freq.loc[house_occupancy_freq.index > 5]
dd = crowded_levels * np.array(crowded_levels.index)
dd.sum()

In [None]:
# Addresses with exactly two recorded residents.
house_occupancy.loc[house_occupancy == 2]

In [None]:
# All census rows whose address has exactly two recorded residents.
census_df.loc[census_df['Address'].isin(house_occupancy.loc[house_occupancy == 2].index)]

In [None]:
# Share of houses (in %) at each occupancy level, rounded to two decimals.
house_share = 100 * house_occupancy_freq / house_occupancy_freq.sum()
house_occupancy_perc = house_share.round(2)
house_occupancy_perc

In [None]:
# Occupancy levels whose share of houses exceeds 5%.
# NOTE(review): this filters on the percentage value, not on the occupant count, yet the
# plot further down labels this group "5 Occupants or less" — confirm the two selections
# coincide for this data set.
above_5_perc = house_occupancy_perc.loc[house_occupancy_perc > 5]
above_5_perc

In [None]:
# The remaining occupancy levels, i.e. every level not selected into above_5_perc.
below_6_perc = house_occupancy_perc.drop(index=above_5_perc.index)
below_6_perc

In [None]:
# Two-panel bar chart: the main axes show the `above_5_perc` levels, the side panel the rest.
sns.set_style(style='whitegrid')
fig = plt.figure(figsize=(6, 4), dpi=200)
ax1 = fig.add_axes([0, 0, 1, 1])
zoomed = fig.add_axes([1.09, 0.4, 0.6, 0.6])  # sits to the right of the main axes

abax = sns.barplot(x=above_5_perc.index, y=above_5_perc, color='green', ax=ax1)
beax = sns.barplot(x=below_6_perc.index, y=below_6_perc, color='red', ax=zoomed)

zoomed.set_xticklabels(zoomed.get_xticklabels(), size=6)

# Annotate every bar with "<house count> (<share>%)".
for i, lab in enumerate(above_5_perc.index):
    share = above_5_perc[lab]
    ax1.text(i-0.25, share+0.25, f"{house_occupancy_freq[lab]} ({share}%)",
             size=10, weight='bold')

for i, lab in enumerate(below_6_perc.index):
    share = below_6_perc[lab]
    zoomed.text(i, share+0.05, f"{house_occupancy_freq[lab]} ({share}%)",
                size=8, rotation=50, weight='bold')

ax1.set_ylabel("Number of Houses (in %)")
ax1.set_xlabel("Number of Occupants")
zoomed.set_ylabel("Number of Houses (in %)", size=7)
zoomed.set_xlabel("Number of Occupants", size=7)

# Coloured caption boxes naming each panel.
zoomed.text(2, 3, "Number of houses with more than 5 Occupants", color='white', size=8,
            bbox={'edgecolor':'none', 'facecolor':'red'},
            weight='bold')
ax1.text(0.9, 38, "Number of houses with 5 Occupants or less", color='white', size=12,
         bbox={'edgecolor':'none', 'facecolor':'green'},
         weight='bold')

zoomed.set_ylim(top=3)
zoomed.set_xlim(right=13)
ax1.set_ylim(top=40)

ax1.set_title("Percentage Number of Houses per Occupancy Level", weight='bold',
              y=1.05, size=15)

In [None]:
# Hand the current figure to the project helper together with a target file name.
# NOTE(review): cd.fig_writer comes from census_methods (not visible here); presumably it
# writes `fig` to `fname` — confirm the output location.
fname = 'percentage_occupancy_level.png'
cd.fig_writer(fname, fig)

#### Distribution of the number of people per house address

In [None]:
# Quartiles of occupants-per-house. np.int_ converts the percentiles to integers,
# so any fractional part is dropped before the IQR is taken.
q1, q3 = np.int_(np.percentile(house_occupancy, [25, 75]))
iqr = q3 - q1
print(f"Q1: {q1}, Q3: {q3}\nIQR: {iqr}")

In [None]:
# Outlier fences: 1.5 * IQR below Q1 and above Q3, converted to integers by np.int_.
iqr_coeff = iqr * 1.5
low_lim, upp_lim = np.int_([q1 - iqr_coeff, q3 + iqr_coeff])
print(low_lim, upp_lim)

In [None]:
# Box plot of occupants per address, annotated with the upper outlier fence.
sns.set_style(style='darkgrid')
fig = plt.figure(figsize=(3, 4), dpi=200)

bplot = sns.boxplot(y=house_occupancy)#, x=house_occupancy)
bplot.set_title("Box Plot Showing Number of People per House Address", fontsize=10)

# Text box placed at data coordinates (0.1, 9); the trailing ';' hides the Text repr
# from the cell output.
bplot.text(0.1, 9, f"Houses of more than {upp_lim}\n occupants are outliers",
           fontsize=5, fontweight='bold', color='blue',
            bbox={'edgecolor':'red', 'facecolor': 'white', 'alpha': 0.3});

Based on the above statistics and box plot, ***a house should normally have 8 or fewer occupants.***<br>
Therefore, I shall use this figure as the threshold for deciding whether a house is over-populated.

In [None]:
# Addresses with more than 8 recorded residents.
# NOTE(review): 8 is hard-coded from the preceding markdown; presumably it equals upp_lim
# computed above — confirm, or reference upp_lim directly.
overpopulated_houses = house_occupancy.loc[house_occupancy > 8]
overpopulated_houses

In [None]:
# Every address that is not in overpopulated_houses.
normal_occup_lvl = house_occupancy.drop(index=overpopulated_houses.index)
normal_occup_lvl

In [None]:
# (number of over-populated addresses, number of remaining addresses)
len(overpopulated_houses), len(normal_occup_lvl)

In [None]:
# Horizontal bar chart of the over-populated addresses, one bar per address.
sns.set_style(style='darkgrid')
fig = plt.figure(figsize=(6, 4), dpi=200)

outl = sns.barplot(x=overpopulated_houses, y=overpopulated_houses.index, color='darkblue')
outl.set_title(f"{overpopulated_houses.shape[0]} Over-populated Houses (With Above 8 Occupants)",
               fontsize=12, fontweight='bold')

outl.set_yticklabels(outl.get_yticklabels(), fontsize=4)
outl.set_xlabel("Number of Occupants")
outl.set_ylabel("House")

# Label each bar with its occupant count. Iterate over the values by position: the Series
# is indexed by address strings, so `overpopulated_houses[i]` relied on pandas treating
# integer keys as positions, a fallback that is deprecated.
for i, n_occupants in enumerate(overpopulated_houses.to_numpy()):
    outl.text(n_occupants + 0.15, i, n_occupants, fontweight='bold',
              fontsize=4);

#### Median Number of Occupants per House Address

In [None]:
# Median occupants per address, rounded to a whole number for reporting.
median_occup = np.round(house_occupancy.median(), decimals=0)
print(f"The median number of people per house is {median_occup:.0f}")

#### Average Number of Occupants per House Address

In [None]:
# Mean occupants per address, rounded to a whole number for reporting.
average_occup = np.round(house_occupancy.mean(), 0)
print(f"The Average number of people living in a house is {average_occup:.0f}")

#### Number of houses per Street

In [None]:
# Keep one row per distinct address, then count the houses on each street.
unique_houses = census_df[['Street', 'House Number', 'Address']].drop_duplicates(subset=['Address'])
street_housing = (
    unique_houses
    .drop(['Address'], axis=1)   # the address was only needed for de-duplication
    .groupby(by=['Street'])
    .count()                     # non-null house numbers per street
    .sort_values(by=['House Number', 'Street'], ascending=[False, False])
    .rename(columns={'House Number': 'Number of Houses'})
)
street_housing

In [None]:
# The fifteen streets with the most houses (street_housing is sorted in descending order).
top_15 = street_housing.head(15).reset_index()
top_15

In [None]:
# The fifteen streets with the fewest houses, restricted to streets with 1-10 houses.
# street_housing is sorted in descending order, so the tail holds the smallest counts.
# The variable name and its companion top_15 both say fifteen; the code previously took
# 20 rows while the comment said ten.
least_15 = street_housing.loc[street_housing['Number of Houses'].between(1, 10)].tail(15).reset_index()
least_15

In [None]:
# Two stacked bar charts: top_15 on the axes at y=0, least_15 on the axes at y=1.05.
sns.set_style(style='darkgrid')
fig = plt.figure(figsize=(8, 4), dpi=200)
t = fig.add_axes([0, 0, 0.9, 0.6])
b = fig.add_axes([0, 1.05, 0.9, 0.6])

# Both panels share the same encoding; only the data and the target axes differ.
bar_kwargs = dict(y='Number of Houses', x='Street', hue='Number of Houses', color='darkblue')
t15 = sns.barplot(data=top_15, ax=t, **bar_kwargs)
b15 = sns.barplot(data=least_15, ax=b, **bar_kwargs)

# Drop the legends, then turn the street names vertical.
t15.legend().remove()
b15.legend().remove()

for panel in (t15, b15):
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90)

#### Occupants per Street

In [None]:
# Number of census rows (residents) recorded on each street, largest first.
street_occupancy = census_df['Street'].value_counts()
street_occupancy

In [None]:
# Bar chart of residents per street, each bar labelled with its count.
sns.set_style(style='darkgrid')
fig = plt.figure(figsize=(10, 6), dpi=200)

plotter = sns.barplot(x=street_occupancy.index, y=street_occupancy, color='darkblue')
plotter.set_xticklabels(plotter.get_xticklabels(), rotation=90, fontsize=5)

plotter.set_title("Number of Occupants per Street")

# Iterate over the values by position: the Series is indexed by street names, so
# `street_occupancy[i]` relied on pandas treating integer keys as positions, a fallback
# that is deprecated.
for i, n_occupants in enumerate(street_occupancy.to_numpy()):
    plotter.text(i, n_occupants + 1, n_occupants,
                 fontweight='bold', fontsize='xx-small', rotation=90);

In [None]:
# Residents per street as a two-column frame: 'Street', 'Occupants'.
# It is rebuilt from census_df so the cell can be re-run safely; the previous version
# reset the index of its own result. The columns are named explicitly rather than by
# string-replacing 'index'/'Street', because value_counts() labels its result differently
# across pandas versions ('Street' vs 'count'), which left 'Occupants' on the wrong column.
street_occupancy = (
    census_df['Street']
    .value_counts()
    .rename_axis('Street')
    .reset_index(name='Occupants')
)
street_occupancy

In [None]:
# Mean number of residents per street.
ave_occup = street_occupancy['Occupants'].mean()
print(f"Average number of occupants: {ave_occup: .0f}")

In [None]:
# Median number of residents per street.
# NOTE: this rebinds median_occup, which an earlier cell set to the per-house median.
median_occup = street_occupancy['Occupants'].median()
print(f"Median number of occupants: {median_occup: .0f}")

In [None]:
# Quartiles and inter-quartile range of residents per street.
# q1, q3 and iqr are rebound here; earlier cells used them for the per-house figures.
occupants = street_occupancy['Occupants']
q1 = occupants.quantile(.25)
q3 = occupants.quantile(.75)
iqr = q3 - q1
print(f"Q1: {q1}\nQ3: {q3}\nIQR: {iqr}")

In [None]:
# Outlier fences: lower = Q1 - 1.5 * IQR, upper = Q3 + 1.5 * IQR
iqr_coeff = 1.5 * iqr
llim = q1 - iqr_coeff
ulim = q3 + iqr_coeff
print("Outliers for Occupancy levels is \n" +
      f"number of occupants below {llim: .0f} or above {ulim: .0f}")

In [None]:
# Box plot of residents per street, annotated with the upper outlier fence.
sns.set_style(style='darkgrid')
fig = plt.figure(figsize=(4, 4), dpi=200)

bxplot = sns.boxplot(y=street_occupancy['Occupants'])
bxplot.set_title("Box plot Showing the Distribution of Number of Occupants per Street")

bxplot.set_ylabel("Number of Occupants")

# ulim is a threshold on occupants per street; the annotation previously called them "houses".
bxplot.text(0.1, 284, f"Streets containing above {ulim:.0f} occupants \nare considered to be Outliers",
            fontsize=5, fontweight='bold',
            bbox={'edgecolor':'red', 'facecolor': 'white', 'alpha': 0.3})

In [None]:
# Streets whose occupant count falls outside the IQR fences (below llim or above ulim)
# are treated as outliers. Despite its name, this tuple holds occupant-count limits.
threshold_num_houses = (llim, ulim)
print(threshold_num_houses)

In [None]:
# Streets whose occupant count exceeds the upper fence.
outliers = street_occupancy.loc[street_occupancy['Occupants'] > ulim]
outliers

In [None]:
# Complement of the selection above ulim: every street not exceeding the upper fence.
non_outliers = street_occupancy.loc[~(street_occupancy['Occupants'] > ulim)]
non_outliers

In [None]:
# Residents per street on a single shared scale: outlier streets in red, the rest in green.
# Both groups used to be drawn on twin axes against the integer row index, so the bars of
# the two groups overlapped at the same x positions, used different y scales, and the
# street names never reached the axis.
sns.set_style(style='darkgrid')
fig = plt.figure(figsize=(10, 6), dpi=200)
ax1 = fig.add_axes([0, 0, 1, 1])

# Street names are categorical x values, so the second call appends its bars after the first.
ax1.bar(outliers['Street'], outliers['Occupants'], color='red',
        label=f"Outliers (above {ulim:.0f} occupants)")
ax1.bar(non_outliers['Street'], non_outliers['Occupants'], color='green',
        label="Within the normal range")

ax1.tick_params(axis='x', labelrotation=90, labelsize=5)
ax1.margins(x=0.01)
ax1.set_xlabel("Street")
ax1.set_ylabel("Number of Occupants")
ax1.legend()

ax1.set_title("Number of Occupants per Street");

#### Number of Families in the city

In [None]:
# Number of distinct surnames.
# NOTE(review): this treats every surname as one family; unrelated households sharing a
# surname are counted once — confirm that this proxy is intended.
census_df['Surname'].unique().shape[0]

#### Size of Each Family

#### Number of Family Members (same Surname) per House Address

In [None]:
# People sharing a surname at the same address, largest groups first.
family_members = (
    census_df[['Address', 'Surname', 'First Name']]
    .groupby(by=['Address', 'Surname'])
    .count()                                   # non-null first names per (address, surname)
    .reset_index()
    .rename(columns={'First Name': 'Count'})
    .sort_values('Count', ascending=False)
)
family_members

In [None]:
# Spot check: every (address, surname) group whose surname is 'Smith'.
family_members.loc[family_members['Surname'] == 'Smith']

In [None]:
# Spot check: all residents recorded at one specific address.
census_df.loc[census_df['Address'] == '23, James Views']

#### Number of Households per Address

In [None]:
# Number of 'Head' rows per address, i.e. households per address.
# The count column is named directly through reset_index(name=...). It used to be renamed
# by replacing the label '0', which only matches older pandas; newer versions label the
# column 'count', so later ['Number of Households'] lookups raised KeyError.
households_per_address = (
    census_df.loc[census_df['Relationship to Head of House'] == 'Head',
                  ['Relationship to Head of House', 'Address']]
    .value_counts()
    .reset_index(name='Number of Households')
)

In [None]:
# Display the households-per-address table.
households_per_address

In [None]:
# Addresses with more than one 'Head' row, i.e. more than one household.
multiple_heads = households_per_address.loc[households_per_address['Number of Households'] > 1]
multiple_heads

In [None]:
# All residents at the first address listed in multiple_heads.
census_df.loc[census_df['Address'] == multiple_heads['Address'].values[0]]

In [None]:
# Street part of the first multi-household address. Addresses look like
# "<house number>, <street>", so split on the first ", ". The fixed slice [4:] only
# worked for two-digit house numbers.
multiple_heads['Address'].str.split(', ', n=1).str[-1].values[0]

In [None]:
# First 15 residents of the street that holds the first multi-household address.
# The street is taken as everything after the first ", " of the address. The fixed slice
# [4:] only worked for two-digit house numbers.
first_multi_head_street = multiple_heads['Address'].str.split(', ', n=1).str[-1].values[0]
census_df.loc[census_df['Street'] == first_multi_head_street].iloc[:15]

In [None]:
# Number of census rows per address.
census_df.groupby(by=['Address']).size()

In [None]:
# Number of 'Head' rows per address; addresses without any 'Head' row do not appear.
census_df.loc[census_df['Relationship to Head of House'] == 'Head', ['Address', 'Relationship to Head of House']].groupby(by=['Address']).count()

___