In [None]:
import pandas
import numpy

import matplotlib.pyplot as plt

from scipy import stats


In [None]:
# Load the sample and keep only the two columns this analysis uses.
df = pandas.read_csv(
    "https://course-resources.minerva.edu/uploaded_files/mu/00294347-0809/sf-sea-level-rise-sample.csv",
)
df = df[["NonWhite", "Education"]]

# Blocks with NonWhite at or above 0.5 form the non-white-majority group ...
non_white = df.loc[df["NonWhite"] >= 0.5]
# ... and blocks strictly below 0.5 form the white-majority group.
white = df.loc[df["NonWhite"] < 0.5]

# Education column of each group, used by the tests and intervals further down.
non_white_education = non_white["Education"]
white_education = white["Education"]

In [None]:
# Summary statistics for both columns over every block in the sample.
print("Whole dataset", df.describe(), sep="\n")


In [None]:
# Summary statistics restricted to the white-majority blocks.
print("White", white.describe(), sep="\n")


In [None]:
# Summary statistics restricted to the non-white-majority blocks.
print("Non-white", non_white.describe(), sep="\n")


In [None]:
# Medians of both columns over the whole sample (describe() labels this row "50%").
whole_medians = (df["NonWhite"].median(), df["Education"].median())
print("NonWhite median, Education median for whole dataset")
print(*whole_medians)


In [None]:
# Medians of both columns within the white-majority blocks.
white_medians = (white["NonWhite"].median(), white_education.median())
print("NonWhite median, Education median for white")
print(*white_medians)


In [None]:
# Medians of both columns within the non-white-majority blocks.
# The group is defined as NonWhite >= 0.5, so it is labelled "non-white" to match
# the group it actually describes and the wording used by the other cells.
print("NonWhite median, Education median for non-white")
print(non_white["NonWhite"].median(), non_white_education.median())


In [None]:
# Frequency distribution of the NonWhite column across the sampled census blocks.
df.hist(column="NonWhite", bins=30)

# The red line sits at 0.5, the same cut-off used to split the sample: blocks to the
# left are white-majority (NonWhite < 0.5), blocks on or to the right of it are
# non-white-majority (NonWhite >= 0.5).
plt.axvline(0.5, color="r")
plt.title("Frequency Distribution of NonWhite Census Blocks")
# NonWhite is handled as a proportion throughout (the split and this line are at 0.5),
# so the axis is labelled as a proportion rather than a percentage.
plt.xlabel("Proportion of Non-White People")
plt.ylabel("Frequency")
plt.show()


In [None]:
# Overlay the Education distributions of the two groups on one axis.
bins = 15

# Both histograms must share the same bin edges to be comparable bar-for-bar.
# Passing a per-group bin *count* lets matplotlib spread the bins over each group's
# own min/max, which gives the groups different bin widths and offsets (and can
# truncate to zero bins for a narrow group). Instead, compute one set of equal-width
# edges over the pooled values and hand the same edges to both calls.
pooled_education = pandas.concat([white["Education"], non_white["Education"]]).dropna()
bin_edges = numpy.histogram_bin_edges(pooled_education, bins=bins)

plt.hist(
    non_white["Education"],
    bins=bin_edges,
    edgecolor="white",
    color="blue",
    label="Non-White Majority",
    alpha=0.7,
)
plt.hist(
    white["Education"],
    bins=bin_edges,
    edgecolor="white",
    color="green",
    label="White Majority",
    alpha=0.7,
)
plt.xlabel("Percentage of Adults over age 25 with High School Diplomas")
plt.ylabel("Frequency")
plt.title("Proportion of High School Graduates Across Census Blocks of Varying Racial Diversity")
plt.legend()
plt.show()

In [None]:
def difference_of_means_test(control, treatment, tails=1):
    """Run a difference-of-means t-test and compute Cohen's d for two samples.

    Parameters
    ----------
    control, treatment : sized collections of numbers
        The two samples to compare; anything accepted by ``len``,
        ``numpy.mean`` and ``numpy.std``.
    tails : int, default 1
        Multiplier applied to the single-tail probability (1 for a one-tailed
        test, 2 for a two-tailed test).

    Returns
    -------
    tuple
        ``(p_value, cohen_d)``. The p-value is based on the absolute
        difference of the means; Cohen's d keeps the sign of
        ``mean(treatment) - mean(control)``.
    """
    size_control, size_treatment = len(control), len(treatment)
    mean_control = numpy.mean(control)
    mean_treatment = numpy.mean(treatment)

    # Sample standard deviations (Bessel's correction: n - 1 in the denominator).
    sd_control = numpy.std(control, ddof=1)
    sd_treatment = numpy.std(treatment, ddof=1)

    # Conservative degrees of freedom (OpenIntro): smaller sample size minus one.
    degrees_of_freedom = min(size_control, size_treatment) - 1

    # Standard error of the difference between the two sample means.
    standard_error = numpy.sqrt(
        sd_control**2 / size_control + sd_treatment**2 / size_treatment
    )
    t_statistic = numpy.abs((mean_treatment - mean_control)) / standard_error

    # Lower-tail area below -|t|, scaled by the requested number of tails.
    p_value = tails * stats.t.cdf(-t_statistic, degrees_of_freedom)

    # Effect size: mean difference over the pooled SD (OpenIntro section 5.3.6).
    pooled_variance = (
        sd_control**2 * (size_control - 1) + sd_treatment**2 * (size_treatment - 1)
    ) / (size_control + size_treatment - 2)
    cohen_d = (mean_treatment - mean_control) / numpy.sqrt(pooled_variance)

    return p_value, cohen_d


# Builds the lower and upper bound of a normal-approximation confidence interval.
def confidence_interval(point_estimate, SE, conf_level):
    """Return ``(lowbound, highbound)`` centred on ``point_estimate``.

    ``SE`` is the standard error of the estimate and ``conf_level`` the
    confidence level as a fraction (e.g. 0.95). The half-width is the
    standard-normal critical value times ``SE``.
    """
    # Probability left in each tail outside the interval.
    tail_area = (1 - conf_level) / 2
    # Two-sided critical value, e.g. about 1.96 for a 95% confidence level.
    critical_z = stats.norm.ppf(1 - tail_area)
    margin = critical_z * SE
    return point_estimate - margin, point_estimate + margin


In [None]:
# Two-tailed comparison of Education between the white-majority blocks (control)
# and the non-white-majority blocks (treatment).
p_value, cohen_d = difference_of_means_test(
    control=white_education,
    treatment=non_white_education,
    tails=2,
)
print(f"p-value: {p_value}, Cohen's d: {cohen_d}")


In [None]:
# 95% confidence interval for the mean Education value over the whole sample.
# The standard error uses the sample SD (ddof=1); numpy.std defaults to ddof=0,
# the population SD, which understates the standard error.
whole_se = numpy.std(df["Education"], ddof=1) / numpy.sqrt(len(df))
print(
    f'Whole: CI 95%: {confidence_interval(numpy.mean(df["Education"]), whole_se, 0.95)}'
)


In [None]:
# 95% confidence interval for the mean Education value in white-majority blocks.
# The standard error uses the sample SD (ddof=1); numpy.std defaults to ddof=0,
# the population SD, which understates the standard error.
white_se = numpy.std(white_education, ddof=1) / numpy.sqrt(len(white))
print(
    f"White: CI 95%: {confidence_interval(numpy.mean(white_education), white_se, 0.95)}"
)


In [None]:
# 95% confidence interval for the mean Education value in non-white-majority blocks.
# The standard error uses the sample SD (ddof=1); numpy.std defaults to ddof=0,
# the population SD, which understates the standard error.
non_white_se = numpy.std(non_white_education, ddof=1) / numpy.sqrt(len(non_white))
print(
    f"Non-white: CI 95%: {confidence_interval(numpy.mean(non_white_education), non_white_se, 0.95)}"
)
