In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned student_1920 table (comma-separated, the pandas default)
# and preview its first ten rows.
dataf = pd.read_csv("/share/data/school_choice/Data/Cleaned/student_1920.csv")
dataf.head(n=10)

In [None]:
# Mean of the "AALPI Score" column over all rows of the student table.
mean_AALPI = dataf["AALPI Score"].mean()
mean_AALPI

In [None]:
# List every column name available in the student table.
print(dataf.columns)

In [None]:
# Metric 1 for socioeconomic diversity: Representativeness.
# Ensure each school's average SES is within x% of the district average.
# This cell only displays the neighbourhood SES score column used for that metric.
dataf["N'hood SES Score"]

In [None]:
# Non-null value count of every column, per census tract.
dataf.groupby("census_tract").count()

In [None]:
# Metric 2 for socioeconomic diversity: Limited Isolation.
# Ensure that each school's fraction of socioeconomically disadvantaged (SED)
# students is at most x%.
#
# Mean neighbourhood SES score per census tract. The column is selected
# *before* aggregating so that non-numeric columns of the student table are
# never averaged (pandas >= 2.0 raises TypeError for those instead of
# silently dropping them).
dataf.groupby("census_tract")["N'hood SES Score"].mean()

In [None]:
# Non-null value count of every column, per value of the ctip1 flag.
dataf.groupby(["ctip1"]).count()

In [None]:
# Metric 3 for socioeconomic diversity: Exposure.
# Ensure that each school's fraction of socioeconomically disadvantaged students is at least x%.

In [None]:
# Metric 1 for racial diversity: Race and SES.
# Ensure that at most x% of AALPI students are assigned to any school whose fraction of SED students is higher than y%.

In [None]:
# How many rows carry each value of the ctip1 flag.
dataf["ctip1"].value_counts()

In [None]:
# Inspect the raw home-language description column.
dataf["homelang_desc"]

In [None]:
# Row count per distinct home-language description.
dataf["homelang_desc"].value_counts()

In [None]:
# Row count per distinct English-proficiency description.
dataf["englprof_desc"].value_counts()

In [None]:
# Row count per resolved ethnicity category.
group_ethnicity = dataf.resolved_ethnicity.value_counts()
group_ethnicity

In [None]:
# Horizontal bar chart of ethnicity counts, sorted ascending by count.
dataf["resolved_ethnicity"].value_counts().sort_values().plot.barh()

In [None]:
# Load the San Francisco displacement-typology output (comma-separated, the
# pandas default) and preview its first ten rows.
df = pd.read_csv("~/displacement-typologies/data/outputs/typologies/SanFrancisco_typology_output.csv")
df.head(n=10)

In [None]:
# Inspect the GEOID column of the typology table (used later as a merge key).
df["GEOID"]

In [None]:
# Displacement-typology indicator columns (abbreviation -> meaning):
#   SAE   Stable Advanced Exclusive
#   AdvG  Advanced Gentrification
#   ARE   At Risk of Being Exclusive
#   BE    Becoming Exclusive
#   SMMI  Stable Middle/Moderate Income
#   ARG   At Risk of Gentrification
#   EOG   Early Ongoing Gentrification
#   OD    Ongoing Displacement
# `cols` maps each abbreviation to None for now; its keys double as the
# column selection for the typology table.
cols = dict.fromkeys(["SAE", "AdvG", "ARE", "BE", "SMMI", "ARG", "EOG", "OD"])
filtered_df = df[list(cols)]
filtered_df.head(n=20)

In [None]:
# Value distribution of the SAE (Stable Advanced Exclusive) indicator.
filtered_df["SAE"].value_counts()

In [None]:
# Value distribution of the AdvG (Advanced Gentrification) indicator.
filtered_df["AdvG"].value_counts()

In [None]:
# Value distribution of the ARE (At Risk of Being Exclusive) indicator.
filtered_df["ARE"].value_counts()

In [None]:
# Value distribution of the BE (Becoming Exclusive) indicator.
filtered_df["BE"].value_counts()

In [None]:
# Value distribution of the SMMI (Stable Middle/Moderate Income) indicator.
filtered_df["SMMI"].value_counts()

In [None]:
# Value distribution of the ARG (At Risk of Gentrification) indicator.
filtered_df["ARG"].value_counts()

In [None]:
# Value distribution of the EOG (Early Ongoing Gentrification) indicator.
filtered_df["EOG"].value_counts()

In [None]:
# Value distribution of the OD (Ongoing Displacement) indicator.
filtered_df["OD"].value_counts()

In [None]:
# Overwrite each value in `cols` with the number of rows of filtered_df whose
# indicator for that typology equals 1.
for typology in cols:
    flagged_rows = filtered_df.loc[filtered_df[typology] == 1, typology]
    cols[typology] = len(flagged_rows)

In [None]:
# Inspect the census block-group identifier of each row in the student table.
dataf["census_blockgroup"]

In [None]:
# Inspect the census-tract identifier (merge key against the typology GEOID).
dataf["census_tract"]

In [None]:
# Attach the typology indicators to every student row (inner join on tract id).
step1_df = dataf.merge(df, left_on="census_tract", right_on="GEOID")

# Keep only the tract id and indicator columns, one row per distinct combination.
typology_columns = ["GEOID", "SAE", "AdvG", "ARE", "BE", "SMMI", "ARG", "EOG", "OD"]
new_df = step1_df[typology_columns].drop_duplicates()
new_df.head(n=30)

In [None]:
# Number of rows of new_df flagged SAE == 1. Comparing and summing returns 0
# when no row is flagged, whereas `value_counts()[1]` raises KeyError.
int((new_df["SAE"] == 1).sum())

In [None]:
# Number of rows of new_df flagged (== 1) for each typology column.
# Comparing with 1 and summing yields 0 for a typology with no flagged row,
# whereas `value_counts()[1]` raises KeyError in that case.
frequencies = {
    col: int((new_df[col] == 1).sum())
    for col in new_df.columns
    if col != "GEOID"
}

# Fixed display order for the typologies.
col_names = ["OD", "ARG", "EOG", "AdvG", "SMMI", "ARE", "BE", "SAE"]
frequencies_list = [frequencies[col] for col in col_names]

In [None]:
# Show the typology display order.
col_names

In [None]:
# Show the per-typology counts computed from new_df.
frequencies

In [None]:
# Distribution of gentrification statuses across census tracts in SFUSD.
plt.bar(col_names, frequencies_list)

In [None]:
# Restrict the merged table to rows with ctip1 == 1, then keep one row per
# distinct tract id / indicator combination.
# NOTE(review): "ARG" is not in this column list although the all-rows
# version includes it — confirm the omission is intentional.
ctip1_columns = ["GEOID", "SAE", "AdvG", "ARE", "BE", "SMMI", "EOG", "OD"]
ctip1_df = step1_df.loc[step1_df["ctip1"] == 1, ctip1_columns].drop_duplicates()

In [None]:
# Number of rows of ctip1_df flagged (== 1) for each typology column.
# Comparing with 1 and summing yields 0 for a typology with no flagged row,
# whereas `value_counts()[1]` raises KeyError in that case.
new_frequencies = {
    col: int((ctip1_df[col] == 1).sum())
    for col in ctip1_df.columns
    if col != "GEOID"
}

# Fixed display order; a typology missing from ctip1_df is reported as 0.
column_names = ["OD", "ARG", "EOG", "AdvG", "SMMI", "ARE", "BE", "SAE"]
frequency_list = [new_frequencies.get(col, 0) for col in column_names]

In [None]:
# Show the CTIP1 counts in display order.
frequency_list

In [None]:
# Show the per-typology counts computed from ctip1_df.
new_frequencies

In [None]:
# Distribution of gentrification statuses across the census tracts of rows with ctip1 == 1.
# NOTE(review): the plotted counts come from de-duplicated tract rows, so this shows
# tracts rather than students — confirm before claiming where most CTIP1 students live.
plt.bar(column_names, frequency_list)

In [None]:
# TODO: compare proportions for CTIP1 and the rest of the population.
# TODO: also look at CTIP2 regions to see how they compare to CTIP1.

In [None]:
# Legend entries for the overlaid bar chart. The plotted counts are built from
# de-duplicated tract rows, so the series are labelled as tracts, not students.
labels = ["All student tracts", "Tracts with CTIP1 students"]

In [None]:
# Long-form typology names (not used by the plot below).
new_cols = [
    "At Risk of Gentrification",
    "Early Ongoing Gentrification",
    "Ongoing Displacement",
    "Stable Advanced Exclusive",
    "At Risk of Being Exclusive",
    "Becoming Exclusive",
    "Advanced Gentrification",
    "Stable Middle/Moderate Income",
]

# Overlay the two count series on the same axes; the second series is drawn
# on top of the first, and the legend follows the drawing order.
for typology_counts in (frequencies_list, frequency_list):
    plt.bar(column_names, typology_counts)
plt.legend(labels)

In [None]:
#change the axis to percentages 
# also create a map of the gentrification areas 

In [None]:
# Express each entry of frequencies_list as a share of the list total,
# first as a fraction (`normal`) and then as a percentage (`new_normal`).
overall_total = sum(frequencies_list)
normal = [float(count) / overall_total for count in frequencies_list]
new_normal = [share * 100 for share in normal]
new_normal

In [None]:
# Percentage share of each typology. The underlying counts are de-duplicated
# tract rows, so the y axis is labelled as tracts rather than students.
plt.bar(col_names, new_normal)
plt.ylabel("Percentage of Census Tracts")
plt.xlabel("Displacement Typology")

In [None]:
# Express each entry of frequency_list (the CTIP1 counts) as a share of the
# list total, first as a fraction (`ctip1`) and then as a percentage (`new_ctip1`).
ctip1_total = sum(frequency_list)
ctip1 = [float(count) / ctip1_total for count in frequency_list]
new_ctip1 = [share * 100 for share in ctip1]
new_ctip1

In [None]:
# Percentage share of each typology among the tract rows that have ctip1 == 1.
# The underlying counts are de-duplicated tract rows, so the y axis is labelled
# as tracts; a statement about the share of CTIP1 *students* would need
# student-level counts instead.
plt.bar(col_names, new_ctip1)
plt.ylabel("Percentage of Census Tracts with CTIP1 Students")
plt.xlabel("Displacement Typology")

In [None]:
# Typology abbreviations with their rank (in parentheses) in the display order:
#SAE: Stable Advanced Exclusive Typology (8)
#AdvG: Advanced Gentrification Typology (4)
#ARE: At Risk of Being Exclusive Typology (6)
#BE: Becoming Exclusive (7)
#SMMI: Stable Middle/Moderate Income (5)
#ARG: At Risk of Gentrification (2)
#EOG: Early Ongoing Gentrification (3)
#OD: Ongoing Displacement (1)
# Terms placed in ascending order of gentrification / displacement rank:
["OD", "ARG", "EOG", "AdvG", "SMMI", "ARE", "BE", "SAE"]

In [None]:
# Read every sheet of the SF 2010 block workbook; sheet_name=None returns a
# dict that maps sheet name -> DataFrame.
block_df_dict = pd.read_excel(
    "/share/data/school_choice/Data/SF 2010 blks 022119 with field descriptions (1).xlsx",
    sheet_name=None,
    engine="openpyxl",
)

In [None]:
# Field-description sheet with fully empty rows and columns removed.
field_descriptions = (
    block_df_dict["field descriptions"]
    .dropna(axis=0, how="all")
    .dropna(axis=1, how="all")
)

# Rows whose field name mentions CTIP. `na=False` treats missing field names
# as non-matches; without it the mask contains NaN and boolean indexing
# raises ValueError.
# NOTE: this rebinds `new_df`, which an earlier cell uses for the tract
# typology table.
new_df = field_descriptions[field_descriptions["Field Name"].str.contains("CTIP", na=False)]
new_df

In [None]:
# Block-level sheet with fully empty rows and columns removed, followed by a
# look at its CTIP assignment column.
block_database = (
    block_df_dict["block database"]
    .dropna(axis=0, how="all")
    .dropna(axis=1, how="all")
)
block_database["CTIP_2013 assignment"]


In [None]:
# Preview the first 30 rows of the field-description sheet.
field_descriptions.head(30)

In [None]:
# Keep only the tract id and CTIP assignment of every block. (The bare
# `block_database["Tract"]` expression that used to precede this had no
# effect: only a cell's last expression is displayed.)
new_blockdf = block_database[["Tract", "CTIP_2013 assignment"]]
new_blockdf

In [None]:
# Left-join the typology indicators onto each block via its tract id, then
# keep the tract, the CTIP assignment and the indicator columns.
ctip_columns = ["Tract", "CTIP_2013 assignment", "OD", "ARG", "EOG", "AdvG", "SMMI", "ARE", "BE", "SAE"]
ctip_data = new_blockdf.merge(df, how="left", left_on="Tract", right_on="GEOID")[ctip_columns]

In [None]:
# Remove rows, then columns, that contain nothing but missing values.
ctip_data = ctip_data.dropna(axis=0, how="all").dropna(axis=1, how="all")
ctip_data

In [None]:
# One row per distinct combination of values (first occurrence kept).
ctip_data_new = ctip_data.drop_duplicates(keep="first")
ctip_data_new.head(n=30)

In [None]:
# De-duplicated copy of ctip_data (first occurrence kept), shown in full.
ctip_data_final = ctip_data.drop_duplicates(keep="first")
ctip_data_final

In [None]:
# Grouped bar plot of the OD indicator per tract, coloured by CTIP assignment,
# on an explicit 8x8 inch figure. No confidence intervals are drawn.
# NOTE(review): newer seaborn releases deprecate `ci` in favour of
# `errorbar=None` — confirm against the installed version before switching.
od_fig, od_ax = plt.subplots(figsize=(8, 8))
sns.barplot(
    data=ctip_data_final,
    x="Tract",
    y="OD",
    hue="CTIP_2013 assignment",
    ci=None,
    ax=od_ax,
);

In [None]:
# Column-wise totals per CTIP assignment. This rebinds ctip_data_new from the
# de-duplicated rows to the aggregated table.
ctip_data_new = ctip_data_new.groupby(by="CTIP_2013 assignment").sum()
ctip_data_new

In [None]:
# Drop the summed Tract column, leaving only the indicator totals.
ctip_data_removed = ctip_data_new.drop(columns="Tract")
ctip_data_removed

In [None]:
# Heatmap of the per-CTIP-assignment indicator totals.
sns.heatmap(ctip_data_removed)

In [None]:
# Read every sheet of the FRL workbook; sheet_name=None returns a dict that
# maps sheet name -> DataFrame.
updated_FRL = pd.read_excel(
    "/share/data/school_choice/dssg/riyab_FRL_data.xlsx",
    sheet_name=None,
    engine="openpyxl",
)
updated_FRL

In [None]:
# Pick the "Grouped GeoID External" sheet and look at its last ten rows.
grouped_Geoid = updated_FRL["Grouped GeoID External"]
grouped_Geoid.tail(n=10)

In [None]:
# Keep only rows whose "Geoid Group" value is longer than three characters
# when written as a string.
geoid_text_length = grouped_Geoid["Geoid Group"].astype("str").str.len()
grouped_Geoid_filtered = grouped_Geoid[geoid_text_length > 3]
grouped_Geoid_filtered

In [None]:
# Ratio of the 4-year average FRL count to the 4-year average student count,
# per row (a fraction, not multiplied by 100).
percent_FRL = grouped_Geoid_filtered["4YR AVG FRL Count"].div(
    grouped_Geoid_filtered["4YR AVG Student Count"]
)
percent_FRL

In [None]:
# Histogram of the per-row FRL ratio. The plotted values are count ratios
# (fractions), and density=True normalises the bars to a probability density,
# so the axes are labelled accordingly.
plt.hist(percent_FRL, density=True, bins=10, rwidth=.9, edgecolor='red')
plt.xlabel("Fraction of FRL Students per Block")
plt.ylabel("Density")

In [None]:
# Ratio of the 4-year average ethnicity-flag count to the 4-year average
# student count, per row (a fraction, not multiplied by 100).
percent_AALPI = grouped_Geoid_filtered["4YR AVG Eth Flag Count"].div(
    grouped_Geoid_filtered["4YR AVG Student Count"]
)
percent_AALPI

In [None]:
# Histogram of the per-row AALPI ratio. The plotted values are count ratios
# (fractions), and density=True normalises the bars to a probability density,
# so the axes are labelled accordingly.
plt.hist(percent_AALPI, density=True, bins=10, rwidth=.9, edgecolor='red')
plt.xlabel("Fraction of AALPI Students per Block")
plt.ylabel("Density")

In [None]:
# Ratio of the 4-year average combo-flag count to the 4-year average student
# count, per row (a fraction, not multiplied by 100).
percent_both = grouped_Geoid_filtered["4YR AVG Combo Flag Count"].div(
    grouped_Geoid_filtered["4YR AVG Student Count"]
)
percent_both

In [None]:
# Histogram of the per-row combo-flag ratio. The plotted values are count
# ratios (fractions), and density=True normalises the bars to a probability
# density, so the axes are labelled accordingly.
plt.hist(percent_both, density=True, bins=10, rwidth=.9, edgecolor='red')
plt.xlabel("Fraction of AALPI and FRL Students per Block")
plt.ylabel("Density")

In [None]:
# Inspect the Block identifier column (merge key against "Geoid Group").
block_database["Block"]

In [None]:
# Inspect the CTIP assignment recorded for each block.
block_database["CTIP_2013 assignment"]

In [None]:
# Inspect the "Geoid Group" keys that survived the length filter.
grouped_Geoid_filtered["Geoid Group"]

In [None]:
# Read entry 0 of the "Description" column on the "Field Description" sheet.
updated_FRL["Field Description"]["Description"][0]

In [None]:
# Inner-join the filtered FRL rows with the block database, matching
# "Geoid Group" against "Block".
new_merge = grouped_Geoid_filtered.merge(
    block_database,
    left_on="Geoid Group",
    right_on="Block",
)
new_merge

In [None]:
# Mean of every numeric column per CTIP assignment. numeric_only=True skips
# non-numeric columns, which pandas >= 2.0 would otherwise reject with a
# TypeError instead of silently dropping them.
new_merge_grouped = new_merge.groupby("CTIP_2013 assignment").mean(numeric_only=True)
new_merge_grouped

In [None]:
# Narrow the grouped means down to the four 4-year-average count columns.
four_year_count_columns = [
    "4YR AVG Student Count",
    "4YR AVG FRL Count",
    "4YR AVG Eth Flag Count",
    "4YR AVG Combo Flag Count",
]
new_merge_grouped = new_merge_grouped[four_year_count_columns]
new_merge_grouped

In [None]:
# Heatmap of the grouped means with short x tick labels; the labels are listed
# in the same order as the four selected count columns.
x_axis_labels = ["Student", "FRL", "AALPI", "Combo"]
ax = sns.heatmap(new_merge_grouped, xticklabels=x_axis_labels)
ax.set_title("4YR Average of Student Demographic Counts", fontsize=15)