In [1]:
#import some common things
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

In [2]:
##################################################
#                            PARAMETERS TO BE SET                                               #
##################################################

# Declare relative directory paths and what they represent
gl_rel_dir = "../python_scraper/geospatialData/"
geospatial_rel_files = {"FA16" : gl_rel_dir+"FA16-geospatial.csv",
                                        "FA17" : gl_rel_dir+"FA17-geospatial.csv",
                                        "SP17" : gl_rel_dir+"SP17-geospatial.csv",
                                        "WI17" : gl_rel_dir+"WI17-geospatial.csv"}
# summer data not used currently
geospatial_sum_files = {"S117" : gl_rel_dir+"S117-geospatial.csv",
                                            "S217" : gl_rel_dir+"S217-geospatial.csv",
                                            "S317" : gl_rel_dir+"S317-geospatial.csv"}

MIN_STU_AVG = 30        # defines minimum average allowable for classrooms to be considered
MIN_CLASS_SIZE = 1   # without this many students minimum, it's not technically a class
IGNORE_LIST = ["RECGM", "NIERM", "MCC"]           # ignore things outside of main UCSD

In [3]:
# Now concatenate them all in a meaningful way
quarters_pd = pd.concat([pd.read_csv(loc) for loc in geospatial_rel_files.values()], 
                                keys = geospatial_rel_files.keys())

# classes with less the MIN students will not be considered
quarters_pd.drop(quarters_pd[quarters_pd.students_number < MIN_CLASS_SIZE].index, inplace=True)
quarters_pd

Unnamed: 0,Unnamed: 1,building_tag,longitude,latitude,day,hour,students_number
FA16,0,YORK,32.874554,-117.240005,M,12,113
FA16,2,SERF,32.879664,-117.235105,Th,13,4
FA16,3,VAUGN,32.865324,-117.252961,F,14,9
FA16,4,BONN,32.875415,-117.240332,Th,11,19
FA16,5,CENTR,32.877773,-117.237262,W,12,1405
FA16,6,SME,32.879890,-117.233139,M,14,46
FA16,7,WLH,32.880558,-117.234319,Tu,19,331
FA16,8,VAUGN,32.865324,-117.252961,Tu,9,10
FA16,9,EBU2,32.881243,-117.233615,W,17,24
FA16,10,CPMC,32.877962,-117.234429,Tu,9,157


In [4]:
# group the data by building
quarters_grouped = quarters_pd.groupby("building_tag")

In [5]:
# now remove the buildings whose average number of students is too low (outliers)
dat_tupled = list()
z_dict = dict()
for groups in quarters_grouped.groups:
    curr_avg = quarters_grouped.get_group(groups).students_number.mean()
    if curr_avg >= MIN_STU_AVG and groups not in IGNORE_LIST:            # if average not above threshold don't use it
        curr_std = quarters_grouped.get_group(groups).students_number.std()
        dat = quarters_grouped.get_group(groups).students_number
        dat_tupled.append((groups, curr_avg, curr_std, dat))
        
        # now evalute z scores and append to dictionary
        z_scores=(quarters_grouped.get_group(groups).students_number-curr_avg)/curr_std
        p_values = scipy.stats.norm.sf(min(z_scores)+max(z_scores))
        z_dict.setdefault(groups, [min(z_scores), max(z_scores), curr_avg, curr_std, p_values  ])
        
# sort resultant data
dat_tupled = sorted(dat_tupled, key=lambda x:x[1], reverse=True)
        
# unzips tuples made above for possible future usage
labels, averages, stdevs, dats= zip(*dat_tupled)

# makes Z value range dictionary neater
z_ranges = pd.DataFrame.from_dict(z_dict, orient='index')
z_ranges.columns = ['Zmin', 'Zmax']
z_ranges

NameError: name 'scipy' is not defined

In [None]:
# customization elements
medianprops = dict(linestyle='-.', linewidth=2.5, color='darkmagenta')
meanlineprops = dict(linestyle='-', linewidth=2.5, color='red')

# Now plot using a common box plot with soft colors
fig1, ax1 = plt.subplots(figsize=(19.5,15))
bplot = ax1.boxplot(dats, vert = True, patch_artist = True, showmeans=True, notch=True,
                    meanline=True, medianprops=medianprops, meanprops=meanlineprops, whis='range')
for bplots in bplot:
    for patch in bplot['boxes']:
        patch.set_facecolor('aliceblue')
    
# sets custom labels and titles and such
ax1.set_xticklabels(labels, rotation=45, ha='right')
ax1.set_title("Distribution of Students Across Various UCSD Buildings\n (During the Normal School Year)", fontsize = 25)
ax1.yaxis.grid(True)
ax1.set_ylabel("# of Students")

# resizes the text in the display
for item in ([ax1.xaxis.label, ax1.yaxis.label] + ax1.get_xticklabels() + ax1.get_yticklabels()):
    item.set_fontsize(18)

# remove spines and set limit
ax1.spines["top"].set_visible(False)   
ax1.spines["right"].set_visible(False)
ax1.spines["left"].set_visible(False)
ax1.set_ylim(0, )
fig1.show()