In [None]:
import pandas as pd
import plotly.express as px

In [None]:
# Load the raw survey export; the path is relative to the notebook's working directory.
df_orig = pd.read_csv('./servey-1.csv')

In [None]:
# Print a summary of the raw frame (columns, non-null counts, dtypes) before any cleaning.
df_orig.info()

In [None]:
def pipe_rename(df: pd.DataFrame) -> pd.DataFrame:
    """Return a renamed copy of ``df`` with short, code-friendly column names.

    The full survey question texts used as column headers are mapped to
    snake_case identifiers. Columns whose header is not one of the known
    question texts keep their original name. The input frame is not modified.
    """
    question_to_column = {
        "Are you a student?": "education_level",
        "What's the name of your school? (Ex. University of Maryland)": "school_name",
        "How many teachers / professors do you have?": "num_professors",
        "How many of your teachers / professors record their lectures for students?": "num_professors_record",
        "For the professors who record lectures (if any), how often do they record?": "record_interval",
        "How often do you use websites to speed up your work as a student (ex. easybib or sparknotes)?": "online_assistance_freq",
        "(Optional) Do you have any diagnosed learning disabilities?": "has_disability",
        "(Optional) Add any additional comments below.": "comments",
    }
    return df.rename(columns=question_to_column)

def pipe_standard_schools(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the ``school_name`` column normalised.

    Known spellings of Grove City College (matched case-insensitively after
    trimming whitespace) are collapsed to "Grove City College". Every other
    string is trimmed and title-cased. Non-string entries (e.g. a missing
    answer stored as None/NaN) are passed through unchanged instead of
    raising ``AttributeError``.

    The input frame is not modified.
    """
    grove_city_aliases = {'gcc', 'grove city college', 'grove', 'grove city'}

    def normalise(name: object) -> object:
        # Missing answers are not strings; leave them as they are.
        if not isinstance(name, str):
            return name
        trimmed = name.strip()
        if trimmed.lower() in grove_city_aliases:
            return "Grove City College"
        return trimmed.title()

    df_c = df.copy(deep=True)
    df_c['school_name'] = df_c['school_name'].apply(normalise)
    return df_c

def pipe_professor_record(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``num_professors_record`` as an integer count.

    An answer of "All" is replaced by that row's ``num_professors``. The
    column is then cast to float, missing values become 0, and the result is
    cast to int. The input frame is not modified.
    """
    result = df.copy(deep=True)
    # Resolve the "All" answer row by row against that row's professor total.
    resolved = [
        total if recorded == "All" else recorded
        for total, recorded in zip(result['num_professors'], result['num_professors_record'])
    ]
    counts = pd.Series(resolved, index=result.index, dtype=object)
    result['num_professors_record'] = counts.astype(float).fillna(0).astype(int)
    return result

def pipe_professor_record_ratio(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``professor_record_ratio`` column.

    The ratio is ``num_professors_record`` divided element-wise by
    ``num_professors``. An existing ``professor_record_ratio`` column is
    overwritten. There is no guard against ``num_professors`` being 0.
    The input frame is not modified.
    """
    result = df.copy(deep=True)
    recording = result['num_professors_record']
    total = result['num_professors']
    result['professor_record_ratio'] = recording.div(total)
    return result

def pipe_record_interval_ratio(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``likert_record_interval`` column added.

    Missing ``record_interval`` answers are replaced by the empty string.
    The Likert labels are a fixed, positional list: entry i is assigned to
    row i, so the frame must have exactly 41 rows in the order the labels
    were written for. ``None`` marks rows without a label.
    The input frame is not modified.
    """
    # NOTE(review): presumably hand-coded from the free-text record_interval
    # answers of the original survey file; confirm before reusing on other data.
    labels_by_row = [
        None, 'Rarely', 'Always', 'Always', 'Sometimes',
        'Always', 'Rarely', 'Always', 'Always', None,
        'Never', 'Rarely', 'Sometimes', 'Rarely', 'Rarely',
        None, 'Rarely', 'Often', 'Always', None,
        None, None, None, 'Always', None,
        'Always', 'Always', 'Often', 'Rarely', None,
        None, 'Rarely', 'Rarely', None, None,
        'Always', 'Sometimes', 'Rarely', 'Rarely', None,
        'Always',
    ]
    result = df.copy(deep=True)
    result['record_interval'] = result['record_interval'].fillna("")
    result['likert_record_interval'] = labels_by_row
    return result

def fix_num_professors_record(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with impossible ``num_professors_record`` values corrected.

    A row is considered impossible when ``professor_record_ratio`` is greater
    than 1, i.e. more professors are reported as recording than the student
    has. Such rows are set to 0 when ``likert_record_interval`` is "never"
    (case-insensitive) and capped at ``num_professors`` otherwise. A missing
    Likert label (None/NaN) is treated as "not never" and therefore capped,
    instead of raising ``AttributeError``.

    ``professor_record_ratio`` itself is not recomputed here.
    The input frame is not modified.

    Raises:
        KeyError: if ``professor_record_ratio`` or ``likert_record_interval``
            is not a column of ``df``.
    """
    required = ('professor_record_ratio', 'likert_record_interval')
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError("Missing required fields {}".format(', '.join(missing)))

    def corrected_count(record: pd.Series) -> int:
        # You can't have more professors recording lectures than you have professors.
        if not record['professor_record_ratio'] > 1:
            return record['num_professors_record']
        label = record['likert_record_interval']
        # Only an explicit "never" zeroes the count; unlabeled rows are capped.
        if isinstance(label, str) and label.lower() == 'never':
            return 0
        return record['num_professors']

    df_c = df.copy(deep=True)
    df_c['num_professors_record'] = df_c.apply(corrected_count, axis=1)
    return df_c

In [None]:
# Summary of the raw frame again; the function definitions above do not modify df_orig.
df_orig.info()

In [None]:
# Build the cleaned frame by chaining the cleaning steps on the raw survey data.
# Each step returns a new frame, so df_orig is left untouched.
df = (
    df_orig.pipe(pipe_rename)
        .pipe(pipe_standard_schools)
        .pipe(pipe_professor_record)
        .pipe(pipe_record_interval_ratio)
        # First ratio pass: fix_num_professors_record requires professor_record_ratio to exist.
        .pipe(pipe_professor_record_ratio)
        .pipe(fix_num_professors_record)
        # Second ratio pass: recompute the ratio from the corrected num_professors_record.
        .pipe(pipe_professor_record_ratio)
)
# Summary of the cleaned frame, including the derived columns.
df.info()

In [None]:
# Pie chart with one slice per likert_record_interval category.
fig = px.pie(df, 'likert_record_interval', title="How often do professors record their lectures?", hover_data=['likert_record_interval'])
# Draw the percentage and the category label inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [None]:
# Histogram of the school_name column; the figure is the cell's last expression.
px.histogram(df, x='school_name', title="School name")

In [None]:
# Bucket professor_record_ratio into labelled bands. pd.cut uses right-closed
# intervals by default, so the edges give (-1, 0], (0, 0.2], ..., (0.8, 1];
# the first band therefore holds a ratio of exactly 0.
bins = [-1, 0, 0.20, 0.40, 0.6, 0.8, 1]
# One label per interval between consecutive edges (6 labels for 7 edges).
labels = ['0%', '1%-20%', '20%-40%', '40%-60%', '60%-80%', '80%-100%']
df['professor_record_bins'] = pd.cut(df['professor_record_ratio'], bins=bins, labels=labels)

The graph below is based on the ratio of the number of professors each student said record their lectures to the number of professors each student said they have.
Thus, if you have only one professor and they record their lectures, your professor_record_ratio is 1.0 (or 100%). If you have 5 professors and 3 of them record, your ratio is 0.6 (or 60%).

In [None]:
# Histogram of students per professor_record_bins band. The x axis is ordered
# by category name ('category ascending'), not by the bin edges.
px.histogram(df, x='professor_record_bins', 
             title='Number of Students that Say X Percent of Their Professors Record', 
             labels={'professor_record_bins': 'Percent of professors that record lectures'}).update_xaxes(categoryorder='category ascending')

In [None]:
# Pooled share across all responses: total professors reported as recording
# divided by total professors reported. Series.sum() is used directly because
# passing the builtin `sum` to Series.agg is deprecated in recent pandas.
ppr = df['num_professors_record'].sum() / df['num_professors'].sum()
# The `0.4` format spec prints the percentage with 4 significant digits.
print(f"The percent of professors who record at all based on the survey is {ppr*100:0.4}%")