In [1]:
import numpy as np
import pandas as pd

In [2]:
def SetMajor(df):
    df['Q6b'] = df['Q6b_y'].fillna(df['Q6b_x'])
    df['Q6b.i'] = df['Q6b.i_y'].fillna(df['Q6b.i_x'])
    
    conditions = [
        df['Q6b'] < 4,
        df['Q6b.i'] < 5
    ]
    
    output = [
        'Physics', 'Non-physics'
    ]
    
    df['Major'] = np.select(conditions, output, None)
    return df

def SetGender(df):
    conditions = [
        df['Q6e_1_y'] == 1,
        df['Q6e_2_y'] == 1,
        df['Q6e_1_x'] == 1,
        df['Q6e_2_x'] == 1
    ]

    output = [
        'Male', 'Female'
    ]

    df['Gender'] = np.select(conditions, np.tile(output, 2), None)
    return df

def SetURM(df):
    conditions = [
        df['Q6f_5_y'] == 1,
        df['Q6f_1_y'] == 1,
        df['Q6f_7_y'] == 1,
        df['Q6f_3_y'] == 1,
        df['Q6f_4_y'] == 1,
        df['Q6f_2_y'] == 1,
        df['Q6f_6_y'] == 1,    
        df['Q6f_5_x'] == 1,
        df['Q6f_1_x'] == 1,
        df['Q6f_7_x'] == 1,
        df['Q6f_3_x'] == 1,
        df['Q6f_4_x'] == 1,
        df['Q6f_2_x'] == 1,
        df['Q6f_6_x'] == 1,
    ]

    output = ['URM'] * 5 + ['Majority'] * 2

    df['URM_Status'] = np.select(conditions, np.tile(output, 2), None) 
    return df

def SetClassStanding(df):
    conditions = [
        df['Q6a_y'] == 1,
        df['Q6a_y'] > 1,
        df['Q6a_x'] == 1,
        df['Q6a_x'] > 1,
    ]

    output = ['Freshman', 'Beyond-first-year']

    df['Class_Standing'] = np.select(conditions, np.tile(output, 2), None) 
    return df

In [3]:
df = pd.read_csv('Collective_Surveys/Complete/Complete_Concat_CourseInfo.csv')

df = SetMajor(df)
df = SetGender(df)
df = SetURM(df)
df = SetClassStanding(df)
# cols = [c for c in df.columns if 'Q1' in c or 'Q2' in c or 'Q3' in c or 'Q4' in c] + ['Major', 'Gender', 'URM_Status', 
#                                                                                       'Class_Standing', 'Survey_x', 'Survey_y',
#                                                                                       'Class_ID', 'Course_Level', 
#                                                                                       'PreScores', 'PostScores']
# df = df[cols]

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df['Matched'] = df[['Survey_x', 'Survey_y']].transform(lambda x: pd.notnull(x['Survey_x']) * pd.notnull(x['Survey_y']), axis = 1)
df['N_Matched'] = df.groupby('Class_ID')['Matched'].transform(np.sum)

df['PreOnly'] = df[['Survey_x', 'Survey_y']].transform(lambda x: pd.notnull(x['Survey_x']) * pd.isnull(x['Survey_y']), axis = 1)
df['N_PreOnly'] = df.groupby('Class_ID')['PreOnly'].transform(np.sum)

df['PostOnly'] = df[['Survey_x', 'Survey_y']].transform(lambda x: pd.isnull(x['Survey_x']) * pd.notnull(x['Survey_y']), axis = 1)
df['N_PostOnly'] = df.groupby('Class_ID')['PostOnly'].transform(np.sum)

df['Download_Available'] = df[['N_PreOnly', 'N_PostOnly', 'N_Matched']].transform(lambda x: (x['N_PreOnly'] != 1) * 
                                                                                  (x['N_PostOnly'] != 1) * 
                                                                                  (x['N_Matched'] != 1), axis = 1)

df['Matched_Available'] = df['N_Matched'] > 1
df['Valid_Available'] = (df['N_PreOnly'] + df['N_Matched'] != 1) & (df['N_PostOnly'] + df['N_Matched'] != 1)
df = df.drop(columns = ['PreOnly', 'PostOnly', 'Matched', 'N_PreOnly', 'N_PostOnly', 'N_Matched'])

df_preOnly = df.loc[pd.notnull(df['Survey_x']) & pd.isnull(df['Survey_y']), :]
df_postOnly = df.loc[pd.isnull(df['Survey_x']) & pd.notnull(df['Survey_y']), :]
df_matchOnly = df.loc[pd.notnull(df['Survey_x']) & pd.notnull(df['Survey_y']), :]

len(df_preOnly) + len(df_postOnly) + len(df_matchOnly) == len(df)

True

In [5]:
def CrossTab(df):
    df_tab = df.loc[:, ['Class_ID', 'Gender', 'URM_Status', 'Major', 'Class_Standing']]

    tab = pd.crosstab(df_tab['Class_ID'], [df_tab['Gender'], df_tab['URM_Status'], df_tab['Major'], df_tab['Class_Standing']])
    tab['Class_Standing_Available'] = [False if 1 in row else True for row in tab.values.tolist()]
    tab['Major_Available'] = [False if 1 in row else True for row in tab.values.tolist()]
    tab['URM_Available'] = [False if 1 in row else True for row in tab.values.tolist()]
    tab['Gender_Available'] = [False if 1 in row else True for row in tab.values.tolist()]

    for ID, Row in tab.iterrows():
        if((Row['Class_Standing_Available'] * Row['Major_Available'] * Row['URM_Available'] * 
            Row['Gender_Available']).values[0]):
            continue
        if(not Row['Class_Standing_Available'].values[0]):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            tab_class = pd.crosstab(df_class['Gender'], [df_class['URM_Status'], df_class['Major']])
            if(1 not in tab_class.values):
                tab.loc[ID, ['Major_Available', 'URM_Available', 'Gender_Available']] = True
                continue
        if(not Row['Major_Available'].values[0]):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            tab_class = pd.crosstab(df_class['Gender'], [df_class['URM_Status'], df_class['Class_Standing']])
            if(1 not in tab_class.values):
                tab.loc[ID, ['Class_Standing_Available', 'URM_Available', 'Gender_Available']] = True
                continue
        if(not Row['URM_Available'].values[0]):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            tab_class = pd.crosstab(df_class['Gender'], [df_class['Major'], df_class['Class_Standing']])
            if(1 not in tab_class.values):
                tab.loc[ID, ['Class_Standing_Available', 'Major_Available', 'Gender_Available']] = True
                continue
        elif(not Row['Gender_Available'].values[0]):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            tab_class = pd.crosstab(df_class['URM_Status'], [df_class['Major_Status'], df_class['Class_Standing']])
            if(1 not in tab_class.values):
                tab.loc[ID, ['Class_Standing_Available', 'Major_Available', 'URM_Available']] = True
                continue
        if((not Row['Class_Standing_Available'].values[0]) * (not Row['Major_Available'].values[0])):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            tab_class = pd.crosstab(df_class['Gender'], df_class['URM_Status'])
            if(1 not in tab_class.values):
                tab.loc[ID, ['URM_Available', 'Gender_Available']] = True
                continue
        if((not Row['Class_Standing_Available'].values[0]) * (not Row['URM_Available'].values[0])):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            tab_class = pd.crosstab(df_class['Gender'], df_class['Major'])
            if(1 not in tab_class.values):
                tab.loc[ID, ['Major_Available', 'Gender_Available']] = True
                continue
        elif((not Row['Class_Standing_Available'].values[0]) * (not Row['Gender_Available'].values[0])):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            tab_class = pd.crosstab(df_class['URM_Status'], df_class['Major'])
            if(1 not in tab_class.values):
                tab.loc[ID, ['Major_Available', 'URM_Available']] = True
                continue
        if((not Row['Major_Available'].values[0]) * (not Row['URM_Available'].values[0])):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            tab_class = pd.crosstab(df_class['Gender'], df_class['Class_Standing'])
            if(1 not in tab_class.values):
                tab.loc[ID, ['Class_Standing_Available', 'Gender_Available']] = True
                continue
        if((not Row['Major_Available'].values[0]) * (not Row['Gender_Available'].values[0])):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            tab_class = pd.crosstab(df_class['URM_Status'], df_class['Class_Standing'])
            if(1 not in tab_class.values):
                tab.loc[ID, ['Class_Standing_Available', 'URM_Available']] = True
                continue
        if((not Row['URM_Available'].values[0]) * (not Row['Gender_Available'].values[0])):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            tab_class = pd.crosstab(df_class['Major'], df_class['Class_Standing'])
            if(1 not in tab_class.values):
                tab.loc[ID, ['Class_Standing_Available', 'Major_Available']] = True
                continue
        if((not Row['Class_Standing_Available'].values[0]) * (not Row['Major_Available'].values[0]) * 
             (not Row['URM_Available'].values[0])):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            if(1 not in df_class['Gender'].value_counts().values):
                tab.loc[ID, 'Gender_Available'] = True
                continue
        if((not Row['Class_Standing_Available'].values[0]) * (not Row['Major_Available'].values[0]) * 
             (not Row['Gender_Available'].values[0])):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            if(1 not in df_class['URM_Status'].value_counts().values):
                tab.loc[ID, 'URM_Available'] = True
                continue
        if((not Row['Class_Standing_Available'].values[0]) * (not Row['URM_Available'].values[0]) * 
             (not Row['Gender_Available'].values[0])):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            if(1 not in df_class['Major'].value_counts().values):
                tab.loc[ID, 'Major_Available'] = True
                continue
        if((not Row['Major_Available'].values[0]) * (not Row['URM_Available'].values[0]) * 
             (not Row['Gender_Available'].values[0])):
            df_class = df_tab.loc[df['Class_ID'] == ID, :]
            if(1 not in df_class['Class_Standing'].value_counts().values):
                tab.loc[ID, 'Class_Standing_Available'] = True
                continue

    tab = tab[['Class_Standing_Available', 'Major_Available', 'URM_Available', 'Gender_Available']].reset_index()
    tab.columns = tab.columns.droplevel(level = [1, 2,3])
    df_out = df.merge(tab, how = 'left', on = 'Class_ID')
    return(df_out)

In [6]:
df_preOnlyTab = CrossTab(df_preOnly)
df_postOnlyTab = CrossTab(df_postOnly)
df_matchTab = CrossTab(df_matchOnly)
df_Tab = pd.concat([df_preOnlyTab, df_postOnlyTab, df_matchTab], axis = 0, join = 'outer').reset_index(drop = True)

df_Tab['Gender_Available_Download'] = (df_Tab.groupby('Class_ID')['Gender_Available'].transform('nunique') == 1) & \
(df_Tab['Gender_Available'] == True)
df_Tab['URM_Available_Download'] = (df_Tab.groupby('Class_ID')['URM_Available'].transform('nunique') == 1) & \
(df_Tab['URM_Available'] == True)
df_Tab['Major_Available_Download'] = (df_Tab.groupby('Class_ID')['Major_Available'].transform('nunique') == 1) & \
(df_Tab['Major_Available'] == True)
df_Tab['Class_Standing_Available_Download'] = (df_Tab.groupby('Class_ID')['Class_Standing_Available'].transform('nunique') == 1) & \
(df_Tab['Class_Standing_Available'] == True)

df_Tab = df_Tab.drop(columns = ['Gender_Available', 'URM_Available', 'Major_Available', 'Class_Standing_Available'])

In [13]:
def CountValues(df):

    df_Tab = pd.crosstab(df['Class_ID'], df['Gender'])
    df_Tab['Gender_Available'] = [False if 1 in v else True for v in df_Tab.values.tolist()]

    df_Tab_dummy = pd.crosstab(df['Class_ID'], df['URM_Status'])
    df_Tab_dummy['URM_Available'] = [False if 1 in v else True for v in df_Tab_dummy.values.tolist()]
    df_Tab = df_Tab.merge(df_Tab_dummy, how = 'left', on = 'Class_ID')

    df_Tab_dummy = pd.crosstab(df['Class_ID'], df['Major'])
    df_Tab_dummy['Major_Available'] = [False if 1 in v else True for v in df_Tab_dummy.values.tolist()]
    df_Tab = df_Tab.merge(df_Tab_dummy, how = 'left', on = 'Class_ID')

    df_Tab_dummy = pd.crosstab(df['Class_ID'], df['Class_Standing'])
    df_Tab_dummy['Class_Standing_Available'] = [False if 1 in v else True for v in df_Tab_dummy.values.tolist()]
    df_Tab = df_Tab.merge(df_Tab_dummy, how = 'left', on = 'Class_ID')

    print(df_Tab)
    df_Tab = df_Tab[['Gender_Available', 'URM_Available', 'Major_Available', 'Class_Standing_Available']].reset_index()
    return df_Tab
    

In [14]:
df_pre = df_Tab.loc[pd.notnull(df_Tab['Survey_x']), :]
df_Final = df_Tab.merge(CountValues(df_pre), how ='left', 
                        on = 'Class_ID').rename(columns = {'Gender_Available':'Gender_Available_Pre', 
                                                           'URM_Available':'URM_Available_Pre',
                                                           'Major_Available':'Major_Available_Pre',
                                                           'Class_Standing_Available':'Class_Standing_Available_Pre'})

df_post = df_Tab.loc[pd.notnull(df_Tab['Survey_y']), :]
df_Final = df_Final.merge(CountValues(df_post), how ='left',
                        on = 'Class_ID').rename(columns = {'Gender_Available':'Gender_Available_Post', 
                                                           'URM_Available':'URM_Available_Post',
                                                           'Major_Available':'Major_Available_Post',
                                                           'Class_Standing_Available':'Class_Standing_Available_Post'})

df_Final = df_Final.merge(CountValues(df_matchOnly), how ='left',
                        on = 'Class_ID').rename(columns = {'Gender_Available':'Gender_Available_Match', 
                                                           'URM_Available':'URM_Available_Match',
                                                           'Major_Available':'Major_Available_Match',
                                                           'Class_Standing_Available':'Class_Standing_Available_Match'})

df_Final['Gender_Available_Valid'] = df_Final['Gender_Available_Pre'].fillna(True) & df_Final['Gender_Available_Post'].fillna(True)
df_Final['URM_Available_Valid'] = df_Final['URM_Available_Pre'].fillna(True) & df_Final['URM_Available_Post'].fillna(True)
df_Final['Major_Available_Valid'] = df_Final['Major_Available_Pre'].fillna(True) & df_Final['Major_Available_Post'].fillna(True)
df_Final['Class_Standing_Available_Valid'] = df_Final['Class_Standing_Available_Pre'].fillna(True) & df_Final['Class_Standing_Available_Post'].fillna(True)
df_Final = df_Final.drop(columns = ['Gender_Available_Pre', 'Gender_Available_Post', 'URM_Available_Pre', 'URM_Available_Post', 
                                    'Major_Available_Pre', 'Major_Available_Post', 'Class_Standing_Available_Pre', 
                                    'Class_Standing_Available_Post'])

                   Female  Male  Gender_Available  Majority  URM  \
Class_ID                                                           
R_12QFe4VQPh6oNW1       5    15              True        16    4   
R_12t10i0liTKxQuH      12    15              True        13   14   
R_1DA4MjYI6kJcpRW       4    19              True        20    3   
R_1E6g9AAGCZcNC4z       7    10              True        15    1   
R_1IB300CxBKh0Tw7      28    62              True        72   16   
R_1LHvn3R5Afj8eUc      70    46              True        87   24   
R_1Lv8h75V2mtHdj7       3    17              True        19    1   
R_1Oko8BpPfb9rt0G      18    79              True        87    9   
R_1PRvAep4UEZKmoi       3    14              True        12    5   
R_1QDH836nn3XsCEC       3    13              True        15    1   
R_1dHrngC5fnItWQj       1    17             False        18    0   
R_1eRcOhZpre7mhcC      35    54              True        69   15   
R_1gU9RGAQf2fpKPq       1     9             Fals

                   Female  Male  Gender_Available  Majority  URM  \
Class_ID                                                           
R_12QFe4VQPh6oNW1       3     4              True         6    1   
R_12t10i0liTKxQuH      28    24              True        25   27   
R_1DA4MjYI6kJcpRW       5    18              True        21    2   
R_1E6g9AAGCZcNC4z       2     2              True         4    0   
R_1IB300CxBKh0Tw7      24    54              True        64   12   
R_1LHvn3R5Afj8eUc      52    36              True        66   20   
R_1Lv8h75V2mtHdj7       2    11              True        13    0   
R_1OMRnfQCygrVfk1      41   108              True       138    8   
R_1Oko8BpPfb9rt0G      15    75              True        81    8   
R_1PRvAep4UEZKmoi       1    14             False        11    3   
R_1QDH836nn3XsCEC       1    23             False        22    2   
R_1dHrngC5fnItWQj       1    13             False        14    0   
R_1eRcOhZpre7mhcC      19    25              Tru

                   Female  Male  Gender_Available  Majority  URM  \
Class_ID                                                           
R_12QFe4VQPh6oNW1       3     4              True         6    1   
R_12t10i0liTKxQuH      10    12              True        10   12   
R_1DA4MjYI6kJcpRW       2    14              True        15    1   
R_1E6g9AAGCZcNC4z       2     2              True         4    0   
R_1IB300CxBKh0Tw7      23    48              True        57   12   
R_1LHvn3R5Afj8eUc      48    34              True        62   18   
R_1Lv8h75V2mtHdj7       2    10              True        12    0   
R_1Oko8BpPfb9rt0G      14    70              True        76    7   
R_1PRvAep4UEZKmoi       1    12             False        10    3   
R_1QDH836nn3XsCEC       1     9             False         9    1   
R_1dHrngC5fnItWQj       1    10             False        11    0   
R_1eRcOhZpre7mhcC      10    14              True        19    4   
R_1gU9RGAQf2fpKPq       1     7             Fals

In [15]:
df_Final.to_csv('Collective_Surveys/Complete/Complete_Concat_DeIdentified.csv', index = False)