In [2]:
import pandas as pd
import statsmodels.api as sm
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

In [3]:
def did2(df_parent, lower, upper, final_grade, initial_grade, dummy_col,
         treat_total, treat_normalized_new, num_L, num_U, return_model=False):
    """
    Run a difference-in-differences (DID) regression for one attendance band.

    The function proceeds in these steps:

    1. Flag the treatment group: every row of ``df_parent`` whose
       ``treat_total`` value lies in ``[num_L, num_U)`` gets a 1 in the column
       ``treat_normalized_new``; every other row gets a 0.
    2. Keep only rows with ``lower < df_parent[dummy_col] <= upper``.
    3. Keep only control rows (``treat_total == 0``) and the newly flagged
       treatment rows, so the control group is the same no matter which
       attendance band is chosen.
    4. Stack the final grades on top of the initial grades in one ``Grades``
       column, with ``PrePost`` (1 = final, 0 = initial), ``T/C`` (treatment
       flag) and ``DID`` (``T/C * PrePost``, the interaction term).
    5. Fit ``Grades ~ const + PrePost + T/C + DID`` by OLS with statsmodels.
       The coefficient and p-value of ``DID`` are shown in the printed summary.

    Requires ``import pandas as pd`` and ``import statsmodels.api as sm``.

    Parameters
    ----------
    df_parent : pandas.DataFrame
        Source data. NOTE: modified in place -- the column
        ``treat_normalized_new`` is created or overwritten.
    lower, upper : number
        Exclusive lower / inclusive upper bound applied to ``dummy_col``.
    final_grade, initial_grade : str
        Names of the columns holding the post- and pre-period grades.
    dummy_col : str
        Name of the column being controlled for (e.g. the initial grade when
        controlling for grades, or an attendance/video-watching measure).
    treat_total : str
        Name of the column with the attendance count; 0 marks the control group.
    treat_normalized_new : str
        Name of the 0/1 treatment-flag column written to ``df_parent``.
    num_L, num_U : number
        Treatment band: rows with ``num_L <= treat_total < num_U`` are treated.
        For a single attendance count ``a`` pass ``num_L=a, num_U=a + 1``.
    return_model : bool, default False
        If True, return the fitted statsmodels results object so that the DID
        coefficient and p-value can be read programmatically
        (``results.params['DID']``, ``results.pvalues['DID']``).

    Returns
    -------
    tuple or statsmodels results
        By default ``(None, None, None)``; this legacy return shape is kept so
        existing callers keep working. With ``return_model=True`` the fitted
        OLS results object is returned instead.

    Raises
    ------
    ValueError
        If no rows remain after the grade-range and treatment/control filters.
    """
    ## STEP 1: FLAG THE TREATMENT GROUP (num_L <= attendance < num_U)
    # Read the attendance from the frame and column that were passed in rather
    # than from a notebook global, and assign the whole column at once instead
    # of using chained indexing (which does not reliably write back).
    attendance = df_parent[treat_total]
    in_band = (attendance >= num_L) & (attendance < num_U)
    df_parent[treat_normalized_new] = in_band.astype(int)

    ## STEP 2: CHOOSE THE RANGE OF THE CONTROLLED-FOR COLUMN
    # When controlling for video watching instead of grades, the control rows
    # (dummy_col == 0) would also have to be kept by this filter.
    in_range = (df_parent[dummy_col] > lower) & (df_parent[dummy_col] <= upper)
    subset = df_parent[in_range]

    ## STEP 3: SAME CONTROL GROUP EVERY TIME, PLUS THE NEW TREATMENT GROUP
    keep = (subset[treat_total] == 0) | (subset[treat_normalized_new] == 1)
    subset = subset[keep]

    ## STEP 4: EXTRACT ONLY THE THREE COLUMNS NEEDED FOR DID
    subset = subset.loc[:, [final_grade, initial_grade, treat_normalized_new]]
    subset = subset.reset_index(drop=True)

    n_students = len(subset)
    if n_students == 0:
        raise ValueError(
            'No observations left after filtering; check lower/upper and num_L/num_U.'
        )

    ## STEPS 5-9: BUILD THE LONG-FORM FRAME (final grades on top, initial below)
    # Columns are taken by position so this also works when two of the column
    # names coincide. pd.concat replaces Series.append (removed in pandas 2.0).
    df_3 = pd.DataFrame({
        'Grades': pd.concat([subset.iloc[:, 0], subset.iloc[:, 1]], ignore_index=True),
        'PrePost': [1] * n_students + [0] * n_students,
        'T/C': pd.concat([subset.iloc[:, 2]] * 2, ignore_index=True),
    })
    df_3['DID'] = df_3['T/C'] * df_3['PrePost']

    # Every treated student appears twice (pre and post), hence the division.
    treat_num = df_3['T/C'].sum() / 2

    ## STEP 10: CARRY OUT THE REGRESSION USING STATSMODELS.API
    X = sm.add_constant(df_3[['PrePost', 'T/C', 'DID']])
    y = df_3['Grades']
    model = sm.OLS(y, X).fit()

    print(model.summary())
    print('Treat number:', treat_num)
    print('ADU attendance:', num_L, 'to:', num_U - 1)

    if return_model:
        return model
    # Legacy return value: the original returned the results of three print() calls.
    return None, None, None