In [1]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6


In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import copy
import os
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.tuner import HyperparameterTuner
import copy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import math
import sys
import statistics
import time
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Clear out every CSV file sitting in the notebook's current working directory,
# printing each file name before it is deleted.
current_directory=os.getcwd()
files = os.listdir(current_directory)
stale_csv_names = [entry for entry in files if entry.endswith('.csv')]
for stale_name in stale_csv_names:
    print(stale_name)
    os.remove(os.path.join(current_directory, stale_name))
        
# Create an S3 client object
s3_client= boto3.client('s3')

#Delete all files in the output bucket
# A single list_objects_v2 call returns at most 1000 keys, so every page is
# walked; each page is also small enough for one delete_objects request.
bucket_name = 'higley-output-bucket'
deleted_any = False
paginator = s3_client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name):
    objects_to_delete = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
    if objects_to_delete:
        s3_client.delete_objects(Bucket=bucket_name, Delete={'Objects': objects_to_delete})
        deleted_any = True
if deleted_any:
    print("All files deleted successfully.")
else:
    print("No files found in the bucket.")

# Run configuration.
# NOTE(review): despite its name, output_bucket_name holds the *input* bucket; it is not
# referenced again in the visible cells - confirm whether it can be removed or corrected.
output_bucket_name='higley-input-bucket'
# prefix, key, Training_file, Validation_file, testing_size and validation_size are not
# referenced in the visible cells - presumably leftovers from a SageMaker training setup.
prefix = 'Artifacts'
key = 'XGBoost-Regressor'
# Directory on the notebook instance that the upload paths further down are built from.
current_path='/home/ec2-user/SageMaker/'
Training_file='train.csv'
Validation_file='validation.csv'
testing_size=0.15
validation_size=0.5
# Rebinds bucket_name (the output bucket that was emptied above) to the input bucket;
# download_file reads this global when it is called.
bucket_name='higley-input-bucket'

#Function to download files to the sagemaker instance Path
def download_file(file_name):
    """Download one object from S3 onto the notebook instance.

    Parameters
    ----------
    file_name : str
        Key of the object inside the bucket named by the global ``bucket_name``.
        The same string is used as the local file name under ``current_path``.

    Returns
    -------
    str or None
        Local path of the downloaded file, or ``None`` when S3 answers with a
        404 (a message is printed in that case).

    Raises
    ------
    ClientError
        Any S3 client error other than a 404 is re-raised unchanged.
    """
    path=current_path+file_name
    try:
        s3_client.download_file(bucket_name, file_name,path)
    # The notebook never imports botocore, so naming botocore.exceptions.ClientError
    # here would itself fail with NameError; the client exposes the same class.
    except s3_client.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
            return None
        raise
    return path
            
#Download New Intakes By School Dataset
files_path=download_file('new_intakes_by_school_raw.csv')
intakes=pd.read_csv(files_path)
intakes=intakes.rename(columns={'Year':'YEAR'})

#Label Encoding School Names
le = LabelEncoder()
intakes_schools=list(intakes['School'])
intakes['School'] = le.fit_transform(intakes['School'])
encoded_schools=list(intakes['School'])
# school_map: encoded school id -> original school name (first occurrence wins);
# it is used later to turn predictions back into readable school names.
school_map={}
for encoded_school, original_school in zip(encoded_schools, intakes_schools):
    school_map.setdefault(encoded_school, original_school)


def _encode_grade(grade):
    """Map a raw grade label to its numeric code: 'PS' -> 13, 'KG' -> 14, otherwise int(grade)."""
    if grade=='PS':
        return 13
    if grade=='KG':
        return 14
    return int(grade)


# The column is built in one assignment. The previous per-row form
# intakes['GRADE'][index] = ... is chained assignment, which pandas does not
# guarantee to write back into the frame.
intakes['GRADE']=intakes['Grade'].map(_encode_grade).astype('int64')

intakes=intakes.drop('Grade',axis=1)
data=copy.deepcopy(intakes)
unique_years=sorted(data['YEAR'].unique())
latest_year=max(unique_years)
#Download Prediction Dataset
files_path=download_file('prediction_dataset.csv')
prediction=pd.read_csv(files_path)
# Rows of the most recent year, without year and target, are the inputs to predict from.
prediction_data=data[data['YEAR']==latest_year]
prediction_data=prediction_data.drop(['YEAR','New_Intake'],axis=1)

next_year=latest_year+1
#Download Non-NDA Dataset and get highly correlating parameters to train the model
non_nda_files_path=download_file('non_nda_dataset.csv')
non_nda_data=pd.read_csv(non_nda_files_path)
correlation_matrix = non_nda_data.corr()
column_of_interest = 'Enrollments_Count'
columns_above_threshold = correlation_matrix.loc[:, correlation_matrix[column_of_interest] > 0.7].columns.tolist()
# Drop the year and the target itself by filtering: list.remove() raised
# ValueError whenever 'Year' did not clear the 0.7 threshold.
columns_above_threshold = [
    column_name for column_name in columns_above_threshold
    if column_name not in ('Year', 'Enrollments_Count')
]
print(columns_above_threshold)
# Look up next year's row once and reuse it for both the feature columns and values_list.
next_year_rows=prediction[prediction['Year']==next_year]
values_list=[list(next_year_rows[column_name])[0] for column_name in columns_above_threshold]
for column_name, next_year_value in zip(columns_above_threshold, values_list):
    # The same scalar is broadcast to every row of prediction_data.
    prediction_data[column_name]=next_year_value

print(unique_years)
print(values_list)

#Model to compute New Intakes - Model 3 as per Bucket Enrollment Prediction Model
# Hyper-parameters shared by every regressor fitted in this section.
regressor_settings=dict(objective='reg:squarederror', n_estimators=100, max_depth=3,
                        learning_rate=0.1, colsample_bytree=0.7, subsample=0.7)

# Pass 1: fit on a single year and score on all the other years.
# The numbers stored under 'TrainAccuracy' / 'TestAccuracy' are mean squared errors.
years_list={}
for year in unique_years:
    modified_data=copy.deepcopy(data)
    train_data=modified_data[modified_data['YEAR']==year].drop('YEAR',axis=1)
    test_data=modified_data[modified_data['YEAR']!=year].drop('YEAR',axis=1)

    X_train, y_train = train_data.drop(columns=['New_Intake']), train_data['New_Intake']
    X_test, y_test = test_data.drop(columns=['New_Intake']), test_data['New_Intake']

    model=XGBRegressor(**regressor_settings)
    model.fit(X_train, y_train)
    predict=model.predict(X_test)

    year_scores=years_list.setdefault(year, {})

    print('Training')
    predict_train=model.predict(X_train)
    year_scores['TrainAccuracy']=mean_squared_error(y_train, predict_train)

    print('Test')
    year_scores['TestAccuracy']=mean_squared_error(y_test, predict)

# One row per year, ordered from lowest to highest test MSE.
df=pd.DataFrame(years_list).T
df_sorted=df.sort_values(by='TestAccuracy', ascending=True)

# A quarter of the years (rounded down) is held out; the rest is used for fitting.
testing_count=len(unique_years)//4
training_count=len(unique_years)-testing_count

print(training_count,testing_count)

ranked_years=list(df_sorted.index)
training_years=ranked_years[:training_count]
print(training_years)
testing_years=ranked_years[training_count:]
print(testing_years)

# Pass 2: final regressor fitted on the selected training years.
modified_data=copy.deepcopy(data)
in_training_years=modified_data['YEAR'].isin(training_years)
in_testing_years=modified_data['YEAR'].isin(testing_years)
train_data=modified_data[in_training_years].drop('YEAR',axis=1)
test_data=modified_data[in_testing_years].drop('YEAR',axis=1)

X_train, y_train = train_data.drop(columns=['New_Intake']), train_data['New_Intake']
X_test, y_test = test_data.drop(columns=['New_Intake']), test_data['New_Intake']

model=XGBRegressor(**regressor_settings)
model.fit(X_train, y_train)

print('Training')
predict_train=model.predict(X_train)
training_mse=mean_squared_error(y_train, predict_train)
print("MSE = {}".format(training_mse))

print('Testing')
predict=model.predict(X_test)
testing_mse=mean_squared_error(y_test, predict)
print("MSE = {}".format(testing_mse))

# Predict next year's intake per school/grade; round up, and clamp non-positive values to 0.
result=list(model.predict(prediction_data))
result=[math.ceil(num) if num > 0 else 0 for num in result]

prediction_data['New_Intake']=result


def _decode_grade(grade):
    """Turn a numeric grade code back into its label: 13 -> 'PS', 14 -> 'KG', otherwise str(grade)."""
    if grade==13:
        return 'PS'
    if grade==14:
        return 'KG'
    return str(grade)


# Decode whole columns at once. The previous per-row chained assignment
# (prediction_data['School'][index] = ...) is not guaranteed by pandas to write
# back, and it pushed strings into an integer column one cell at a time.
# Indexing school_map directly keeps the KeyError for an unknown school code.
prediction_data['School']=prediction_data['School'].map(lambda school_code: school_map[school_code])
prediction_data['GRADE']=prediction_data['GRADE'].map(_decode_grade)
#Storing the prediction result
prediction_data=prediction_data[['School','GRADE','New_Intake']]
target_file='Future_Intakes.csv'
target_path=current_path+target_file
prediction_data.to_csv(target_path,index=False)
#Uploading the result into S3 Bucket
upload_bucket='higley-input-bucket'
s3_client.upload_file(target_path, upload_bucket, target_file)

In [None]:
#Downloading the preprocessed data
files_path=download_file('new_data.csv')
data=pd.read_csv(files_path)
data=data.drop('ACTIVITY',axis=1)

#Repeat Schools
# REPEAT_SCHOOL is 1 when the same PERSON_GU has a row for Year+1 whose school
# (the first such row in file order) equals this row's school, otherwise 0.
# A single-pass lookup replaces the per-row rescan of the whole frame and the
# chained assignment data['REPEAT_SCHOOL'][index] = 1.
identified_rows=data[data['PERSON_GU'].notna()]  # rows without an id can never be matched
first_school_by_person_year={}
for person_name, year, school_name in zip(identified_rows['PERSON_GU'], identified_rows['Year'], identified_rows['SCHOOL']):
    first_school_by_person_year.setdefault((person_name, year), school_name)

repeat_school_flags=[
    # -1 is the "no record for next year" sentinel, as in the original per-row loop.
    1 if school_name==first_school_by_person_year.get((person_name, year+1), -1) else 0
    for person_name, year, school_name in zip(data['PERSON_GU'], data['Year'], data['SCHOOL'])
]
data['REPEAT_SCHOOL']=pd.Series(repeat_school_flags, index=data.index, dtype='int64')

data_copy=copy.deepcopy(data)
# Explicit copy: columns are assigned on prediction_data later, so it must not be
# treated as a slice of data_copy. latest_year comes from the intake cell above.
prediction_data=data_copy[data_copy['Year']==latest_year].copy()
modified_prediction=prediction_data.drop(['PERSON_GU','Year','REPEAT','REPEAT_SCHOOL'],axis=1)

# Candidate years for model fitting: every year except the most recent one in this dataset.
unique_years=sorted(data['Year'].unique())
unique_years.pop()
print(unique_years)

#Model 1 as per Bucket Enrollment Prediction Model

def _split_features_target(rows, target):
    """Drop the 'Year' column from ``rows`` and return ``(X, y)`` for ``target``."""
    rows=rows.drop('Year',axis=1)
    return rows.drop(columns=[target]), rows[target]


def _fit_year_selected_classifier(frame, target, other_target, candidate_years):
    """Fit the XGBoost classifier for one repeat target and print its scores.

    The same two-pass procedure was previously written out twice (once for
    'REPEAT', once for 'REPEAT_SCHOOL'); it now lives here.

    Pass 1 fits a classifier on each single year of ``candidate_years`` and
    scores it on all rows of every other year. Years are then ranked by that
    test accuracy (best first); the last ``len(candidate_years)//4`` become
    test years and the rest training years. Pass 2 fits the final classifier on
    the training years and prints precision/recall/accuracy for both splits.

    Parameters
    ----------
    frame : pandas.DataFrame
        Student rows containing 'PERSON_GU', 'Year', ``target`` and ``other_target``.
    target : str
        Column to predict.
    other_target : str
        The other label column; dropped so it is not used as a feature.
    candidate_years : list
        Years eligible for the year-selection pass.

    Returns
    -------
    XGBClassifier
        The classifier fitted in pass 2.
    """
    modified_data=frame.drop(['PERSON_GU',other_target],axis=1)

    # NOTE(review): the pass-1 test split is "every other year", which also includes
    # years that are not in candidate_years - confirm this is intended.
    years_list={}
    for year in candidate_years:
        print(year)
        X_train, y_train = _split_features_target(modified_data[modified_data['Year']==year], target)
        X_test, y_test = _split_features_target(modified_data[modified_data['Year']!=year], target)
        model = XGBClassifier(learning_rate=0.01, n_estimators=500,max_depth=20)
        model.fit(X_train, y_train)
        predict = model.predict(X_test)
        predict_train = model.predict(X_train)
        years_list[year]={
            'TrainPrecision': precision_score(y_train, predict_train),
            'TrainRecall': recall_score(y_train, predict_train),
            'TrainAccuracy': accuracy_score(y_train, predict_train),
            'TestPrecision': precision_score(y_test, predict),
            'TestRecall': recall_score(y_test, predict),
            'TestAccuracy': accuracy_score(y_test, predict),
        }

    df=pd.DataFrame(years_list).transpose()
    df_sorted = df.sort_values('TestAccuracy', ascending=False)
    testing_count=len(candidate_years)//4
    training_count=len(candidate_years)-testing_count
    print(training_count,testing_count)
    training_years=list(df_sorted.iloc[:training_count].index)
    print(training_years)
    testing_years=list(df_sorted.iloc[training_count:].index)
    print(testing_years)

    X_train, y_train = _split_features_target(modified_data[modified_data['Year'].isin(training_years)], target)
    X_test, y_test = _split_features_target(modified_data[modified_data['Year'].isin(testing_years)], target)
    model = XGBClassifier(learning_rate=0.01, n_estimators=500,max_depth=20)
    model.fit(X_train, y_train)
    predict = model.predict(X_test)
    predict_train = model.predict(X_train)
    print('Training')
    print("Precision = {}".format(precision_score(y_train, predict_train)))
    print("Recall = {}".format(recall_score(y_train, predict_train)))
    print("Accuracy = {}".format(accuracy_score(y_train, predict_train)))
    print('Testing')
    print("Precision = {}".format(precision_score(y_test, predict)))
    print("Recall = {}".format(recall_score(y_test, predict)))
    print("Accuracy = {}".format(accuracy_score(y_test, predict)))
    return model


#Model for Repeat Students
model=_fit_year_selected_classifier(data, 'REPEAT', 'REPEAT_SCHOOL', unique_years)
# Predicted labels for the latest year overwrite the REPEAT column of prediction_data.
predict = model.predict(modified_prediction)
prediction_data['REPEAT']=predict

#Model for Repeat Schools
model=_fit_year_selected_classifier(data, 'REPEAT_SCHOOL', 'REPEAT', unique_years)
predict = model.predict(modified_prediction)
prediction_data['REPEAT_SCHOOL']=predict
modified_prediction_data=prediction_data[['PERSON_GU','SCHOOL','GRADE','REPEAT','REPEAT_SCHOOL']]

#Model 2 as per Bucket Enrollment Prediction Model
#Download School Map Encoded
files_path=download_file('school_map.csv')
school_map=pd.read_csv(files_path)
enrollments=copy.deepcopy(data_copy)

#Compute the grade school pair
# grade -> list of distinct schools seen with that grade (across all years), in first-seen order.
grade_school_pair={}
for school, grade in zip(enrollments['SCHOOL'], enrollments['GRADE']):
    schools_for_grade=grade_school_pair.setdefault(grade, [])
    if school not in schools_for_grade:
        schools_for_grade.append(school)
        
def _add_student(counts, school_label, grade_label):
    """Increment counts[school_label][grade_label], creating the entries when missing."""
    per_grade=counts.setdefault(school_label, {})
    per_grade[grade_label]=per_grade.get(grade_label, 0)+1


# Loop invariants, computed once instead of deep-copying data_copy for every
# student and again for every look-alike student.
history=data_copy[data_copy['Year']!=latest_year]
# (PERSON_GU, GRADE) -> school of that person's first row in that grade, in data_copy order.
identified_rows=data_copy[data_copy['PERSON_GU'].notna()]
school_by_person_grade={}
for person, attended_grade, attended_school in zip(identified_rows['PERSON_GU'], identified_rows['GRADE'], identified_rows['SCHOOL']):
    school_by_person_grade.setdefault((person, attended_grade), attended_school)

# result: school name -> {current grade code -> number of returning students placed there}.
result={}
for person_name, grade, school, repeat_school, repeat in zip(
        modified_prediction_data['PERSON_GU'],
        modified_prediction_data['GRADE'],
        modified_prediction_data['SCHOOL'],
        modified_prediction_data['REPEAT_SCHOOL'],
        modified_prediction_data['REPEAT']):
    next_grade=grade+1
    if next_grade==15:
        # Grade code 14 wraps around to grade 1.
        next_grade=1
    # Only students predicted to return, and not currently in grade 12, are placed.
    if not (repeat==1 and grade!=12):
        continue

    school_name=school_map[str(school)][0]
    # .get(): a next grade that no school has on record falls through to the
    # look-alike search below instead of raising KeyError.
    if repeat_school==1 and school in grade_school_pair.get(next_grade, ()):
        _add_student(result, school_name, grade)
        continue

    # Otherwise find historical students resembling this one and see where they
    # went for next_grade. The student's attributes come from their first row in
    # prediction_data, looked up once.
    person_rows=prediction_data[prediction_data['PERSON_GU']==person_name]
    profile=[('REPEAT', repeat), ('REPEAT_SCHOOL', repeat_school), ('GRADE', grade), ('SCHOOL', school)]
    profile+=[
        (column, list(person_rows[column])[0])
        for column in ('PERFORMANCE', 'ADA', 'ETHNICITY_RACE', 'CITY', 'FRM', 'SPED')
    ]

    # Narrow the history one attribute at a time and stop at the first attribute
    # that would leave no rows, keeping the last non-empty selection.
    sample_data=history
    for column, wanted in profile:
        narrowed=sample_data[sample_data[column]==wanted]
        if narrowed.shape[0]==0:
            break
        sample_data=narrowed

    people=list(sample_data['PERSON_GU'])
    school_list=[
        school_by_person_grade[(person, next_grade)]
        for person in people
        if (person, next_grade) in school_by_person_grade
    ]
    if len(school_list)!=0:
        # Most common destination school among the look-alikes.
        new_school=statistics.mode(school_list)
        new_school_name=school_map[str(new_school)][0]
        _add_student(result, new_school_name, grade)
    else:
        # No look-alike has a next_grade record: keep the student at the current school.
        _add_student(result, school_name, grade)
     
#Higley Schools Grade and School pair
# School name -> grade labels offered there (zero-padded here, normalised below).
elementary_grades=['KG', '01', '02', '03', '04', '05', '06']
middle_grades=['07', '08']
high_grades=['09', '10', '11', '12']
grade_school_pair={}
for campus in ['Bridges Elementary', 'Centennial Elementary', 'Chaparral Elementary',
               'Coronado Elementary', 'Cortina Elementary', 'Gateway Pointe Elementary',
               'Higley Traditional Academy', 'Power Ranch Elementary', 'San Tan Elementary']:
    grade_school_pair[campus]=list(elementary_grades)
for campus in ['Cooley Middle School', 'Sossaman Middle School']:
    grade_school_pair[campus]=list(middle_grades)
for campus in ['Higley High School', 'Williams Field High School']:
    grade_school_pair[campus]=list(high_grades)
grade_school_pair['Higley Virtual Academy']=middle_grades+high_grades


def _next_year_label(column):
    """Rename a current-grade code to next year's label: 1..11 -> str(code+1), 13 -> 'KG', 14 -> '1'."""
    if column in range(1,12):
        return str(column+1)
    if column==13:
        return 'KG'
    if column==14:
        return str(1)
    return column


# Rows are schools, columns are grades; the counts were keyed by current grade,
# so the columns are relabelled to the grade the students move into.
higley_enrollment=pd.DataFrame(result).transpose()
data_columns=[_next_year_label(column) for column in list(higley_enrollment.columns)]
higley_enrollment.columns=data_columns
higley_enrollment=higley_enrollment.fillna(0)
# The KG column is reset to zero for every school.
higley_enrollment['KG']=0

list_a=list(grade_school_pair.keys())
list_b=list(higley_enrollment.index)

common_schools=set(list_a) & set(list_b)
different_schools=set(list_b) - common_schools

# Strip the zero padding from the grade labels ('01' -> '1'); 'KG' stays as is.
grade_school_pair={
    campus: [label if label=='KG' else str(int(label)) for label in labels]
    for campus, labels in grade_school_pair.items()
}

# Zero every grade a known school does not offer.
for school in common_schools:
    offered_grades=grade_school_pair[school]
    for grade in higley_enrollment.columns:
        if grade not in offered_grades:
            higley_enrollment.loc[school,grade]=0

# Remove schools that are not in the reference list above.
higley_enrollment=higley_enrollment.drop(index=list(different_schools))

repeating_students_df=copy.deepcopy(higley_enrollment)
repeating_students=higley_enrollment.sum().sum()
print(repeating_students)

# School name -> numeric grade codes offered there ('KG' is encoded as 14).
schools_by_grade_band=[
    (['Bridges Elementary', 'Centennial Elementary', 'Chaparral Elementary',
      'Coronado Elementary', 'Cortina Elementary', 'Gateway Pointe Elementary',
      'Higley Traditional Academy', 'Power Ranch Elementary', 'San Tan Elementary'],
     ['KG', '01', '02', '03', '04', '05', '06']),
    (['Cooley Middle School', 'Sossaman Middle School'],
     ['07', '08']),
    (['Higley High School', 'Williams Field High School'],
     ['09', '10', '11', '12']),
    (['Higley Virtual Academy'],
     ['07', '08', '09', '10', '11', '12']),
]
reference_map={}
for campuses, grade_labels in schools_by_grade_band:
    for campus in campuses:
        reference_map[campus]=[14 if label=='KG' else int(label) for label in grade_labels]
    
enrollments=copy.deepcopy(data_copy)
enrollments=enrollments[enrollments['Year']==latest_year]

# grade code -> distinct school codes seen with that grade in the latest year.
grade_school_pair={}
for school, grade in zip(enrollments['SCHOOL'], enrollments['GRADE']):
    schools_for_grade=grade_school_pair.setdefault(grade, [])
    if school not in schools_for_grade:
        schools_for_grade.append(school)

# school_map has one column per school: the column label is the school code and
# the first row holds the school name.
reverse_school_map={}   # school name -> column label (code as read from the CSV)
inverse_school_map={}   # integer school code -> school name
for school_code in school_map.columns:
    school_name=list(school_map[school_code])[0]
    reverse_school_map[school_name]=school_code
    try:
        inverse_school_map[int(school_code)]=school_name
    except (TypeError, ValueError):
        # Column labels that are not integers have no numeric code and are left
        # out of inverse_school_map. Only conversion failures are ignored;
        # any other error now surfaces instead of being swallowed.
        pass
intakes_count=0

# Keep, per grade, only the schools that reference_map lists as offering that grade.
# A school code without a name in inverse_school_map cannot be in reference_map,
# so it is dropped rather than raising KeyError.
for key,val in grade_school_pair.items():
    grade_school_pair[key]=[
        value for value in val
        if key in reference_map.get(inverse_school_map.get(value), ())
    ]
    
# Read the intake predictions from the location they were written to
# ('/home/ec2-user/SageMaker/', i.e. current_path), not from whatever the
# working directory happens to be.
new_intake=pd.read_csv(current_path+'Future_Intakes.csv')[['School','GRADE','New_Intake']]

#abnormal records
# (school, grade) pairs that cannot be placed in higley_enrollment are collected
# here instead of being added to the totals.
abnormal=[]
for school_name, grade, count in zip(new_intake['School'], new_intake['GRADE'], new_intake['New_Intake']):
    # The enrolment columns are string labels; if the CSV held only numeric
    # grades pandas would parse them as integers and nothing would match.
    grade=str(grade)
    if grade=='PS':
        grade_modified=13
    elif grade=='KG':
        grade_modified=14
    else:
        grade_modified=int(grade)

    # A school missing from the map, or mapped to a non-numeric label, is
    # treated as abnormal rather than raising.
    school=reverse_school_map.get(school_name)
    try:
        school_code=int(school)
    except (TypeError, ValueError):
        school_code=None

    placeable=(
        school_code in grade_school_pair.get(grade_modified, ())
        and grade in higley_enrollment.columns
        and school_name in higley_enrollment.index
    )
    if placeable:
        higley_enrollment.loc[school_name,grade]+=count
        intakes_count+=count
    else:
        abnormal.append((school_name,grade))
print(intakes_count)

#post processing to upload data in the output buckets
total_students_df=copy.deepcopy(higley_enrollment)
df1=copy.deepcopy(repeating_students_df)
df2=copy.deepcopy(total_students_df)
# New-intake students per school/grade = total minus returning students.
result = df2.subtract(df1)
result=result.fillna(0)


def _as_export_table(frame):
    """Return ``frame`` with 'Grade_'-prefixed columns and the school index as a 'School' column."""
    exported=frame.copy()
    exported.columns=['Grade_'+column for column in exported.columns]
    exported=exported.reset_index()
    return exported.rename(columns={'index':'School'})


repeating_students_df=_as_export_table(repeating_students_df)
total_students_df=_as_export_table(total_students_df)
result=_as_export_table(result)

# (file name, S3 folder, table) for each result set; folder names are kept exactly as before.
output_bucket='higley-output-bucket'
exports=[
    ('repeating_students_higley.csv', 'repeating-students/', repeating_students_df),
    ('total_students_higley.csv', 'total_students/', total_students_df),
    ('new_intake_students_higley.csv', 'new_intake-students/', result),
]

# Write each CSV to the path it is uploaded from. Previously the files were
# written relative to the working directory but uploaded from current_path,
# which only worked when the two were the same directory.
for target_file, folder_path, export_frame in exports:
    target_path=current_path+target_file
    export_frame.to_csv(target_path,index=False)

for target_file, folder_path, export_frame in exports:
    target_path=current_path+target_file
    s3_client.upload_file(target_path, output_bucket, folder_path+target_file)

In [None]:
#invoke lambda function for post processing
import json
# Client for the AWS Lambda service.
lambda_client = boto3.client('lambda')
function_name ='bottomup-prediction'
# Event body sent to the function (placeholder key/value pairs).
payload = dict(key1='value1', key2='value2')
# InvocationType='Event' asks Lambda to run the function asynchronously.
response = lambda_client.invoke(FunctionName=function_name, InvocationType='Event', Payload=json.dumps(payload))
# Last expression of the cell, so the notebook displays the service response.
response