## Import Data & Merge Dataframes

In [1]:
# Import dependencies
import pandas as pd

In [2]:
# Load the student info CSV into a dataframe and preview the first five rows
student_info_df = pd.read_csv("Resources/Student_Info.csv")
student_info_df.head()

Unnamed: 0,student_id,school_name,grade_level_2019
0,13898,School G,1
1,27795,School G,1
2,22938,School E,1
3,22431,School F,1
4,18048,School E,1


In [3]:
# Load the MAP scores CSV into a dataframe and preview the first five rows
map_scores_df = pd.read_csv("Resources/MAP_Scores.csv")
map_scores_df.head()

Unnamed: 0,student_id,year,map_term,subject,rit_score,percentile,quartile,typical_fall_to_spring_growth
0,13898,2019,Fall,Reading,161,59,3,18
1,27795,2019,Fall,Reading,158,50,3,19
2,22938,2019,Fall,Reading,162,62,3,18
3,22431,2019,Fall,Reading,162,62,3,18
4,18048,2019,Fall,Reading,166,73,3,17


In [4]:
# Load the special programs CSV into a dataframe and preview the first five rows
special_programs_df = pd.read_csv("Resources/Special_Programs.csv")
special_programs_df.head()

Unnamed: 0,student_id,program_name
0,13898,SPED
1,27795,SPED
2,22938,.
3,22431,.
4,18048,.


In [5]:
# Join student info to MAP scores on student_id.
# The outer join keeps ids that appear in only one of the two frames.
info_scores_df = student_info_df.merge(
    map_scores_df,
    how="outer",
    on="student_id",
)
info_scores_df.head()

Unnamed: 0,student_id,school_name,grade_level_2019,year,map_term,subject,rit_score,percentile,quartile,typical_fall_to_spring_growth
0,13898,School G,1,2019,Fall,Reading,161,59,3,18
1,13898,School G,1,2019,Fall,Math,166,69,3,20
2,13898,School G,1,2019,Spring,Reading,175,43,2,.
3,13898,School G,1,2019,Spring,Math,186,65,3,.
4,27795,School G,1,2019,Fall,Reading,158,50,3,19


In [6]:
# Attach special-program membership to the info/scores frame on student_id.
# The outer join keeps ids that appear in only one of the two frames.
kipp_maps_df = info_scores_df.merge(
    special_programs_df,
    how="outer",
    on="student_id",
)
kipp_maps_df.head()

Unnamed: 0,student_id,school_name,grade_level_2019,year,map_term,subject,rit_score,percentile,quartile,typical_fall_to_spring_growth,program_name
0,13898,School G,1,2019,Fall,Reading,161,59,3,18,SPED
1,13898,School G,1,2019,Fall,Math,166,69,3,20,SPED
2,13898,School G,1,2019,Spring,Reading,175,43,2,.,SPED
3,13898,School G,1,2019,Spring,Math,186,65,3,.,SPED
4,27795,School G,1,2019,Fall,Reading,158,50,3,19,SPED


## Basic Data Cleaning

In [7]:
# List the distinct school names present in the merged data
kipp_maps_df["school_name"].unique()

array(['School G', 'School E', 'School F', 'School B', 'School C',
       'School D', 'School A'], dtype=object)

In [8]:
# List the distinct grade levels present in the merged data
# QUESTION: Is 0 kindergarten?
kipp_maps_df["grade_level_2019"].unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 0], dtype=int64)

In [9]:
# List the distinct years present in the merged data
kipp_maps_df["year"].unique()

array([2019], dtype=int64)

In [10]:
# List the distinct MAP terms present in the merged data
kipp_maps_df["map_term"].unique()

array(['Fall', 'Spring'], dtype=object)

In [11]:
# List the distinct subject labels present in the merged data
kipp_maps_df["subject"].unique()

array(['Reading', 'Math', 'Read'], dtype=object)

In [12]:
# Count rows per subject label to see how many carry the 'Read' variant
kipp_maps_df["subject"].value_counts()

Math       5562
Reading    5454
Read        106
Name: subject, dtype: int64

In [13]:
# Normalise the truncated label 'Read' to 'Reading'.
# The replacement is limited to the subject column: a frame-wide replace would also
# rewrite any cell in another column whose value happens to be exactly 'Read'.
# The result is assigned back rather than using inplace=True.
kipp_maps_df["subject"] = kipp_maps_df["subject"].replace("Read", "Reading")

In [14]:
# Re-count rows per subject to confirm no 'Read' labels remain
kipp_maps_df["subject"].value_counts()

Math       5562
Reading    5560
Name: subject, dtype: int64

In [15]:
# List the distinct program_name labels, including any missing values
kipp_maps_df["program_name"].unique()

array(['SPED', '.', 'LEP', 'Tier 2', 'Gifted', nan, '504', 'SPED '],
      dtype=object)

In [16]:
# Label rows whose program_name is missing (NaN) as 'GenEd'.
# QUESTION: Is 'nan' a student with no special program?
# NOTE(review): the NaN values presumably come from students with no row in the
# special programs data after the outer merge — confirm.
missing_program = kipp_maps_df["program_name"].isna()
kipp_maps_df.loc[missing_program, "program_name"] = "GenEd"

In [17]:
# Re-list the program_name labels to confirm the missing values were filled
kipp_maps_df["program_name"].unique()

array(['SPED', '.', 'LEP', 'Tier 2', 'Gifted', 'GenEd', '504', 'SPED '],
      dtype=object)

In [18]:
# Replace the '.' placeholder in the program_name column with 'GenEd'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place form
# raises a FutureWarning in pandas 2.2 and leaves the frame unchanged when
# Copy-on-Write is enabled.
# NOTE(review): 'SPED ' (with a trailing space) is still a separate label after
# this step; stripping whitespace would merge it with 'SPED'.
kipp_maps_df["program_name"] = kipp_maps_df["program_name"].replace(".", "GenEd")

In [19]:
# Re-list the program_name labels to confirm the '.' placeholder is gone
kipp_maps_df["program_name"].unique()

array(['SPED', 'GenEd', 'LEP', 'Tier 2', 'Gifted', '504', 'SPED '],
      dtype=object)

In [20]:
# Show a random sample of 15 rows as a sanity check on the cleaned dataframe.
# A fixed random_state makes the same rows appear on every Restart & Run All.
kipp_maps_df.sample(n=15, random_state=42)

Unnamed: 0,student_id,school_name,grade_level_2019,year,map_term,subject,rit_score,percentile,quartile,typical_fall_to_spring_growth,program_name
9499,17343,School A,8,2019,Spring,Math,239,66,3,.,GenEd
6777,26285,School D,6,2019,Spring,Math,220,37,2,.,GenEd
2406,26420,School E,2,2019,Spring,Reading,206,87,4,.,GenEd
984,22618,School F,1,2019,Fall,Reading,153,34,2,20,GenEd
1053,25530,School F,1,2019,Fall,Math,170,79,4,19,GenEd
5259,24589,School A,5,2019,Spring,Math,205,16,1,.,GenEd
8455,19885,School D,8,2019,Fall,Math,244,81,4,3,Gifted
4365,20899,School A,5,2019,Fall,Math,210,46,2,10,GenEd
2525,20175,School G,2,2019,Fall,Math,176,36,2,13,GenEd
104,11465,School F,1,2019,Fall,Reading,153,34,2,20,GenEd
