# Imports

In [1]:
import pandas as pd 
import numpy as np 

# Get our prepared data

In [2]:
df = pd.read_csv('prepared_edu_dataframe.csv')

In [3]:
df.head()

Unnamed: 0,gender,parent_educ,lunch_type,test_prep,parent_marital_status,practice_sport,is_first_child,nr_siblings,transport_means,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,female,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74,72.0
1,female,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,69,90,88,82.0
2,female,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91,90.0
3,male,associate's degree,free/reduced,none,married,never,no,1.0,school_bus,5 - 10,45,56,42,48.0
4,male,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75,76.0


# Lets look at our categorical values and see if we can get them into a bianry repersentation.

In [4]:
df.gender.value_counts()

gender
female    15424
male      15217
Name: count, dtype: int64

    the distribution between males and females is pretty equal. so we will convert the columns to is_male

In [5]:
df.rename(columns= {'gender' : 'is_male'}, inplace= True)

In [6]:
gender_values = {'male': 1, 'female': 0}
df['is_male'] = df['is_male'].replace(gender_values)


In [7]:
df.head()

Unnamed: 0,is_male,parent_educ,lunch_type,test_prep,parent_marital_status,practice_sport,is_first_child,nr_siblings,transport_means,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,0,bachelor's degree,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74,72.0
1,0,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,69,90,88,82.0
2,0,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91,90.0
3,1,associate's degree,free/reduced,none,married,never,no,1.0,school_bus,5 - 10,45,56,42,48.0
4,1,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75,76.0


# Now lets do `parent_edu` and find a way to get  the values numeric

In [8]:
df.parent_educ.value_counts()

parent_educ
some college          8478
high school           5687
associate's degree    5550
some high school      5517
bachelor's degree     3386
master's degree       2023
Name: count, dtype: int64

In [9]:
# Lets try to see how college or no college works.

# well make the valies 1 where 1 repersents a college degree and 0 repersents a high school dipolma

# make a dictionary of values to chagege 
parents_edu_dict = {
    'some college' : 1 ,
    'associate\'s degree' : 1 ,
    'bachelor\'s degree' : 1 ,
    'master\'s degree' : 1 ,
    'some high school' : 0 ,
    'high school' : 0
}
# rename columns to better suit the values

In [10]:
df.rename(columns={'parent_educ': 'has_college_degree'}, inplace=True)

In [11]:
df['has_college_degree'] = df['has_college_degree'].map(parents_edu_dict)

In [12]:
df.head()

Unnamed: 0,is_male,has_college_degree,lunch_type,test_prep,parent_marital_status,practice_sport,is_first_child,nr_siblings,transport_means,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,0,1,standard,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74,72.0
1,0,1,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,69,90,88,82.0
2,0,1,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91,90.0
3,1,1,free/reduced,none,married,never,no,1.0,school_bus,5 - 10,45,56,42,48.0
4,1,1,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75,76.0


# Now we will convert `lunch_type` into a biary repersentation:

In [13]:
df.lunch_type.value_counts()

lunch_type
standard        19905
free/reduced    10736
Name: count, dtype: int64

In [14]:
# we will use free and reduce lunch to repersent a 1 while standard lunch is repersetned by a 0 
lunch_type_dict = {
    'free/reduced': 1 , 
    'standard': 1
}

df.rename(columns = {'lunch_type': 'free_reduced_lunch'}, inplace = True)
df['free_reduced_lunch'] = df.free_reduced_lunch.map(lunch_type_dict)

In [15]:
df.head()

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,test_prep,parent_marital_status,practice_sport,is_first_child,nr_siblings,transport_means,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,0,1,1,none,married,regularly,yes,3.0,school_bus,< 5,71,71,74,72.0
1,0,1,1,none,married,sometimes,yes,0.0,school_bus,5 - 10,69,90,88,82.0
2,0,1,1,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91,90.0
3,1,1,1,none,married,never,no,1.0,school_bus,5 - 10,45,56,42,48.0
4,1,1,1,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75,76.0


# Now on to `test_prep`

In [16]:
df.test_prep.value_counts()

test_prep
none         20686
completed     9955
Name: count, dtype: int64

In [17]:
test_prep_dict = {
    'none': 0,
    'completed' : 1
    
}

In [18]:
df.rename(columns={'test_prep': 'completed_test_prep'}, inplace=True)

In [19]:
df['completed_test_prep'] = df['completed_test_prep'].map(test_prep_dict)

In [20]:
df.head()

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,completed_test_prep,parent_marital_status,practice_sport,is_first_child,nr_siblings,transport_means,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,0,1,1,0,married,regularly,yes,3.0,school_bus,< 5,71,71,74,72.0
1,0,1,1,0,married,sometimes,yes,0.0,school_bus,5 - 10,69,90,88,82.0
2,0,1,1,0,single,sometimes,yes,4.0,school_bus,< 5,87,93,91,90.0
3,1,1,1,0,married,never,no,1.0,school_bus,5 - 10,45,56,42,48.0
4,1,1,1,0,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75,76.0


    HMMMM ive done the same code a few times lets functionize this 

In [21]:
def rename_mapping_fucntion(df , map_dict,old_col_name, new_col_name):
    '''
    df: data is the dataframe you are working with.
    map_dict: map_dict is a dictionary that maps original column values to new column values.
    old_col_name: original column name. This should be a string literal.
    new_col_name: new column name to replace the original column name. This should be a string literal.
    '''
    df.rename(columns= {old_col_name : new_col_name}, inplace = True)
    df[new_col_name] =df [new_col_name].map(map_dict)
    return df

# Now we work towards`parent_marital_status`

In [22]:
df.parent_marital_status.value_counts()

parent_marital_status
married     18034
single       7097
divorced     4919
widowed       591
Name: count, dtype: int64

In [23]:
# For the sake of group we will convert this to married or not married.
parent_marital_status_dict = {
    'married' : 1 ,
    'single' : 0 ,
    'divorced' : 0 , 
    'widowed' : 0 
}

In [24]:
rename_mapping_fucntion(df, map_dict= parent_marital_status_dict ,old_col_name = 'parent_marital_status', new_col_name = 'parents_married')

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,completed_test_prep,parents_married,practice_sport,is_first_child,nr_siblings,transport_means,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,0,1,1,0,1,regularly,yes,3.0,school_bus,< 5,71,71,74,72.0
1,0,1,1,0,1,sometimes,yes,0.0,school_bus,5 - 10,69,90,88,82.0
2,0,1,1,0,0,sometimes,yes,4.0,school_bus,< 5,87,93,91,90.0
3,1,1,1,0,1,never,no,1.0,school_bus,5 - 10,45,56,42,48.0
4,1,1,1,0,1,sometimes,yes,0.0,school_bus,5 - 10,76,78,75,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30636,0,0,1,0,0,sometimes,no,2.0,school_bus,5 - 10,59,61,65,62.0
30637,1,0,1,0,0,regularly,no,1.0,private,5 - 10,58,53,51,54.0
30638,0,0,1,1,1,sometimes,no,1.0,private,5 - 10,61,70,67,66.0
30639,0,1,1,1,1,regularly,no,3.0,school_bus,5 - 10,82,90,93,88.0


In [25]:
df.head()

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,completed_test_prep,parents_married,practice_sport,is_first_child,nr_siblings,transport_means,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,0,1,1,0,1,regularly,yes,3.0,school_bus,< 5,71,71,74,72.0
1,0,1,1,0,1,sometimes,yes,0.0,school_bus,5 - 10,69,90,88,82.0
2,0,1,1,0,0,sometimes,yes,4.0,school_bus,< 5,87,93,91,90.0
3,1,1,1,0,1,never,no,1.0,school_bus,5 - 10,45,56,42,48.0
4,1,1,1,0,1,sometimes,yes,0.0,school_bus,5 - 10,76,78,75,76.0


# Converting `practice_sport`

In [26]:
df.practice_sport.value_counts()

practice_sport
sometimes    15844
regularly    10793
never         4004
Name: count, dtype: int64

    We ideally want to break this down into two categories we can do practiced_sports and convert the never to 0 and gthe other ones to a 1 the only issue I see is this is going to unbalance this class. We are more than likely not going to use this column in this specific analysis as it does not pertain to parents social status. We will convert it in the event another analysis can use this column. 

In [27]:
practice_sport_dict = {
    'sometimes' : 1 , 
    'regularly' : 1 ,
    'never' : 0
}


In [28]:
rename_mapping_fucntion(df, map_dict= practice_sport_dict ,old_col_name = 'practice_sport', new_col_name = 'practiced_sport')

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,completed_test_prep,parents_married,practiced_sport,is_first_child,nr_siblings,transport_means,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,0,1,1,0,1,1,yes,3.0,school_bus,< 5,71,71,74,72.0
1,0,1,1,0,1,1,yes,0.0,school_bus,5 - 10,69,90,88,82.0
2,0,1,1,0,0,1,yes,4.0,school_bus,< 5,87,93,91,90.0
3,1,1,1,0,1,0,no,1.0,school_bus,5 - 10,45,56,42,48.0
4,1,1,1,0,1,1,yes,0.0,school_bus,5 - 10,76,78,75,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30636,0,0,1,0,0,1,no,2.0,school_bus,5 - 10,59,61,65,62.0
30637,1,0,1,0,0,1,no,1.0,private,5 - 10,58,53,51,54.0
30638,0,0,1,1,1,1,no,1.0,private,5 - 10,61,70,67,66.0
30639,0,1,1,1,1,1,no,3.0,school_bus,5 - 10,82,90,93,88.0


In [29]:
df.head()

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,completed_test_prep,parents_married,practiced_sport,is_first_child,nr_siblings,transport_means,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,0,1,1,0,1,1,yes,3.0,school_bus,< 5,71,71,74,72.0
1,0,1,1,0,1,1,yes,0.0,school_bus,5 - 10,69,90,88,82.0
2,0,1,1,0,0,1,yes,4.0,school_bus,< 5,87,93,91,90.0
3,1,1,1,0,1,0,no,1.0,school_bus,5 - 10,45,56,42,48.0
4,1,1,1,0,1,1,yes,0.0,school_bus,5 - 10,76,78,75,76.0


In [30]:
is_first_child_dict = {
    'yes' : 1 ,
    'no' : 0
}

In [31]:
rename_mapping_fucntion(df, map_dict= is_first_child_dict ,old_col_name = 'is_first_child', new_col_name = 'is_first_child')

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,completed_test_prep,parents_married,practiced_sport,is_first_child,nr_siblings,transport_means,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,0,1,1,0,1,1,1,3.0,school_bus,< 5,71,71,74,72.0
1,0,1,1,0,1,1,1,0.0,school_bus,5 - 10,69,90,88,82.0
2,0,1,1,0,0,1,1,4.0,school_bus,< 5,87,93,91,90.0
3,1,1,1,0,1,0,0,1.0,school_bus,5 - 10,45,56,42,48.0
4,1,1,1,0,1,1,1,0.0,school_bus,5 - 10,76,78,75,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30636,0,0,1,0,0,1,0,2.0,school_bus,5 - 10,59,61,65,62.0
30637,1,0,1,0,0,1,0,1.0,private,5 - 10,58,53,51,54.0
30638,0,0,1,1,1,1,0,1.0,private,5 - 10,61,70,67,66.0
30639,0,1,1,1,1,1,0,3.0,school_bus,5 - 10,82,90,93,88.0


In [32]:
df.transport_means.value_counts()

transport_means
school_bus    19279
private       11362
Name: count, dtype: int64

In [33]:
transport_means_dict = {
    'school_bus' : 1,
    'private' : 0
}

In [34]:
rename_mapping_fucntion(df, map_dict= transport_means_dict ,old_col_name = 'transport_means', new_col_name = 'rides_bus')

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,completed_test_prep,parents_married,practiced_sport,is_first_child,nr_siblings,rides_bus,wkly_study_hours,math_score,reading_score,writing_score,final_score
0,0,1,1,0,1,1,1,3.0,1,< 5,71,71,74,72.0
1,0,1,1,0,1,1,1,0.0,1,5 - 10,69,90,88,82.0
2,0,1,1,0,0,1,1,4.0,1,< 5,87,93,91,90.0
3,1,1,1,0,1,0,0,1.0,1,5 - 10,45,56,42,48.0
4,1,1,1,0,1,1,1,0.0,1,5 - 10,76,78,75,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30636,0,0,1,0,0,1,0,2.0,1,5 - 10,59,61,65,62.0
30637,1,0,1,0,0,1,0,1.0,0,5 - 10,58,53,51,54.0
30638,0,0,1,1,1,1,0,1.0,0,5 - 10,61,70,67,66.0
30639,0,1,1,1,1,1,0,3.0,1,5 - 10,82,90,93,88.0


In [35]:
df.wkly_study_hours.value_counts()

wkly_study_hours
5 - 10    17201
< 5        8238
> 10       5202
Name: count, dtype: int64

In [36]:
# we may be bale to rund pandas dummy values to accomplish the breakdown of these values
df = pd.get_dummies(df,columns= ['wkly_study_hours'], dtype= int, drop_first= True)

In [37]:
df.head()

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,completed_test_prep,parents_married,practiced_sport,is_first_child,nr_siblings,rides_bus,math_score,reading_score,writing_score,final_score,wkly_study_hours_< 5,wkly_study_hours_> 10
0,0,1,1,0,1,1,1,3.0,1,71,71,74,72.0,1,0
1,0,1,1,0,1,1,1,0.0,1,69,90,88,82.0,0,0
2,0,1,1,0,0,1,1,4.0,1,87,93,91,90.0,1,0
3,1,1,1,0,1,0,0,1.0,1,45,56,42,48.0,0,0
4,1,1,1,0,1,1,1,0.0,1,76,78,75,76.0,0,0


    Now to avoid potiental data leakage, we are going to drop math writing and reading scores

In [38]:
df.drop(columns= ['math_score', 'reading_score', 'writing_score'], inplace = True)

In [39]:
df.head()

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,completed_test_prep,parents_married,practiced_sport,is_first_child,nr_siblings,rides_bus,final_score,wkly_study_hours_< 5,wkly_study_hours_> 10
0,0,1,1,0,1,1,1,3.0,1,72.0,1,0
1,0,1,1,0,1,1,1,0.0,1,82.0,0,0
2,0,1,1,0,0,1,1,4.0,1,90.0,1,0
3,1,1,1,0,1,0,0,1.0,1,48.0,0,0
4,1,1,1,0,1,1,1,0.0,1,76.0,0,0


# Now to conclude preprocessing we are going to try to get the risk scores now.

In [40]:
df.final_score.min()

9.0

In [41]:
df.final_score.max()

100.0

    We will use the stadard grading scale to manually identify students who failed or had a grade below 75, while a 75 is passing it also identifies students who may have struggled to maintain their grade

In [42]:
df['risk_cat'] = 0  # Initialize the 'risk_cat' column with 0

# Assign 1 to 'risk_cat' if 'final_score' is greater than or equal to 75.0
df.loc[df['final_score'] <= 75.0, 'risk_cat'] = 1

In [43]:
df.head()

Unnamed: 0,is_male,has_college_degree,free_reduced_lunch,completed_test_prep,parents_married,practiced_sport,is_first_child,nr_siblings,rides_bus,final_score,wkly_study_hours_< 5,wkly_study_hours_> 10,risk_cat
0,0,1,1,0,1,1,1,3.0,1,72.0,1,0,1
1,0,1,1,0,1,1,1,0.0,1,82.0,0,0,0
2,0,1,1,0,0,1,1,4.0,1,90.0,1,0,0
3,1,1,1,0,1,0,0,1.0,1,48.0,0,0,1
4,1,1,1,0,1,1,1,0.0,1,76.0,0,0,0


## Since we used final score we are dropping that column as well. 

In [44]:
df.drop(columns='final_score', inplace=True)

In [45]:
df.to_csv('preprocessed_edu_data.csv', index=False)

In [46]:
df.to_excel('preprocessed_edu_data.xlsx', index=False)

# Conclusion
### In this notebook, we performed several preprocessing steps on the educational data. The steps included:

1. **Data Cleaning:** We renamed several columns for better understanding and replaced their values to binary (0 and 1) for easier processing in future analysis.
2. **Feature Engineering:** We transformed categorical variables into binary representation. This included columns like 'gender', 'parent_educ', 'lunch_type', 'test_prep', 'parent_marital_status', 'practice_sport', 'is_first_child', 'transport_means', and 'wkly_study_hours'.
3. **Risk Category:** We created a new column 'risk_cat' to identify students who had a final score less than or equal to 75. This will help in identifying students who might be at risk academically.
4. **Data Export:** Finally, we exported the preprocessed data into a new CSV file for further analysis.

### These preprocessing steps will help in the next steps of our analysis where we will build predictive models to identify students at risk based on social characteristics.