In [1]:
import acquire as a
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = a.acquire_edu_data()

## First thing I want to do is standardize all the column names. 

In [3]:
df.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Gender', 'EthnicGroup', 'ParentEduc',
       'LunchType', 'TestPrep', 'ParentMaritalStatus', 'PracticeSport',
       'IsFirstChild', 'NrSiblings', 'TransportMeans', 'WklyStudyHours',
       'MathScore', 'ReadingScore', 'WritingScore'],
      dtype='object')

    Key takaways:
    - there is a combination of upper and lowercase letters
    - Some columns have sapces 
    - unnamed columns are mirror of index so we can remove those columns. 
    

In [4]:
# aets look at what our data is like 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30641 entries, 0 to 30640
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0.1         30641 non-null  int64  
 1   Unnamed: 0           30641 non-null  int64  
 2   Gender               30641 non-null  object 
 3   EthnicGroup          28801 non-null  object 
 4   ParentEduc           28796 non-null  object 
 5   LunchType            30641 non-null  object 
 6   TestPrep             28811 non-null  object 
 7   ParentMaritalStatus  29451 non-null  object 
 8   PracticeSport        30010 non-null  object 
 9   IsFirstChild         29737 non-null  object 
 10  NrSiblings           29069 non-null  float64
 11  TransportMeans       27507 non-null  object 
 12  WklyStudyHours       29686 non-null  object 
 13  MathScore            30641 non-null  int64  
 14  ReadingScore         30641 non-null  int64  
 15  WritingScore         30641 non-null 

In [5]:
# lets addess the first two with one line of code
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [6]:
# now for the unnamed columns
df.drop(columns=[c for c in df.columns if 'unnamed' in c],inplace=True)


## Here we are looking to address the null values . This is a special case with nulls becasue every data point is a student. To use df.dropna() will drop a student who didnt have a voice and we don't want to take that away from a student.

In [7]:
# first I want to see how many missing values there are in the dataset and set it to output it by % of columns
(df.isna().sum() / len(df) )* 100 

gender                  0.000000
ethnicgroup             6.005026
parenteduc              6.021344
lunchtype               0.000000
testprep                5.972390
parentmaritalstatus     3.883685
practicesport           2.059332
isfirstchild            2.950295
nrsiblings              5.130381
transportmeans         10.228126
wklystudyhours          3.116739
mathscore               0.000000
readingscore            0.000000
writingscore            0.000000
dtype: float64

    Addressing the first null, I'm gonna drop the column ethnicgroup. I did this to prevent any potential biases or unfair labeling based on ethnicity, and to ensure that the analysis is focused solely on the other factors that may be affecting educational performance. It is important to note that removing a variable like ethnicity from the analysis does not mean that it is not an important factor, but rather that in this specific analysis, we are choosing to focus on other variables.

In [8]:
df.drop(columns='ethnicgroup', inplace=True)

In [9]:
# first I want to see how many missing values there are in the dataset and set it to output it by % of columns
(df.isna().sum() / len(df) )* 100 

gender                  0.000000
parenteduc              6.021344
lunchtype               0.000000
testprep                5.972390
parentmaritalstatus     3.883685
practicesport           2.059332
isfirstchild            2.950295
nrsiblings              5.130381
transportmeans         10.228126
wklystudyhours          3.116739
mathscore               0.000000
readingscore            0.000000
writingscore            0.000000
dtype: float64

    We are going to impute the nulls until they are able to be update. Since all the nulls are less than 11 % of the columns in the dataset I feel comfortable enough to impute them.

In [10]:
has_null =[]

for cols in df.columns:
    if df[cols].isna().sum() > 0:
        has_null.append(cols)
    else:
        pass

In [11]:
# Here we trust but verify our code, and ensure that the null columns were added to the list
has_null

['parenteduc',
 'testprep',
 'parentmaritalstatus',
 'practicesport',
 'isfirstchild',
 'nrsiblings',
 'transportmeans',
 'wklystudyhours']

In [15]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')

# Fill the missing values in each column
for col in df.columns:
        if df[col].isna().sum() > 0:
            df[col] = imputer.fit_transform(df[col].values.reshape(-1, 1))

ValueError: 2

In [None]:
(df.isna().sum() / len(df) )* 100 

In [None]:
df.testprep.value_counts()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df['lunchtype'].value_counts()

In [None]:
df.columns = ['is_male', 'parent_educ', 'free_reduced_lunch', 'test_prep_completed', 'parent_marital_status',
       'practicesport', 'is_first_child', 'nrsiblings', 'rides_bus',
       'wkly_study_hours', 'math_score', 'reading_score', 'writing_score']

In [None]:
value_change = {
    'female' : 0,
    'male' : 1,
    'no' : 0,
    'yes': 1,
    'school_bus': 1,
    'private' : 0,
    'none' : 0,
    'completed' : 1,
    'sometimes' : 1,
    'regularly' : 1,
    'never' : 0,
    'free/reduced' : 1,
    'standard' : 0
}

In [None]:
df.replace(to_replace=value_change,inplace=True)

In [None]:
# we are going to engineer the final score.
scores = ['writing_score' ,'reading_score', 'math_score']
df['final_score'] =round( df[scores].mean(axis=1),2)

In [None]:
df.info()

In [None]:
object_columns = list()
for cols in df.columns:
    if df[cols].dtype == 'O':
        object_columns.append(cols)
    

In [None]:
df = pd.get_dummies(df, columns=object_columns, drop_first= True)

In [None]:
df.columns

In [None]:
df.head()

In [None]:
def split(df):
    '''
    This function splits a dataframe into 
    train, validate, and test in order to explore the data and to create and validate models. 
    It takes in a dataframe and contains an integer for setting a seed for replication. 
    Test is 20% of the original dataset. The remaining 80% of the dataset is 
    divided between valiidate and train, with validate being .30*.80= 24% of 
    the original dataset, and train being .70*.80= 56% of the original dataset. 
    The function returns, train, validate and test dataframes. 
    '''
    train, test = train_test_split(df, test_size = .2, random_state=123)   
    train, validate = train_test_split(train, test_size=.3, random_state=123)
    
    return train, validate, test

In [None]:
from sklearn.impute import SimpleImputer

def prepare_edu():
    df = a.acquire_edu_data()
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    df.drop(columns=[c for c in df.columns if 'unnamed' in c],inplace=True)
    df.drop(columns='ethnicgroup', inplace=True)
    df.drop(columns=[c for c in df.columns if 'unnamed' in c],inplace=True)
    has_null =[]

    for cols in df.columns:
        if df[cols].isna().sum() > 0:
            has_null.append(cols)
        else:
            pass

    imputer = SimpleImputer(strategy='most_frequent')

    # Fill the missing values in each column
    for col in df.columns:
        if df[col].isna().sum() > 0:
            df[col] = imputer.fit_transform(df[col].values.reshape(-1, 1))


    df.columns = ['is_male', 'parent_educ', 'free_reduced_lunch', 'test_prep_completed', 'parent_marital_status',
           'practicesport', 'is_first_child', 'nrsiblings', 'rides_bus',
           'wkly_study_hours', 'math_score', 'reading_score', 'writing_score']

    value_change = {
    'female' : 0,
    'male' : 1,
    'no' : 0,
    'yes': 1,
    'school_bus': 1,
    'private' : 0,
    'none' : 0,
    'completed' : 1,
    'sometimes' : 1,
    'regularly' : 1,
    'never' : 0,
    'free/reduced' : 1,
    'standard' : 0
    }
    

    df.replace(to_replace=value_change,inplace=True)

    scores = ['writing_score' ,'reading_score', 'math_score']
    df['final_score'] =round( df[scores].mean(axis=1),2)


    object_columns = list()
    for cols in df.columns:
        if df[cols].dtype == 'O':
            object_columns.append(cols)
    df = pd.get_dummies(df, columns=object_columns, drop_first= True)
    df.drop_duplicates(inplace=True)
    return df


In [None]:
df =  prepare_edu()

In [None]:
df.columns

In [None]:
df.duplicated().sum()