In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('datasets/StudentsPerformance_modified.csv')

In [3]:
print(df.head(10).to_string())

   gender race/ethnicity parental level of education         lunch test preparation course math score  reading score  writing score
0  female        group B           bachelor's degree      standard                    none         72           72.0           74.0
1  female        group C                some college      standard               completed         69           90.0           88.0
2  female        group B             master's degree      standard                    none         90           95.0           93.0
3    male        group A          associate's degree  free/reduced                    none         47           57.0           44.0
4    male        group C                some college      standard                    none         76           78.0           75.0
5  female        group B          associate's degree      standard                    none         71           83.0           78.0
6  female        group B                some college      standard          

In [4]:
print("(Rows, Cols) :", df.shape)

(Rows, Cols) : (1000, 8)


In [5]:
print("Total cells :", df.size)

Total cells : 8000


In [6]:
print(df.dtypes)

gender                          object
race/ethnicity                  object
parental level of education     object
lunch                           object
test preparation course         object
math score                      object
reading score                  float64
writing score                  float64
dtype: object


In [7]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race/ethnicity               1000 non-null   object 
 2   parental level of education  1000 non-null   object 
 3   lunch                        1000 non-null   object 
 4   test preparation course      1000 non-null   object 
 5   math score                   992 non-null    object 
 6   reading score                994 non-null    float64
 7   writing score                991 non-null    float64
dtypes: float64(2), object(6)
memory usage: 62.6+ KB
None


In [8]:
# Provides statistics for numerical columns
print(df.describe())

       reading score  writing score
count     994.000000     991.000000
mean       68.008048      69.487386
std        16.602270      29.563757
min         3.000000      10.000000
25%        58.000000      57.000000
50%        69.500000      69.000000
75%        79.000000      79.000000
max       100.000000     567.000000


In [9]:
# Data type conversion

# converts to numeric, if any cell has non-numeric then fills it with NaN, then fill or drop the NaN cells
df['math score'] = pd.to_numeric(df['math score'], errors="coerce")
df.dropna(subset=["math score"], inplace=True)

# convert object to categorical 
df['gender'] = df['gender'].astype('category')
df['race/ethnicity'] = df['race/ethnicity'].astype('category')
df['lunch'] = df['lunch'].astype('category')

In [10]:
# Returns a same dataframe, but with true/false if null values present
df.isnull()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False


In [11]:
# Count of null values in each column
df.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  6
writing score                  8
dtype: int64

In [12]:
# Removing null values

# 1] Remove the entire row
df.dropna(inplace=True)

# 2] Replace with mean, median, mode
reading_score_mean = df['reading score'].mean()
writing_score_mean = df['writing score'].mean()
df['reading score'].fillna(reading_score_mean, inplace=True)
df['writing score'].fillna(writing_score_mean, inplace=True)

# 3] Using forward fill or backward fill
df.ffill(inplace=True)
df.bfill(inplace=True)

In [13]:
# Converting categorical variables to quantitative variables

# 1] One hot encoding - Creates binary columns for each category in a categorical variable, with 1s indicating the presence of a category and 0s indicating the absence.
df_encoded = pd.get_dummies(df, columns = ['gender', 'lunch', 'race/ethnicity'])
df_encoded.info()
df.head(5).to_string()

# 2] Label encoding - Label encoding assigns a unique integer to each category in a categorical variable.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df.head(5).to_string()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 943 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   parental level of education  943 non-null    object 
 1   test preparation course      943 non-null    object 
 2   math score                   943 non-null    float64
 3   reading score                943 non-null    float64
 4   writing score                943 non-null    float64
 5   gender_female                943 non-null    uint8  
 6   gender_male                  943 non-null    uint8  
 7   lunch_free/reduced           943 non-null    uint8  
 8   lunch_standard               943 non-null    uint8  
 9   race/ethnicity_group A       943 non-null    uint8  
 10  race/ethnicity_group B       943 non-null    uint8  
 11  race/ethnicity_group C       943 non-null    uint8  
 12  race/ethnicity_group D       943 non-null    uint8  
 13  race/ethnicity_group

"   gender race/ethnicity parental level of education         lunch test preparation course  math score  reading score  writing score\n0       0        group B           bachelor's degree      standard                    none        72.0           72.0           74.0\n1       0        group C                some college      standard               completed        69.0           90.0           88.0\n2       0        group B             master's degree      standard                    none        90.0           95.0           93.0\n3       1        group A          associate's degree  free/reduced                    none        47.0           57.0           44.0\n4       1        group C                some college      standard                    none        76.0           78.0           75.0"

In [14]:
# Normalization

columns_to_normalize1 = ['math score']

# 1] MinMax normalization - It rescales the values of a numerical variable to a fixed range, typically between 0 and 1.
for column in columns_to_normalize1:
    df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())
df.head(5).to_string()

columns_to_normalize2 = ['reading score']
# 2] Zscore normalization - It transforms the values of a numerical variable to have a mean of 0 and a standard deviation of 1.
for column in columns_to_normalize2:
    df[column] = (df[column] - df[column].mean()) / df[column].std()
df.head(5).to_string()

"   gender race/ethnicity parental level of education         lunch test preparation course  math score  reading score  writing score\n0       0        group B           bachelor's degree      standard                    none    0.380615       0.219457           74.0\n1       0        group C                some college      standard               completed    0.373522       1.344855           88.0\n2       0        group B             master's degree      standard                    none    0.423168       1.657465           93.0\n3       1        group A          associate's degree  free/reduced                    none    0.321513      -0.718374           44.0\n4       1        group C                some college      standard                    none    0.390071       0.594590           75.0"