In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import requests
import json
import io

In [2]:
# load the data
math_df = pd.read_csv('student-mat.csv')
por_df = pd.read_csv('student-por.csv')
math_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [3]:
#### Data Clean for math_df and por_df

## Removing rows with missing values
math_df.dropna(inplace=True)
por_df.dropna(inplace=True)

## Imputing missing 'absensces' with the mean weight
math_df['absences'].fillna(math_df['absences'].mean(), inplace=True)
por_df['absences'].fillna(por_df['absences'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  math_df['absences'].fillna(math_df['absences'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  por_df['absences'].fillna(por_df['absences'].mean(), inplace=True)


In [4]:
# Check the shape of the data
math_df.shape, por_df.shape

((395, 33), (649, 33))

In [9]:
# Check if there is still any null values
print("Math dataset:")
print(math_df.isnull().sum())
print()
print("Portuguese dataset:")
print(por_df.isnull().sum())

Math dataset:
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

Portuguese dataset:
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc    

In [None]:
# Rename columns to be more descriptive
math_df.rename(columns={'Medu': 'mother_edu'}, inplace=True)
por_df.rename(columns={'Medu': 'mother_edu'}, inplace=True)

math_df.rename(columns={'Fedu': 'father_edu'}, inplace=True)
por_df.rename(columns={'Fedu': 'father_edu'}, inplace=True)

math_df.rename(columns={'Mjob': 'mother_job'}, inplace=True)
por_df.rename(columns={'Mjob': 'mother_job'}, inplace=True)

math_df.rename(columns={'Fjob': 'father_job'}, inplace=True)
por_df.rename(columns={'Fjob': 'father_job'}, inplace=True)

math_df.rename(columns={'Pstatus': 'parent_status'}, inplace=True)
por_df.rename(columns={'Pstatus': 'parent_status'}, inplace=True)

math_df.rename(columns={'famsize': 'family_size'}, inplace=True)
por_df.rename(columns={'famsize': 'family_size'}, inplace=True)

math_df.rename(columns={'famsup': 'family_support'}, inplace=True)
por_df.rename(columns={'famsup': 'family_support'}, inplace=True)

math_df.rename(columns={'famrel': 'family_relationship'}, inplace=True)
por_df.rename(columns={'famrel': 'family_relationship'}, inplace=True)

math_df.rename(columns={'schoolsup': 'edu_support'}, inplace=True)
por_df.rename(columns={'schoolsup': 'edu_support'}, inplace=True)

math_df.rename(columns={'Dalc': 'workday_alc'}, inplace=True)
por_df.rename(columns={'Dalc': 'workday_alc'}, inplace=True)

math_df.rename(columns={'Walc': 'weekend_alc'}, inplace=True)
por_df.rename(columns={'Walc': 'weekend_alc'}, inplace=True)

In [None]:
math_df

In [None]:
def standardize_school(school):
    if school == 'GP':
        return 'Gabriel Pereira'
    else:
        return 'Mousinho da Silveira'

def standardize_address(address):
    if address == 'U':
        return 'urban'
    else:
        return 'rural'
    
def standardize_parent_status(status):
    if status == 'A':
        return 'apart'
    else:
        return 'together'

In [None]:
math_df['school'] = math_df['school'].apply(standardize_school)
por_df['school'] = por_df['school'].apply(standardize_school)

math_df['address'] = math_df['address'].apply(standardize_address)
por_df['address'] = por_df['address'].apply(standardize_address)

math_df['parent_status'] = math_df['parent_status'].apply(standardize_parent_status)
por_df['parent_status'] = por_df['parent_status'].apply(standardize_parent_status)

In [None]:
# Step 1: Missing Values - Checking for missing values
print("Missing values in Math dataset:\n", math_df.isnull().sum())
print("\nMissing values in Portuguese dataset:\n", por_df.isnull().sum())

In [None]:
# Step 2: Duplicate Entries - Checking for duplicates
math_duplicates = math_df.duplicated().sum()
por_duplicates = por_df.duplicated().sum()
print("Number of duplicate entries in Math dataset: ", math_duplicates)
print("Number of duplicate entries in Portuguese dataset: ", por_duplicates)

In [None]:
# Convert to appropriate data types if necessary (Step 3) - a quick check
math_dtypes = math_df.dtypes
por_dtypes = por_df.dtypes
print("Data types in Math dataset:\n", math_dtypes)
print("\nData types in Portuguese dataset:\n", por_dtypes)

In [None]:
# Step 4: Outliers - Checking for outliers
math_df.describe()

In [None]:
por_df.describe()

In [None]:
math_df

In [None]:
math_df['age'].value_counts()

In [None]:
por_df['age'].value_counts()

In [None]:
math_df['reason'].value_counts()

In [None]:
# math_df = math_df[['sex', 'G1', 'G2', 'G3', 'health', 'absences', 'family_relationship', 'workday_alc', 'weekend_alc', 'studytime', 'freetime', 'goout']]
# por_df = por_df[['sex', 'G1', 'G2', 'G3', 'health', 'absences', 'family_relationship', 'workday_alc', 'weekend_alc', 'studytime', 'freetime', 'goout']]

In [None]:
# math_df