In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import requests
import json
import io

In [2]:
# Read the mathematics and Portuguese course datasets (in that order) from
# CSV files in the working directory, then preview the mathematics frame.
math_df, por_df = (
    pd.read_csv(csv_path) for csv_path in ('student-mat.csv', 'student-por.csv')
)
math_df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [3]:
# Rename columns to be more descriptive.
# One shared mapping keeps both datasets in sync and replaces the repeated
# per-column rename calls. `rename` ignores mapping keys that are not present
# in the frame (default errors='ignore'), so re-running this cell is harmless.
DESCRIPTIVE_COLUMN_NAMES = {
    'Medu': 'mother_edu',
    'Fedu': 'father_edu',
    'Mjob': 'mother_job',
    'Fjob': 'father_job',
    'Pstatus': 'parent_status',
    'famsize': 'family_size',
    'famsup': 'family_support',
    'famrel': 'family_relationship',
    'schoolsup': 'edu_support',
    'Dalc': 'workday_alc',
    'Walc': 'weekend_alc',
}

math_df = math_df.rename(columns=DESCRIPTIVE_COLUMN_NAMES)
por_df = por_df.rename(columns=DESCRIPTIVE_COLUMN_NAMES)

In [4]:
# Display the math dataset to confirm the descriptive column names took effect
math_df

Unnamed: 0,school,sex,age,address,family_size,parent_status,mother_edu,father_edu,mother_job,father_job,...,family_relationship,freetime,goout,workday_alc,weekend_alc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [5]:
def standardize_school(school):
    """Expand a school code into the school's full name.

    Args:
        school: School code as stored in the dataset ('GP' or 'MS').

    Returns:
        'Gabriel Pereira' for 'GP' and 'Mousinho da Silveira' for 'MS'.
        Any other value, including an already-expanded name, is returned
        unchanged.
    """
    full_names = {'GP': 'Gabriel Pereira', 'MS': 'Mousinho da Silveira'}
    # Unrecognised values pass through instead of falling into a catch-all
    # branch, so applying this twice (e.g. re-running the cleaning cell) is a
    # no-op rather than relabelling every row as 'Mousinho da Silveira'.
    return full_names.get(school, school)

def standardize_address(address):
    """Expand a home-address code into a readable label.

    Args:
        address: Address code as stored in the dataset ('U' or 'R').

    Returns:
        'urban' for 'U' and 'rural' for 'R'. Any other value, including an
        already-expanded label, is returned unchanged.
    """
    labels = {'U': 'urban', 'R': 'rural'}
    # Unrecognised values pass through instead of falling into a catch-all
    # branch, so applying this twice (e.g. re-running the cleaning cell) is a
    # no-op rather than relabelling every row as 'rural'.
    return labels.get(address, address)
    
def standardize_parent_status(status):
    """Expand a parents' cohabitation code into a readable label.

    Args:
        status: Cohabitation code as stored in the dataset ('A' or 'T').

    Returns:
        'apart' for 'A' and 'together' for 'T'. Any other value, including an
        already-expanded label, is returned unchanged.
    """
    labels = {'A': 'apart', 'T': 'together'}
    # Unrecognised values pass through instead of falling into a catch-all
    # branch, so applying this twice (e.g. re-running the cleaning cell) is a
    # no-op rather than relabelling every row as 'together'.
    return labels.get(status, status)

In [6]:
# Replace the coded values of each categorical column with readable labels,
# applying the same standardizer to the math and Portuguese datasets.
for column_name, standardizer in (
    ('school', standardize_school),
    ('address', standardize_address),
    ('parent_status', standardize_parent_status),
):
    math_df[column_name] = math_df[column_name].apply(standardizer)
    por_df[column_name] = por_df[column_name].apply(standardizer)

In [7]:
# Step 1: Missing Values - count the null entries in every column of each dataset
math_missing, por_missing = (
    frame.isnull().sum() for frame in (math_df, por_df)
)
print("Missing values in Math dataset:\n", math_missing)
print("\nMissing values in Portuguese dataset:\n", por_missing)

Missing values in Math dataset:
 school                 0
sex                    0
age                    0
address                0
family_size            0
parent_status          0
mother_edu             0
father_edu             0
mother_job             0
father_job             0
reason                 0
guardian               0
traveltime             0
studytime              0
failures               0
edu_support            0
family_support         0
paid                   0
activities             0
nursery                0
higher                 0
internet               0
romantic               0
family_relationship    0
freetime               0
goout                  0
workday_alc            0
weekend_alc            0
health                 0
absences               0
G1                     0
G2                     0
G3                     0
dtype: int64

Missing values in Portuguese dataset:
 school                 0
sex                    0
age                    0
address       

In [8]:
# Step 2: Duplicate Entries - count rows that exactly repeat an earlier row
math_duplicates, por_duplicates = (
    frame.duplicated().sum() for frame in (math_df, por_df)
)
print("Number of duplicate entries in Math dataset: ", math_duplicates)
print("Number of duplicate entries in Portuguese dataset: ", por_duplicates)

Number of duplicate entries in Math dataset:  0
Number of duplicate entries in Portuguese dataset:  0


In [9]:
# Step 3: Data types - inspect the dtype pandas inferred for every column.
# This is a quick check only; no conversion is performed in this cell.
math_dtypes, por_dtypes = math_df.dtypes, por_df.dtypes
print("Data types in Math dataset:\n", math_dtypes)
print("\nData types in Portuguese dataset:\n", por_dtypes)

Data types in Math dataset:
 school                 object
sex                    object
age                     int64
address                object
family_size            object
parent_status          object
mother_edu              int64
father_edu              int64
mother_job             object
father_job             object
reason                 object
guardian               object
traveltime              int64
studytime               int64
failures                int64
edu_support            object
family_support         object
paid                   object
activities             object
nursery                object
higher                 object
internet               object
romantic               object
family_relationship     int64
freetime                int64
goout                   int64
workday_alc             int64
weekend_alc             int64
health                  int64
absences                int64
G1                      int64
G2                      int64
G3         

In [10]:
# Step 4: Outliers - Checking for outliers
# Summary statistics of the numeric columns serve as a first screen: a max far
# above the 75% quartile (as in `absences`) marks a column worth a closer look.
math_df.describe()

Unnamed: 0,age,mother_edu,father_edu,traveltime,studytime,failures,family_relationship,freetime,goout,workday_alc,weekend_alc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [11]:
# Same summary statistics for the Portuguese dataset, for comparison with math
por_df.describe()

Unnamed: 0,age,mother_edu,father_edu,traveltime,studytime,failures,family_relationship,freetime,goout,workday_alc,weekend_alc,health,absences,G1,G2,G3
count,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0
mean,16.744222,2.514638,2.306626,1.568567,1.930663,0.22188,3.930663,3.180277,3.1849,1.502311,2.280431,3.53621,3.659476,11.399076,11.570108,11.906009
std,1.218138,1.134552,1.099931,0.74866,0.82951,0.593235,0.955717,1.051093,1.175766,0.924834,1.28438,1.446259,4.640759,2.745265,2.913639,3.230656
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,12.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,32.0,19.0,19.0,19.0


In [12]:
# Display the math dataset again to check the expanded school, address and
# parent_status values
math_df

Unnamed: 0,school,sex,age,address,family_size,parent_status,mother_edu,father_edu,mother_job,father_job,...,family_relationship,freetime,goout,workday_alc,weekend_alc,health,absences,G1,G2,G3
0,Gabriel Pereira,F,18,urban,GT3,apart,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,Gabriel Pereira,F,17,urban,GT3,together,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,Gabriel Pereira,F,15,urban,LE3,together,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,Gabriel Pereira,F,15,urban,GT3,together,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,Gabriel Pereira,F,16,urban,GT3,together,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,Mousinho da Silveira,M,20,urban,LE3,apart,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,Mousinho da Silveira,M,17,urban,LE3,together,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,Mousinho da Silveira,M,21,rural,GT3,together,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,Mousinho da Silveira,M,18,rural,LE3,together,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10
