In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [121]:
# Load the raw student-performance dataset; the relative path is resolved
# against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='StudentsPerformance.csv')

In [122]:
# Preview the first five rows to sanity-check the load and the column names.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [123]:
# Total marks: element-wise sum of the three subject score columns,
# stored as integers.
df['total_marks'] = (
    df['math score']
    .add(df['reading score'])
    .add(df['writing score'])
    .astype(int)
)

# CGPA on a 10-point scale: total divided by 300 (presumably three subjects
# marked out of 100 each), multiplied by 10 and rounded to two decimals.
df['cgpa'] = df['total_marks'].div(300).mul(10).round(2)

In [124]:
# Seed NumPy's global RNG so the row selection is repeatable when the
# notebook is executed top to bottom.
np.random.seed(42)

# Draw 40 distinct row labels that will receive artificial missing values.
missing_indices = np.random.choice(df.index, size=40, replace=False)

# 'total_marks' was created with .astype(int); an integer column cannot hold
# NaN. Writing NaN into it through .loc relies on silent upcasting, which
# pandas >= 2.1 deprecates (FutureWarning, scheduled to become an error).
# Casting explicitly yields the same float64 column without the implicit
# conversion.
df['total_marks'] = df['total_marks'].astype(float)

# First 20 selected rows lose their CGPA, the remaining 20 lose total marks.
df.loc[missing_indices[:20], 'cgpa'] = np.nan
df.loc[missing_indices[20:], 'total_marks'] = np.nan

In [125]:
# Draw 10 distinct row labels for artificial outliers. No seed is set in this
# cell, so the draw continues from whatever state the global NumPy RNG is in
# (seeded in the missing-value cell when run in order). Labels come from the
# full index, so they may coincide with rows blanked earlier.
outlier_indices = np.random.choice(a=df.index, replace=False, size=10)

# First half of the draw: CGPA of 15, above the 10-point scale used for cgpa.
df.loc[outlier_indices[:5], 'cgpa'] = 15.0
# Second half of the draw: total of 450, above the 300 used as the CGPA divisor.
df.loc[outlier_indices[5:], 'total_marks'] = 450


In [126]:
# Give the columns short snake_case names and relabel the three score
# columns as dbms/cns/os scores. rename returns a new frame, which is
# rebound to df.
df = df.rename(
    {
        "race/ethnicity": "race",
        "parental level of education": "parent_edu",
        "test preparation course": "test_prep_course",
        "math score": "dbms_score",
        "reading score": "cns_score",
        "writing score": "os_score",
    },
    axis="columns",
)

In [127]:
# Persist the modified frame; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="StudentsPerformance_modified.csv", index=False)

In [128]:
# Read the exported file back in and preview the first five rows to confirm
# the round trip.
dfm = pd.read_csv(filepath_or_buffer="StudentsPerformance_modified.csv")
dfm.head(n=5)

Unnamed: 0,gender,race,parent_edu,lunch,test_prep_course,dbms_score,cns_score,os_score,total_marks,cgpa
0,female,group B,bachelor's degree,standard,none,72,72,74,218.0,7.27
1,female,group C,some college,standard,completed,69,90,88,247.0,8.23
2,female,group B,master's degree,standard,none,90,95,93,278.0,9.27
3,male,group A,associate's degree,free/reduced,none,47,57,44,148.0,4.93
4,male,group C,some college,standard,none,76,78,75,229.0,7.63


In [129]:
# Inspect the column dtypes of the reloaded frame; in the output below
# total_marks and cgpa are float64 while the three score columns are int64.
dfm.dtypes

gender               object
race                 object
parent_edu           object
lunch                object
test_prep_course     object
dbms_score            int64
cns_score             int64
os_score              int64
total_marks         float64
cgpa                float64
dtype: object