In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load the raw dataset.
# The path is built with os.path.join rather than a backslash literal:
# '\S' is an invalid escape sequence (SyntaxWarning on recent Python
# versions) and a backslash-separated path only resolves on Windows.
data = pd.read_csv(os.path.join('data', 'StudentsPerformance.csv'))
print(data.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
data.info()
# Last expression of the cell: number of missing values per column.
data.isnull().sum()


   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtyp

  data = pd.read_csv('data\StudentsPerformance.csv')


gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [16]:
# processing outliers
def load_and_clean_data(data):
    """Cap outliers in the score columns with the IQR rule and save the result.

    For each of 'math score', 'reading score' and 'writing score', values
    below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR are replaced by the nearest
    bound. Rows are kept, not dropped, so the shape never changes. The
    cleaned table is written to 'data/Cleaned StudentsPerformance.csv'
    (the folder is created when missing) and is also returned.

    Despite the name, nothing is loaded here: the caller passes the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the three score columns.

    Returns
    -------
    pandas.DataFrame
        A new frame whose score columns are capped and stored as float64.
        The frame passed in is left unmodified.
    """
    numeric_columns = ['math score', 'reading score', 'writing score']

    # Work on a copy so the caller's raw frame stays intact and the function
    # gives the same result no matter how many times it is re-run.
    cleaned = data.copy()

    for column in numeric_columns:
        q1 = cleaned[column].quantile(0.25)
        q3 = cleaned[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Cap (winsorize) instead of removing. The bounds are generally
        # fractional, so cast to float64 first to get one consistent dtype
        # whether or not any value in the column is actually capped.
        cleaned[column] = (
            cleaned[column].astype('float64').clip(lower=lower_bound, upper=upper_bound)
        )
    print('\n Using way of IQR to process outliers')
    print(cleaned.shape)

    data_save_path = 'data'
    file_path = os.path.join(data_save_path, 'Cleaned StudentsPerformance.csv')

    if not os.path.exists(data_save_path):
        os.makedirs(data_save_path)
        print(f'Folder {data_save_path} created')
    else:
        print(f'Folder {data_save_path} already exists')

    cleaned.to_csv(file_path, index=False)

    return cleaned

# Run the cleaning step on `data`, the frame read from the CSV in the first
# cell; that cell has to be executed before this one.
if __name__ == "__main__":
    cleaned_data = load_and_clean_data(data)



 Using way of IQR to process outliers
(1000, 8)
Folder data already exists


In [30]:
def compute_descriptive_stats(data):
    """Build a descriptive-statistics table for the numeric columns of ``data``.

    The table has one row per numeric column and the columns produced by
    ``DataFrame.describe()`` (count, mean, std, min, 25%, 50%, 75%, max) plus
    'variance', 'mode' and 'IQR'. It is printed, written to
    'data/descriptive_statistics_table.csv' (the folder is created when
    missing) and returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        The statistics table, indexed by column name.

    Raises
    ------
    ValueError
        Raised by ``describe()`` when ``data`` has no numeric column.
    """
    # choose numeric columns: every numeric dtype, so integer score columns
    # are summarised too instead of being silently skipped
    numeric_df = data.select_dtypes(include='number')

    # using pandas describe() method to compute descriptive statistics
    desc_stats = numeric_df.describe()

    # extra statistics that describe() does not provide
    variance = numeric_df.var().rename('variance')
    # A column may have several modes; mode() lists them in ascending order
    # and the first row is kept. With zero rows there is no mode at all, so
    # fall back to NaN instead of failing on .iloc[0].
    modes = numeric_df.mode()
    if len(modes):
        mode = modes.iloc[0].rename('mode')
    else:
        mode = pd.Series(float('nan'), index=numeric_df.columns, name='mode')

    # add new statistics in result table
    stats_df = pd.concat([desc_stats, variance.to_frame().T, mode.to_frame().T])

    # transpose: one row per column, statistics as columns (easier to read)
    stats_df = stats_df.transpose()

    # add IQR column
    stats_df['IQR'] = stats_df['75%'] - stats_df['25%']

    # Build the path with os.path.join: a 'data\d...' literal contains the
    # invalid escape sequence '\d' and only resolves on Windows.
    output_dir = 'data'
    os.makedirs(output_dir, exist_ok=True)
    stats_df.to_csv(os.path.join(output_dir, 'descriptive_statistics_table.csv'))
    print(stats_df)

    return stats_df

# Re-read the cleaned table from disk and summarise it.
if __name__ == '__main__':
    # os.path.join instead of a backslash literal: '\C' is an invalid escape
    # sequence (SyntaxWarning) and the literal only resolves on Windows.
    cleaned_data = pd.read_csv(os.path.join('data', 'Cleaned StudentsPerformance.csv'))
    stats_tables = compute_descriptive_stats(cleaned_data)

                count       mean        std     min    25%   50%   75%    max  \
math score     1000.0  66.165000  14.922414  27.000  57.00  66.0  77.0  100.0   
reading score  1000.0  69.201000  14.503481  29.000  59.00  70.0  79.0  100.0   
writing score  1000.0  68.094375  15.068368  25.875  57.75  69.0  79.0  100.0   

                 variance  mode    IQR  
math score     222.678453  65.0  20.00  
reading score  210.350950  72.0  20.00  
writing score  227.055727  74.0  21.25  


  stats_df.to_csv('data\descriptive_statistics_table.csv')
  cleaned_data = pd.read_csv('data\Cleaned StudentsPerformance.csv')


Short Report: Key Insights from Visualizations
1. Math Score by Gender (box_math_score_by_gender.png)

Key Insight: The median math scores for males and females are very close, but the distribution of scores among males is more spread out. This means there is greater variability in math performance among male students, with both very high and very low scorers, while female students' scores are more concentrated.

2. Math Score by Test Preparation Course (box_math_score_by_test_preparation_course.png)

Key Insight: Students who completed the test preparation course show a significantly higher median math score and an overall better score distribution than those who did not. This suggests a strong association between taking the preparation course and improved math performance.

3. Reading Score by Gender (box_reading_score_by_gender.png)

Key Insight: Females show a higher median reading score than males, and their interquartile box (25th to 75th percentile) sits higher overall. This is a noticeable difference, indicating that, in this dataset, female students generally outperform males in reading.

4. Reading Score by Lunch (box_reading_score_by_lunch.png)

Key Insight: Students with a standard lunch have significantly higher reading scores than those with free/reduced lunch. This may imply that socioeconomic status is an important factor influencing academic performance.

5. Reading Score by Parental Level of Education (box_reading_score_by_parental_level_of_education.png)

Key Insight: A positive correlation is observed between parental education level and students' reading scores. Students whose parents have higher educational qualifications (e.g., bachelor's or master's degree) typically have higher median reading scores. This indicates that family educational background has a positive impact on student achievement.

6. Reading Score by Race/Ethnicity (box_reading_score_by_race_ethnicity.png)

Key Insight: Observable differences in reading scores exist among different racial/ethnic groups. The median score of certain groups (e.g., group E) is significantly higher than that of others (e.g., group A). This highlights an imbalance in performance distribution across demographic groups.

7. Reading Score by Test Preparation Course (box_reading_score_by_test_preparation_course.png)

Key Insight: Similar to the trend in math scores, students who completed the test preparation course also show a significant advantage in reading. This further validates the effectiveness of the preparatory course.

8. Distribution of Writing Score (hist_writing_score.png)

Key Insight: The distribution of writing scores is approximately normal (bell-shaped), with the majority of students scoring between 60 and 80 points. There are fewer students at the very high and very low ends of the spectrum.

9. Writing Score by Gender (box_writing_score_by_gender.png)

Key Insight: Consistent with the finding for reading scores, females also outperform males overall in writing, showing a higher median and a higher upper quartile.

10. Writing Score by Test Preparation Course (box_writing_score_by_test_preparation_course.png)

Key Insight: The test preparation course also has a positive effect on writing scores. The score distribution of students who completed the course is shifted to the right (toward higher scores), indicating the course's benefit across all three subjects.

11. Correlation Heatmap (correlation_heatmap.png)

Key Insight: There is a very strong positive correlation (coefficients close to 1) between scores in math, reading, and writing, with the correlation between reading and writing being the highest. This means a student who performs well in one subject is highly likely to perform well in the others.

Overall Summary
This analysis reveals several key factors influencing student exam performance:

Test Preparation Course: This is the most consistent and effective factor in improving scores across all subjects.

Gender Differences: Clear differences exist; females show an advantage in reading and writing, while males show greater variability in math scores.

Socioeconomic Factors: Factors like lunch type and parental education level show a positive correlation with performance, suggesting that students with better resources tend to perform better.

Subject Correlation: The scores of the three core subjects (math, reading, writing) are highly correlated, suggesting that a student's academic performance tends to be consistent across subjects.

These insights can help educators develop targeted intervention strategies, such as promoting test preparation courses or providing additional support for specific student groups.

In [29]:
def _plot_file_stem(label):
    """Make a column name safe for a file name: spaces and '/' become '_'."""
    return label.replace(' ', '_').replace('/', '_')


def create_visualization(df):
    """Render the exploratory plots for the student-score table into 'plots/'.

    Files written (PNG, one figure each):
      * hist_<score>.png          - histogram with KDE for every score column
      * box_<score>.png           - boxplot for every score column
      * scatter_*.png             - math/reading, writing/reading and
                                    math/reading coloured by gender
      * box_<score>_by_<cat>.png  - score boxplots grouped by each
                                    categorical column
      * correlation_heatmap.png   - correlations between all numeric columns

    In file names, spaces and slashes of column names are replaced by
    underscores (e.g. 'box_math_score_by_race_ethnicity.png').

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns 'math score', 'reading score',
        'writing score', 'gender', 'race/ethnicity',
        'parental level of education', 'lunch' and
        'test preparation course'.

    Returns
    -------
    None
    """
    # setting style
    sns.set_style('whitegrid')
    # create plots folder (no error when it already exists)
    os.makedirs('plots', exist_ok=True)

    numeric_columns = ['math score', 'reading score', 'writing score']

    # 1. single variable distribution: histogram
    for column in numeric_columns:
        # plt.figure (lower case) opens a new current figure with this size;
        # plt.Figure would only construct a detached object that nothing
        # draws on, so the requested size would be ignored.
        plt.figure(figsize=(10, 6))
        sns.histplot(data=df, x=column, kde=True)
        plt.title(f'Distribution of {column.title()}')
        plt.savefig(f'plots/hist_{_plot_file_stem(column)}.png')
        plt.close()

    # 2. single variable distribution: boxplot (to inspect outliers),
    #    one figure per score column
    for column in numeric_columns:
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=df, x=column)
        plt.title(f'Boxplot of {column.title()}')
        plt.savefig(f'plots/box_{_plot_file_stem(column)}.png')
        plt.close()

    # 3. multi variable relationships: scatter plots
    plt.figure(figsize=(8, 5))
    sns.scatterplot(data=df, x='math score', y='reading score')
    plt.title('Math Score vs Reading Score')
    plt.savefig('plots/scatter_math_reading.png')
    plt.close()

    plt.figure(figsize=(8, 5))
    sns.scatterplot(data=df, x='writing score', y='reading score')
    plt.title('Writing Score vs Reading Score')
    plt.savefig('plots/scatter_writing_reading.png')
    plt.close()

    plt.figure(figsize=(14, 10))
    sns.scatterplot(data=df, x='math score', y='reading score', hue='gender')
    plt.title('Math score vs. Reading score (by Gender)')
    plt.savefig('plots/scatter_math_reading_gender.png')
    plt.close()

    # 4. category comparison: grouped boxplots
    categorical_cols = ['gender', 'race/ethnicity', 'parental level of education',
                        'lunch', 'test preparation course']
    for cat_col in categorical_cols:
        for column in numeric_columns:
            plt.figure(figsize=(14, 12))
            sns.boxplot(data=df, x=cat_col, y=column)
            plt.title(f'{column.title()} by {cat_col.title()}')
            plt.xticks(rotation=45)  # keep the x-axis labels from overlapping
            plt.tight_layout()  # adjust the layout automatically
            # Replace spaces and slashes in BOTH name parts. The stems are
            # computed outside the f-string because reusing the same quote
            # character inside a replacement field needs Python 3.12+.
            score_stem = _plot_file_stem(column)
            group_stem = _plot_file_stem(cat_col)
            filename = f'plots/box_{score_stem}_by_{group_stem}.png'
            plt.savefig(filename)
            plt.close()

    # 5. correlation heatmap
    plt.figure(figsize=(14, 12))
    numeric_df = df.select_dtypes(include=[np.number])
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
    plt.title('correlation heatmap')
    plt.savefig('plots/correlation_heatmap.png')
    plt.close()

    print('all plots are saved')

# Re-read the cleaned table from disk and generate all plots.
# The guard compares with ==; a substring test (`in`) would also accept
# names such as 'main' or the empty string.
if __name__ == '__main__':
    # os.path.join instead of a backslash literal: '\C' is an invalid escape
    # sequence (SyntaxWarning) and the literal only resolves on Windows.
    cleaned_data = pd.read_csv(os.path.join('data', 'Cleaned StudentsPerformance.csv'))
    create_visualization(cleaned_data)



  cleaned_data = pd.read_csv('data\Cleaned StudentsPerformance.csv')


all plots are saved
