# **ANALYZING THE STUDENT PERFORMANCE AND FINDING HOW VARIOUS FACTORS AFFECT IT**

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
from google.colab import files
uploaded= files.upload()

Saving StudentsPerformance.csv to StudentsPerformance.csv


In [3]:
df= pd.read_csv("StudentsPerformance.csv")

In [4]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
print(df.describe())
print(" ")
df.info()

       math score  reading score  writing score
count  1000.00000    1000.000000    1000.000000
mean     66.08900      69.169000      68.054000
std      15.16308      14.600192      15.195657
min       0.00000      17.000000      10.000000
25%      57.00000      59.000000      57.750000
50%      66.00000      70.000000      69.000000
75%      77.00000      79.000000      79.000000
max     100.00000     100.000000     100.000000
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   

In [6]:
# renaming columns

df= df.rename(columns={'race/ethnicity': 'ethnicity',
    'parental level of education': 'parentEdu',
    'test preparation course': 'testPrep',
    'math score': 'mathScore',
    'reading score': 'readingScore',
    'writing score': 'writingScore'})
df.head()

Unnamed: 0,gender,ethnicity,parentEdu,lunch,testPrep,mathScore,readingScore,writingScore
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [7]:
# score averages
mathAvg= df['mathScore'].mean()
print(mathAvg)
readingAvg= df['readingScore'].mean()
print(readingAvg)
writingAvg= df['writingScore'].mean()
writingAvg

66.089
69.169


np.float64(68.054)

In [8]:
df.groupby('gender')[['mathScore','readingScore','writingScore']].mean()

Unnamed: 0_level_0,mathScore,readingScore,writingScore
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,63.633205,72.608108,72.467181
male,68.728216,65.473029,63.311203


In [9]:
# Null Hypothesis (H₀): There is no difference in math scores between male and female students.

# Alternative Hypothesis (H₁): There is a difference in math scores between male and female students.

print("Gender vs math score")

femalemath= df[df['gender']=='female']['mathScore']
malemath= df[df['gender']=='male']['mathScore']

t_stat,p_val= stats.ttest_ind(femalemath, malemath)
print(t_stat)
print(p_val)

# Null Hypothesis (H₀): There is no difference in reading scores between male and female students.

# Alternative Hypothesis (H₁): There is a difference in reading scores between male and female students.

print("Gender vs reading score")
femalereading= df[df['gender']=='female']['readingScore']
malereading= df[df['gender']=='male']['readingScore']

t_stat_r, p_val_r= stats.ttest_ind(femalereading,malereading)
print(t_stat_r)
print(p_val_r)

# Null Hypothesis (H₀): There is no difference in writing scores between male and female students.

# Alternative Hypothesis (H₁): There is a difference in writing scores between male and female students.

print("Gender vs writng score")
femalewriting= df[df['gender']=='female']['writingScore']
malewriting= df[df['gender']=='male']['writingScore']

t_stat_w, p_val_w= stats.ttest_ind(femalewriting, malewriting)
print(t_stat_w)
print(p_val_w)

Gender vs math score
-5.383245869828983
9.120185549328822e-08
Gender vs reading score
7.959308005187657
4.680538743933289e-15
Gender vs writng score
9.979557910004507
2.019877706867934e-22


we reject the null hypothesis for all three cased


In [10]:
# lunch type vs math score

standard= df[df['lunch']=='standard']['mathScore']
free= df[df['lunch']=='free/reduced']['mathScore']

# t_stat,p_val=stats.ttest_ind(standard, free)
# print(t_stat)
# print(p_val)
f_stat, p_val_anova= stats.f_oneway(standard, free)
print(f_stat)
print(p_val_anova)

standard= df[df['lunch']=='standard']['readingScore']
free= df[df['lunch']=='free/reduced']['readingScore']

# t_stat,p_val=stats.ttest_ind(standard, free)
# print(t_stat)
# print(p_val)

f_stat, p_val_anova= stats.f_oneway(standard, free)
print(f_stat)
print(p_val_anova)

standard= df[df['lunch']=='standard']['writingScore']
free= df[df['lunch']=='free/reduced']['writingScore']

# t_stat,p_val=stats.ttest_ind(standard, free)
# print(t_stat)
# print(p_val)

f_stat, p_val_anova= stats.f_oneway(standard, free)
print(f_stat)
print(p_val_anova)

print(standard.mean())
print(free.mean())

140.1188415483512
2.4131955993147374e-30
55.51824248147809
2.0027966545286206e-13
64.15664289588337
3.1861895831680456e-15
70.8232558139535
63.02253521126761


In [11]:
df['parentEdu'] = df['parentEdu'].str.strip()

In [12]:
group= df['parentEdu'].unique()

In [13]:
df['parentEdu'] = df['parentEdu'].map({
    "some high school": "HS",
    "high school": "HS",
    "associate's degree": "AD",
    "some college": "College",
    "bachelor's degree": "UG",
    "master's degree": "PG"
})

In [24]:
# Create a new column for average score
df['avgScore'] = df[['mathScore', 'readingScore', 'writingScore']].mean(axis=1)


In [15]:
edu_math= [df[df['parentEdu']==grp]['avgScore'] for grp in group]
f_stat, p_val_anova= stats.f_oneway(*edu_math)
print(f_stat)
print(p_val_anova)
print(df.groupby(['parentEdu'])['avgScore'].mean().sort_values(ascending=False))

nan
nan
parentEdu
PG         73.598870
UG         71.923729
AD         69.569069
College    68.476401
HS         64.056889
Name: avgScore, dtype: float64


  f_stat, p_val_anova= stats.f_oneway(*edu_math)


In [16]:
# tukey's hsd test

from statsmodels.stats.multicomp import pairwise_tukeyhsd

tukey= pairwise_tukeyhsd(endog=df['avgScore'],groups=df['parentEdu'],alpha=0.05)
print(tukey)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1  group2 meandiff p-adj   lower   upper  reject
------------------------------------------------------
     AD College  -1.0927 0.9213 -4.6894  2.5041  False
     AD      HS  -5.5122    0.0 -8.7355 -2.2889   True
     AD      PG   4.0298 0.2788 -1.5453  9.6049  False
     AD      UG   2.3547 0.5732 -1.9817   6.691  False
College      HS  -4.4195 0.0016 -7.6248 -1.2142   True
College      PG   5.1225  0.088 -0.4423 10.6872  False
College      UG   3.4473 0.1884 -0.8757  7.7703  False
     HS      PG    9.542    0.0   4.211 14.8729   True
     HS      UG   7.8668    0.0  3.8492 11.8845   True
     PG      UG  -1.6751 0.9434 -7.7442  4.3939  False
------------------------------------------------------


There is a statistically significant difference between the average scores of students whose parents are in group1 vs group2.

Examples:  
Students whose parents have high school education (HS) score significantly lower than those with:  
Associate Degree (AD)  
College  
Postgraduate (PG)  
Undergraduate (UG)

The largest positive difference is between HS and PG (9.54 points) → kids of postgraduates do better than kids of just high school graduates.

In [17]:
df['testPrep'].unique()

array(['none', 'completed'], dtype=object)

In [18]:
none= df[df['testPrep']=='none']['avgScore']
completed= df[df['testPrep']=='completed']['avgScore']

t_stat,p_val=stats.ttest_ind(none, completed)
print(t_stat)
print(p_val)

df.groupby(df['testPrep'])['avgScore'].mean().sort_values(ascending= False)

-8.390944443482592
1.633780203592351e-16


Unnamed: 0_level_0,avgScore
testPrep,Unnamed: 1_level_1
completed,72.66946
none,65.038941


Yes, completing the test prep significantly improves performance.
The p-value < 0.05, so the result is statistically significant.
On average, students who completed the prep score ~7.6 points higher

In [19]:
from scipy.stats import chi2_contingency

In [20]:
observed= pd.crosstab(df['gender'],df['ethnicity'])

chi_stat= stats.chi2_contingency(observed)
print(chi_stat)

Chi2ContingencyResult(statistic=np.float64(9.02738626908596), pvalue=np.float64(0.06041858784847785), dof=4, expected_freq=array([[ 46.102,  98.42 , 165.242, 135.716,  72.52 ],
       [ 42.898,  91.58 , 153.758, 126.284,  67.48 ]]))


There is no statistically significant relationship between race and gender at the 5% level.

In [21]:
observed= pd.crosstab(df['gender'], df['testPrep'])

chi_stat= stats.chi2_contingency(observed)
print(chi_stat)

Chi2ContingencyResult(statistic=np.float64(0.015529201882465888), pvalue=np.float64(0.9008273880804724), dof=1, expected_freq=array([[185.444, 332.556],
       [172.556, 309.444]]))


There is no significant relationship between gender and test prep course completion.


In [22]:
observed= pd.crosstab(df['parentEdu'], df['ethnicity'])

chi_stat= stats.chi2_contingency(observed)
print(chi_stat)

Chi2ContingencyResult(statistic=np.float64(25.076982315941653), pvalue=np.float64(0.06847978165354146), dof=16, expected_freq=array([[ 19.758,  42.18 ,  70.818,  58.164,  31.08 ],
       [ 20.114,  42.94 ,  72.094,  59.212,  31.64 ],
       [ 33.375,  71.25 , 119.625,  98.25 ,  52.5  ],
       [  5.251,  11.21 ,  18.821,  15.458,   8.26 ],
       [ 10.502,  22.42 ,  37.642,  30.916,  16.52 ]]))


In [25]:
obs= pd.crosstab(df['parentEdu'], df['gender'])

chi_stat= stats.chi2_contingency(obs)
print(chi_stat)

Chi2ContingencyResult(statistic=np.float64(3.0743590235347606), pvalue=np.float64(0.5454596331580355), dof=4, expected_freq=array([[114.996, 107.004],
       [117.068, 108.932],
       [194.25 , 180.75 ],
       [ 30.562,  28.438],
       [ 61.124,  56.876]]))


1. Do male and female score differently?

    Yes, there is a significant difference in scores by gender:
    Males tend to score higher in math
    Females tend to score higher in reading and writing.
2. Does type of lunch influence score?

    Yes, there is a statistically significant difference in all three scores based on lunch type.Students with standard lunch performed better on average.
3. Does parent level education affect child's performance?

    Yes, there is a statistically significant difference in student performance. Those whose parents have higher education, score better.
4.  Is test prep course completion necessary for good score?
  
    Yes, completing the test prep significantly improves performance. The p-value < 0.05, so the result is statistically significant. On average, students who completed the prep score ~7.6 points higher.
5. Are race and gender of students related in some way?
    
    There is no statistically significant relationship between race and gender at the 5% level.
6. Which gender is more likely to complete the test prep course?

    There is no statistically significant relationship between test prep completion and gender at the 5% level.
7. Which subject has the highes avg?

    Reading


8. Is there a relation between the parent level  of education and the gender of student ?

    There is no statistically significant relationship between parent level  of education and the gender of student at the 5% level.

9. Is there a relation between the parent level  of education and the ethnicity of student ?

    There is no statistically significant relationship between parent level  of education and the ethnicity of student at the 5% level

