In [1]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Library to help with statistical analysis
import scipy.stats as stats

In [2]:
# location of the students dataset (absolute path; looks like a Colab Google Drive mount — adjust when running elsewhere)
STUDENTS_CSV = '/content/drive/MyDrive/Python Course/Business Statistics/Business Statistics Hand-on Quiz/students.csv'
data = pd.read_csv(STUDENTS_CSV)

In [3]:
# work on an independent (deep) copy so the originally loaded frame stays untouched
df = data.copy(deep=True)

In [4]:
# preview the first 5 observations
df.head(n=5)

Unnamed: 0,gender,race,parental_level_of_education,taken_test_preparation_course,maths_score,reading_score,writing_score,overall_score
0,male,Race C,high school,no,73,74,74,73.67
1,female,Race A,high school,no,71,62,68,67.0
2,male,Race B,bachelor's degree,yes,87,92,89,89.33
3,female,Race D,high school,no,63,62,58,61.0
4,female,Race A,high school,no,60,66,66,64.0




*   Each row contains the student's gender, their race, their parents' level of education, whether they have taken a test preparation course, their maths score, their reading score, their writing score, and their overall score.



In [5]:
# dimensions of the data as a (rows, columns) tuple
df.shape

(100, 8)



*   The data has 100 rows and 8 columns.


In [6]:
# column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   gender                         100 non-null    object 
 1   race                           100 non-null    object 
 2   parental_level_of_education    100 non-null    object 
 3   taken_test_preparation_course  100 non-null    object 
 4   maths_score                    100 non-null    int64  
 5   reading_score                  100 non-null    int64  
 6   writing_score                  100 non-null    int64  
 7   overall_score                  100 non-null    float64
dtypes: float64(1), int64(3), object(4)
memory usage: 6.4+ KB




*   The gender, race, parental_level_of_education, and taken_test_preparation_course are categorical while maths_score, reading_score, writing_score, and overall_score are numerical
*   There are no missing values



In [7]:
# summary statistics (count, mean, std, quartiles, min/max) for the numerical columns
df.describe()

Unnamed: 0,maths_score,reading_score,writing_score,overall_score
count,100.0,100.0,100.0,100.0
mean,68.45,70.8,69.65,69.6335
std,14.240627,13.586238,13.567173,13.099381
min,36.0,35.0,33.0,38.0
25%,57.75,61.75,60.75,61.0
50%,69.0,71.0,69.5,69.33
75%,78.25,81.0,80.0,78.7525
max,100.0,100.0,100.0,99.0




*   On average, the students score 68.45 on maths, 70.8 on reading, 69.65 on writing, and 69.63 overall
*   The overall score ranged from 38.00 to 99.00
*   Half of the students score at least 69.33 overall
*   Since the mean (69.63) and median (69.33) are close, the distribution of overall score appears roughly symmetric, which is consistent with (but does not by itself confirm) an approximately normal distribution
*   The variation in scores is similar across all four numerical variables, though it is slightly larger for maths scores than for the others






In [8]:
# z-score for maths scores: distance from the sample mean, in units of the sample standard deviation
maths = df['maths_score']
df['z-math'] = (maths - maths.mean()) / maths.std()

In [11]:
# check that the z-math column has been added
df.head()

Unnamed: 0,gender,race,parental_level_of_education,taken_test_preparation_course,maths_score,reading_score,writing_score,overall_score,z-math
0,male,Race C,high school,no,73,74,74,73.67,0.319508
1,female,Race A,high school,no,71,62,68,67.0,0.179065
2,male,Race B,bachelor's degree,yes,87,92,89,89.33,1.302611
3,female,Race D,high school,no,63,62,58,61.0,-0.382708
4,female,Race A,high school,no,60,66,66,64.0,-0.593373


In [12]:
# z-score for reading scores
# NOTE(review): 71 and 13 are fixed values, not statistics computed from the sample —
# presumably the mean and standard deviation given for reading scores; confirm their source
reading_mean, reading_sd = 71, 13
df['z-reading'] = (df['reading_score'] - reading_mean) / reading_sd

In [13]:
# z-score for writing scores
# NOTE(review): 69 and 10 are fixed values, not statistics computed from the sample —
# presumably the mean and standard deviation given for writing scores; confirm their source
writing_mean, writing_sd = 69, 10
df['z-writing'] = (df['writing_score'] - writing_mean) / writing_sd

In [14]:
# check that the z-reading and z-writing columns have been added
df.head()

Unnamed: 0,gender,race,parental_level_of_education,taken_test_preparation_course,maths_score,reading_score,writing_score,overall_score,z-math,z-reading,z-writing
0,male,Race C,high school,no,73,74,74,73.67,0.319508,0.230769,0.5
1,female,Race A,high school,no,71,62,68,67.0,0.179065,-0.692308,-0.1
2,male,Race B,bachelor's degree,yes,87,92,89,89.33,1.302611,1.615385,2.0
3,female,Race D,high school,no,63,62,58,61.0,-0.382708,-0.692308,-1.1
4,female,Race A,high school,no,60,66,66,64.0,-0.593373,-0.384615,-0.3


In [16]:
from scipy.stats import norm

# standardise a reading score of 90 using mean 71 and standard deviation 13
reading_cutoff = 90
z_reading_90 = (reading_cutoff - 71) / 13

# upper-tail probability: share of the standard normal distribution above that z-score
1 - norm.cdf(z_reading_90)

0.07193386424080761

I did the following calculation incorrectly: I used the z (normal) distribution even though the population standard deviation is unknown. I was lucky that the result is close to the correct one.

In [17]:
# construct the 95% confidence interval for the mean overall score (normal approximation)
overall_scores = df['overall_score']
# derive the sample size from the data instead of hard-coding 100, so the
# interval stays correct if the dataset changes
sample_size = len(overall_scores)
standard_error = overall_scores.std() / np.sqrt(sample_size)
np.round(norm.interval(0.95, loc=overall_scores.mean(), scale=standard_error), 2)

array([67.07, 72.2 ])


It is more appropriate to use the t distribution here, because the population standard deviation is unknown and is estimated from the sample.



In [25]:
# import the required function
from scipy.stats import t

# sample mean and sample standard deviation of the overall scores
overall = df['overall_score']
x_bar = overall.mean()
s = overall.std()

# sample size and degrees of freedom
n = len(df)
k = n - 1

# standard error of the mean
std_err = s / np.sqrt(n)

# 95% t-based confidence interval, rounded to 2 decimals
# (the `df=` keyword here is the degrees-of-freedom argument of t.interval, not the DataFrame)
np.round(t.interval(0.95, df=k, loc=x_bar, scale=std_err), 2)

array([67.03, 72.23])

In [18]:
# one-sample t-test: is the average overall score greater than 70?
hypothesised_mean = 70
t_stat, p_value = stats.ttest_1samp(
    df['overall_score'],
    hypothesised_mean,
    alternative='greater',  # selects the upper-tailed test
)

print("Test Statistic =", t_stat)
print("p-value =", p_value)

Test Statistic = -0.27978421600438497
p-value = 0.6098866036964306


In [21]:
# t test for two independent samples: maths scores of students without vs. with test preparation
prep = df['taken_test_preparation_course']
maths_no_prep = df.loc[prep == 'no', 'maths_score']
maths_prep = df.loc[prep == 'yes', 'maths_score']

# alternative='less' tests whether the mean of the first sample (no prep) is below that of the second (prep).
# The statistic is stored as t_stat (as in the one-sample test) so it does not
# overwrite the `t` distribution imported from scipy.stats for the confidence interval.
t_stat, p_value = stats.ttest_ind(maths_no_prep, maths_prep, alternative='less')
print("tstat = ", t_stat, ", p_value = ", p_value)

tstat =  -5.176999256024453 , p_value =  6.010315522311589e-07


In [22]:
# cross-tabulate test preparation (rows) against parental education level (columns)
contingency_table = pd.crosstab(
    index=df['taken_test_preparation_course'],
    columns=df['parental_level_of_education'],
)
contingency_table

parental_level_of_education,associate's degree,bachelor's degree,high school,master's degree,some high school
taken_test_preparation_course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,6,5,40,5,15
yes,10,5,9,2,3


In [23]:
# import the required test function
from scipy.stats import chi2_contingency

# run the chi-square test of independence on the contingency table
chi, p_value, dof, expected = chi2_contingency(contingency_table)

# report each result next to its label
for label, value in (
    ("Test Statistic =", chi),
    ("p-value =", p_value),
    ("Degrees of freedom =", dof),
    ("Expected frequencies \n", expected),
):
    print(label, value)

Test Statistic = 14.883389003974589
p-value = 0.004949288534412502
Degrees of freedom = 4
Expected frequencies 
 [[11.36  7.1  34.79  4.97 12.78]
 [ 4.64  2.9  14.21  2.03  5.22]]


In [24]:
# import the required function for anova test of relative performance in reading, writing, and maths
from scipy.stats import f_oneway

# NOTE(review): the three score columns come from the same rows (students), so the groups
# are not independent samples — confirm that a one-way ANOVA is the intended test here
score_columns = ['reading_score', 'writing_score', 'maths_score']

# find the p-value
test_stat, p_value = f_oneway(*(df[col] for col in score_columns))
print('The p-value is ', p_value)

The p-value is  0.4852226859144665
