In [44]:
# importing libraries
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [45]:
# Read the tab-separated, UTF-16 encoded measurements file into a DataFrame.
# The codec is spelled with its canonical name 'utf-16'; the previous spelling
# 'utf=16' only resolved because Python's codec lookup normalises punctuation.
machine_df = pd.read_csv("machine.txt", encoding='utf-16', sep='\t')
machine_df

Unnamed: 0,New machine,Old machine
0,42.1,42.7
1,41.0,43.6
2,41.3,43.8
3,41.8,43.3
4,42.4,42.5
5,42.8,43.5
6,43.2,43.1
7,42.3,41.7
8,41.8,44.0
9,42.7,44.1


In [46]:
# Normalise the header names: lowercase everything, drop any run of four spaces,
# then turn each remaining space into an underscore.
machine_df.columns = [
    header.lower().replace('    ', '').replace(' ', '_')
    for header in machine_df.columns
]
list(machine_df.columns)

['new_machine', 'old_machine']

In [47]:
# Keep each machine's observations in its own variable for the calculations below.
new_machine, old_machine = machine_df['new_machine'], machine_df['old_machine']

In [48]:

# Summary statistics for both machines; we will keep the means and the standard deviations.
# The `std` row shown by describe() is the sample standard deviation (ddof=1, the pandas default).
# The New machine's mean is 42.14 and its standard deviation is approximately 0.68
# The Old machine's mean is 43.23 and its standard deviation is approximately 0.75
machine_df.describe()

Unnamed: 0,new_machine,old_machine
count,10.0,10.0
mean,42.14,43.23
std,0.683455,0.749889
min,41.0,41.7
25%,41.8,42.8
50%,42.2,43.4
75%,42.625,43.75
max,43.2,44.1


In [49]:
# Size, mean and standard deviation of the first sample (new_machine).
# np.std is called with its default ddof=0, so the value printed here is the
# population standard deviation (slightly smaller than the `std` from describe()).
new_machine_summary = (
    'n:', len(new_machine),
    ', mean:', np.mean(new_machine),
    ', standard deviation: ', np.std(new_machine),
)
print(*new_machine_summary)

n: 10 , mean: 42.14 , standard deviation:  0.6483826030978941


In [50]:
# Size, mean and standard deviation of the second sample (old_machine).
# np.std is called with its default ddof=0, so the value printed here is the
# population standard deviation (slightly smaller than the `std` from describe()).
old_machine_summary = (
    'n:', len(old_machine),
    ', mean:', np.mean(old_machine),
    ', standard deviation: ', np.std(old_machine),
)
print(*old_machine_summary)

n: 10 , mean: 43.230000000000004 , standard deviation:  0.7114070564732956


In [51]:
# Report each machine's mean and (population, ddof=0) standard deviation,
# rounded to two decimals. Each entry pairs a message template with its value.
report_lines = [
    ("New machine's mean is {:.2f}", np.mean(new_machine)),
    ("Old machine's mean is {:.2f}", np.mean(old_machine)),
    ("New machine's standard deviation is {:.2f}", np.std(new_machine)),
    ("Old machine's standard deviation is {:.2f}", np.std(old_machine)),
]
for template, value in report_lines:
    print(template.format(value))

New machine's mean is 42.14
Old machine's mean is 43.23
New machine's standard deviation is 0.65
Old machine's standard deviation is 0.71


In [52]:
# Pooled standard deviation and two-sample t statistic (equal variances assumed).
#
#   s_p^2 = ((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)
#   t     = (mean1 - mean2) / (s_p * sqrt(1/n1 + 1/n2))
#
# s1^2 and s2^2 must be the *sample* variances (ddof=1) and the standard error
# must divide by n1 and n2. Using numpy's default ddof=0 together with (n - 1)
# divisors gives the right number only when both samples have the same size,
# so the textbook form is used here to stay correct for unequal sample sizes.
n_new, n_old = len(new_machine), len(old_machine)
var_new = np.var(new_machine, ddof=1)  # sample variance of the new machine
var_old = np.var(old_machine, ddof=1)  # sample variance of the old machine

# the Pooled Standard Deviation is:
SDpooled_numerator = (n_new - 1) * var_new + (n_old - 1) * var_old
SDpooled_no_root = SDpooled_numerator / (n_new + n_old - 2)  # pooled variance
SDpooled = np.sqrt(SDpooled_no_root)

# the Statistical value t is:
t = (np.mean(new_machine) - np.mean(old_machine)) / (SDpooled * np.sqrt(1 / n_new + 1 / n_old))

In [54]:
# Show the hand-computed t statistic rounded to two decimals.
t_message = "The t statistic is: {:.2f}".format(t)
print(t_message)

The t statistic is: -3.40


In [55]:
# Critical value from the percent point function (inverse CDF) of Student's t.
# This is the upper-tail value at the 5% level with n1 + n2 - 2 degrees of freedom;
# because the t distribution is symmetric, the left-tailed test rejects H0 when t < -Zc.
significance_level = 0.05
degrees_of_freedom = len(new_machine) + len(old_machine) - 2
Zc = st.t.ppf(1 - significance_level, df=degrees_of_freedom)
Zc

1.7340636066175354

In [56]:
# Alternatively, let SciPy do the work. alternative='less' makes it a one-tailed
# (left-tailed) test; equal_var=True (the default) is the pooled-variance t-test.
from scipy.stats import ttest_ind, norm
ttest_ind(a=new_machine, b=old_machine, equal_var=True, alternative='less')

Ttest_indResult(statistic=-3.3972307061176026, pvalue=0.0016055712503872579)

In [57]:

# The one-tailed p-value (≈ 0.0016) is smaller than the significance level α = 0.05.
# So, we reject the null hypothesis H0: μ_new = μ_old and accept the alternative hypothesis
# H1 (or Ha): μ_new < μ_old, i.e. the new machine packs faster on average than the machine currently used