In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm

# Q1 - Cutlets

In [2]:
cutlets=pd.read_csv('Cutlets.csv')
cutlets

Unnamed: 0,Unit A,Unit B
0,6.809,6.7703
1,6.4376,7.5093
2,6.9157,6.73
3,7.3012,6.7878
4,7.4488,7.1522
5,7.3871,6.811
6,6.8755,7.2212
7,7.0621,6.6606
8,6.684,7.2402
9,6.8236,7.0503


In [3]:
# Changing column names
cutlets.columns=["UnitA","UnitB"]

In [4]:
cutlets.UnitA

0     6.8090
1     6.4376
2     6.9157
3     7.3012
4     7.4488
5     7.3871
6     6.8755
7     7.0621
8     6.6840
9     6.8236
10    7.3930
11    7.5169
12    6.9246
13    6.9256
14    6.5797
15    6.8394
16    6.5970
17    7.2705
18    7.2828
19    7.3495
20    6.9438
21    7.1560
22    6.5341
23    7.2854
24    6.9952
25    6.8568
26    7.2163
27    6.6801
28    6.9431
29    7.0852
30    6.7794
31    7.2783
32    7.1561
33    7.3943
34    6.9405
Name: UnitA, dtype: float64

In [5]:
# Testing for Normality
print(stats.shapiro(cutlets.UnitA))

ShapiroResult(statistic=0.9649458527565002, pvalue=0.3199819028377533)


In [6]:
stats.shapiro(cutlets.UnitB)

ShapiroResult(statistic=0.9727300405502319, pvalue=0.5224985480308533)

##  2 sample t-test 

In [7]:
import scipy
scipy.stats.ttest_ind(cutlets.UnitA,cutlets.UnitB)

Ttest_indResult(statistic=0.7228688704678061, pvalue=0.4722394724599501)

### Result

In [8]:
# Since p-value>0.05, accept the null hypothesis.
# Both the samples have no significat difference in Diameter.

# Q2 - LABTAT

In [9]:
labtat=pd.read_csv('LabTAT.csv')
labtat

Unnamed: 0,Laboratory 1,Laboratory 2,Laboratory 3,Laboratory 4
0,185.35,165.53,176.70,166.13
1,170.49,185.91,198.45,160.79
2,192.77,194.92,201.23,185.18
3,177.33,183.00,199.61,176.42
4,193.41,169.57,204.63,152.60
...,...,...,...,...
115,178.49,170.66,193.80,172.68
116,176.08,183.98,215.25,177.64
117,202.48,174.54,203.99,170.27
118,182.40,197.18,194.52,150.87


In [10]:
labtat.columns=['Lab1','Lab2','Lab3','Lab4']
labtat

Unnamed: 0,Lab1,Lab2,Lab3,Lab4
0,185.35,165.53,176.70,166.13
1,170.49,185.91,198.45,160.79
2,192.77,194.92,201.23,185.18
3,177.33,183.00,199.61,176.42
4,193.41,169.57,204.63,152.60
...,...,...,...,...
115,178.49,170.66,193.80,172.68
116,176.08,183.98,215.25,177.64
117,202.48,174.54,203.99,170.27
118,182.40,197.18,194.52,150.87


In [11]:
# Test for Normality
print(stats.shapiro(labtat.Lab1))

ShapiroResult(statistic=0.9901824593544006, pvalue=0.5506953597068787)


In [12]:
print(stats.shapiro(labtat.Lab2))

ShapiroResult(statistic=0.9936322569847107, pvalue=0.8637524843215942)


In [13]:
print(stats.shapiro(labtat.Lab3))

ShapiroResult(statistic=0.9886345267295837, pvalue=0.4205053448677063)


In [14]:
print(stats.shapiro(labtat.Lab4))

ShapiroResult(statistic=0.9913753271102905, pvalue=0.6618951559066772)


In [15]:
# Testing for Variance
print(scipy.stats.levene(labtat.Lab1,labtat.Lab2))

LeveneResult(statistic=3.5495027780905763, pvalue=0.06078228171776711)


In [16]:
print(scipy.stats.levene(labtat.Lab2,labtat.Lab3))

LeveneResult(statistic=0.9441465124387124, pvalue=0.33220021420602397)


In [17]:
print(scipy.stats.levene(labtat.Lab3,labtat.Lab4))

LeveneResult(statistic=2.037958464521512, pvalue=0.15472618294425391)


In [18]:
print(scipy.stats.levene(labtat.Lab4,labtat.Lab1))

LeveneResult(statistic=1.5000140718506723, pvalue=0.22188001348277267)


## ANOVA Test

In [19]:
from statsmodels.formula.api import ols

In [20]:
mod=ols('labtat.Lab1~(labtat.Lab2+labtat.Lab3+labtat.Lab4)', data=labtat).fit()
aov=sm.stats.anova_lm(mod)
print(aov)

                df        sum_sq     mean_sq         F    PR(>F)
labtat.Lab2    1.0    332.030416  332.030416  1.940311  0.166299
labtat.Lab3    1.0    203.853111  203.853111  1.191271  0.277335
labtat.Lab4    1.0    265.614707  265.614707  1.552192  0.215323
Residual     116.0  19850.186366  171.122296       NaN       NaN


In [21]:
stats.f_oneway(labtat.iloc[:,0],labtat.iloc[:,1],labtat.iloc[:,2],labtat.iloc[:,3])

F_onewayResult(statistic=118.70421654401437, pvalue=2.1156708949992414e-57)

### Result

In [22]:
# Since all p-value > 0.05, we accept the null hypothesis, there is no sognificant difference in avg. TAT among laboratories

# Q3 - Buyer Ratio

In [23]:
br=pd.read_csv('BuyerRatio.csv')
br

Unnamed: 0,Observed Values,East,West,North,South
0,Males,50,142,131,70
1,Females,435,1523,1356,750


In [24]:
br1=br.rename({'Observed Values':'values'},axis=1)
br1

Unnamed: 0,values,East,West,North,South
0,Males,50,142,131,70
1,Females,435,1523,1356,750


In [25]:
# Creating Dummy Variables for "male" and 'female'
br1['values'][br1['values']=='Males']=0
br1['values'][br1['values']=='Females']=1
br1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  br1['values'][br1['values']=='Males']=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  br1['values'][br1['values']=='Females']=1


Unnamed: 0,values,East,West,North,South
0,0,50,142,131,70
1,1,435,1523,1356,750


## Chisquare Testing

In [26]:
import scipy
chisq_res=scipy.stats.chi2_contingency(br1)

In [27]:
chisq_res

(1.6929696469183673,
 0.7919942975413565,
 4,
 array([[8.81561238e-02, 4.27557201e+01, 1.46779946e+02, 1.31088156e+02,
         7.22880215e+01],
        [9.11843876e-01, 4.42244280e+02, 1.51822005e+03, 1.35591184e+03,
         7.47711978e+02]]))

In [28]:
Chisquare_Results=[['',"Test Statistics",'p-value'],['Sample Data',chisq_res[0],chisq_res[1]]]
Chisquare_Results

[['', 'Test Statistics', 'p-value'],
 ['Sample Data', 1.6929696469183673, 0.7919942975413565]]

### Result

In [29]:
# Since p-value>0.05, we accept the null hypothesis that the proportions for male and female are similar accross all regions

# Q4 - Customer Order Form

In [30]:
cof=pd.read_csv('Costomer+OrderForm.csv')
cof

Unnamed: 0,Phillippines,Indonesia,Malta,India
0,Error Free,Error Free,Defective,Error Free
1,Error Free,Error Free,Error Free,Defective
2,Error Free,Defective,Defective,Error Free
3,Error Free,Error Free,Error Free,Error Free
4,Error Free,Error Free,Defective,Error Free
...,...,...,...,...
295,Error Free,Error Free,Error Free,Error Free
296,Error Free,Error Free,Error Free,Error Free
297,Error Free,Error Free,Defective,Error Free
298,Error Free,Error Free,Error Free,Error Free


In [31]:
cof.columns

Index(['Phillippines', 'Indonesia', 'Malta', 'India'], dtype='object')

In [32]:
# Creating Dummy Variables for 'Defectives' and 'Error Free'
cof['Phillippines'][cof['Phillippines']=='Error Free']=1
cof['Phillippines'][cof['Phillippines']=='Defective']=0
cof.head()

Unnamed: 0,Phillippines,Indonesia,Malta,India
0,1,Error Free,Defective,Error Free
1,1,Error Free,Error Free,Defective
2,1,Defective,Defective,Error Free
3,1,Error Free,Error Free,Error Free
4,1,Error Free,Defective,Error Free


In [33]:
cof['Indonesia'][cof['Indonesia']=='Defective']=0
cof['Indonesia'][cof['Indonesia']=='Error Free']=1
cof['Malta'][cof['Malta']=='Defective']=0
cof['Malta'][cof['Malta']=='Error Free']=1
cof['India'][cof['India']=='Defective']=0
cof['India'][cof['India']=='Error Free']=1
cof.head()

Unnamed: 0,Phillippines,Indonesia,Malta,India
0,1,1,0,1
1,1,1,1,0
2,1,0,0,1
3,1,1,1,1
4,1,1,0,1


## Chisquare Testing

In [34]:
cof2=pd.Series.value_counts
cof2

<function pandas.core.base.IndexOpsMixin.value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True)>

In [35]:
cof2=cof.apply(cof2)
cof2

Unnamed: 0,Phillippines,Indonesia,Malta,India
1,271,267,269,280
0,29,33,31,20


In [36]:
chisq_res2=scipy.stats.chi2_contingency(cof2)
chisq_res2

(3.858960685820355,
 0.2771020991233135,
 3,
 array([[271.75, 271.75, 271.75, 271.75],
        [ 28.25,  28.25,  28.25,  28.25]]))

In [37]:
Chisquare_cof=[['','Test Sample','p-value'],['Sample Data',chisq_res2[0],chisq_res2[1]]]
Chisquare_cof

[['', 'Test Sample', 'p-value'],
 ['Sample Data', 3.858960685820355, 0.2771020991233135]]

### Result

In [38]:
# Since p-value>0.05, we accept null hypothesis, the defective % do not varies by centre

# Q5 - Fantaloons

In [39]:
fant=pd.read_csv('Faltoons.csv')
fant.head()

Unnamed: 0,Weekdays,Weekend
0,Male,Female
1,Female,Male
2,Female,Male
3,Male,Female
4,Female,Female


In [40]:
fant

Unnamed: 0,Weekdays,Weekend
0,Male,Female
1,Female,Male
2,Female,Male
3,Male,Female
4,Female,Female
...,...,...
395,Female,Male
396,Female,Female
397,Female,Female
398,Female,Male


In [41]:
# Creating Dummy Variables
from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder()
fant['Weekdays']=label_encoder.fit_transform(fant['Weekdays'])
fant['Weekend']=label_encoder.fit_transform(fant['Weekend'])
fant.head()

Unnamed: 0,Weekdays,Weekend
0,1,0
1,0,1
2,0,1
3,1,0
4,0,0


In [42]:
fant.describe()

Unnamed: 0,Weekdays,Weekend
count,400.0,400.0
mean,0.2825,0.4175
std,0.450779,0.493764
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,1.0
max,1.0,1.0


# Chisquare Testing

In [43]:
count=fant.apply(pd.Series.value_counts)
count

Unnamed: 0,Weekdays,Weekend
0,287,233
1,113,167


In [44]:
chisq_result=scipy.stats.chi2_contingency(count)
chisq_result

(15.434065934065934,
 8.54342267020237e-05,
 1,
 array([[260., 260.],
        [140., 140.]]))

In [45]:
Chisquare_fant=[['','Test Result','p-value'],['Sample data',chisq_result[0],chisq_result[1]]]
Chisquare_fant

[['', 'Test Result', 'p-value'],
 ['Sample data', 15.434065934065934, 8.54342267020237e-05]]

### Result

In [46]:
# Since p-value < 0.05, we reject the null hypthesis, that % of males vs females differs based on day of week.