#### package import

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

%matplotlib inline

#### import the data

In [3]:
# Location of the customer-segmentation CSV. The original Windows path stays
# the default; set the CUST_SEG_CSV environment variable to load the file from
# elsewhere without editing the notebook.
CUST_SEG_CSV = os.environ.get('CUST_SEG_CSV', 'D:/Sampledata/cust_seg.csv')

cust = pd.read_csv(CUST_SEG_CSV)

#### Metadata and data inspection

In [4]:
# Summarise each column's dtype and non-null count to spot missing values.
cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   custid               200 non-null    int64  
 1   sex                  200 non-null    int64  
 2   AqChannel            200 non-null    int64  
 3   region               200 non-null    int64  
 4   Marital_status       200 non-null    int64  
 5   segment              200 non-null    int64  
 6   pre_usage            200 non-null    int64  
 7   Post_usage_1month    200 non-null    int64  
 8   Latest_mon_usage     200 non-null    float64
 9   post_usage_2ndmonth  200 non-null    float64
dtypes: float64(2), int64(8)
memory usage: 15.8 KB


In [5]:
# Preview the first rows of the customer table.
cust.head()

Unnamed: 0,custid,sex,AqChannel,region,Marital_status,segment,pre_usage,Post_usage_1month,Latest_mon_usage,post_usage_2ndmonth
0,70,0,4,1,1,1,57,52,49.2,57.2
1,121,1,4,2,1,3,68,59,63.6,64.9
2,86,0,4,3,1,1,44,33,64.8,36.3
3,141,0,4,3,1,3,63,44,56.4,48.4
4,172,0,4,2,1,2,47,52,68.4,57.2


In [6]:
# Count distinct values per column; low counts point to coded categorical
# columns, high counts to identifiers or continuous usage measures.
cust.nunique()

custid                 200
sex                      2
AqChannel                4
region                   3
Marital_status           2
segment                  3
pre_usage               30
Post_usage_1month       29
Latest_mon_usage        40
post_usage_2ndmonth     29
dtype: int64

In [8]:
# List the column names available for the tests that follow.
cust.columns

Index(['custid', 'sex', 'AqChannel', 'region', 'Marital_status', 'segment',
       'pre_usage', 'Post_usage_1month', 'Latest_mon_usage',
       'post_usage_2ndmonth'],
      dtype='object')

#### Q6 - Correlations

In [9]:
# Pearson correlation between pre-period usage and the latest month's usage;
# the result holds the correlation coefficient followed by its p-value.
stats.pearsonr(x=cust['pre_usage'], y=cust['Latest_mon_usage'])

(0.6622801251558603, 1.2767419295069521e-26)

#### Q5 - chi square test

In [12]:
# Observed frequency table: regions as rows, segments as columns.
obs_freq = pd.crosstab(index=cust['region'], columns=cust['segment'])

In [14]:
# Show the observed region-by-segment counts that feed the chi-square test.
obs_freq

segment,1,2,3
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,16,19,12
2,20,44,31
3,9,42,7


In [13]:
# Chi-square test of independence between region and segment. The result
# carries the test statistic, p-value, degrees of freedom and the table of
# expected frequencies, in that order.
stats.chi2_contingency(observed=obs_freq)

(16.604441649489342,
 0.0023066300908054713,
 4,
 array([[10.575, 24.675, 11.75 ],
        [21.375, 49.875, 23.75 ],
        [13.05 , 30.45 , 14.5  ]]))

#### Q4 - F-test (one-way ANOVA)

In [15]:
# Number of distinct segments, i.e. how many groups the ANOVA compares.
cust['segment'].nunique()

3

In [16]:
# Group sizes per segment code.
cust['segment'].value_counts()

2    105
3     50
1     45
Name: segment, dtype: int64

In [30]:
# Re-list the columns, presumably to pick the usage column for the ANOVA.
# NOTE(review): duplicates the earlier `cust.columns` cell.
cust.columns

Index(['custid', 'sex', 'AqChannel', 'region', 'Marital_status', 'segment',
       'pre_usage', 'Post_usage_1month', 'Latest_mon_usage',
       'post_usage_2ndmonth'],
      dtype='object')

In [34]:
# Name of the usage column compared across segments in the ANOVA cells below.
usage = 'Latest_mon_usage'

In [35]:
# Build one sample of the `usage` column per segment code (1, 2, 3) for the
# ANOVA, then print each sample's mean for a quick comparison.
s1, s2, s3 = (cust.loc[cust['segment'].eq(code), usage] for code in (1, 2, 3))

print(
    'mean s1:', s1.mean(),
    '| mean s2:', s2.mean(),
    '| mean s3:', s3.mean(),
)

mean s1: 60.026666666666685 | mean s2: 68.08000000000003 | mean s3: 55.703999999999986


In [36]:
# One-way ANOVA: tests whether the mean usage is the same across the three
# segment samples.
stats.f_oneway( s1, s2, s3 )

F_onewayResult(statistic=29.279283801321778, pvalue=7.36401083352674e-12)

#### Q1: ttest, 1sample ttest

In [37]:
# Sample mean of first-month post usage, for comparison with the hypothesised
# value tested next.
cust['Post_usage_1month'].mean()

52.775

In [39]:
# One-sample t-test: is the mean first-month post usage different from 50?
stats.ttest_1samp(a=cust['Post_usage_1month'], popmean=50)

Ttest_1sampResult(statistic=4.140324966963024, pvalue=5.120919460716355e-05)

#### Q2: t-test, paired-sample (related-samples) t-test

In [40]:
# Print the two means that the paired test compares.
print('mean of pre usage:', cust['pre_usage'].mean())
print('mean of post 1month usage:', cust['Post_usage_1month'].mean())

mean of pre usage: 52.23
mean of post 1month usage: 52.775


In [42]:
# Paired t-test: each customer's pre usage against the same customer's
# first-month post usage.
stats.ttest_rel(a=cust['pre_usage'], b=cust['Post_usage_1month'])

Ttest_relResult(statistic=-0.8673065458794775, pvalue=0.3868186820914985)

#### Q3: ttest, independent sample ttest

In [45]:
# Group sizes per sex code; these are the two groups of the independent t-test.
cust['sex'].value_counts()

1    109
0     91
Name: sex, dtype: int64

In [46]:
# Re-list the columns, presumably to pick the usage column for the t-test.
# NOTE(review): duplicates the earlier `cust.columns` cells.
cust.columns

Index(['custid', 'sex', 'AqChannel', 'region', 'Marital_status', 'segment',
       'pre_usage', 'Post_usage_1month', 'Latest_mon_usage',
       'post_usage_2ndmonth'],
      dtype='object')

In [51]:
# NOTE(review): this rebinds `usage`, which the ANOVA section set to a
# different column; re-running the ANOVA cells after this one would silently
# pick up this column instead.
usage = 'Post_usage_1month'

# Split the usage column by sex code. Code 0 is treated as male and code 1 as
# female here — TODO confirm against the data dictionary.
male_spend, female_spend = (
    cust.loc[cust['sex'].eq(code), usage] for code in (0, 1)
)

print(
    'mean of male spend: ', male_spend.mean(),
    '| mean of female spend: ', female_spend.mean(),
)

mean of male spend:  50.120879120879124 | mean of female spend:  54.99082568807339


In [52]:
# Independent two-sample t-test between the two sex groups. SciPy's default
# (equal_var=True) is the pooled-variance test, which assumes equal variances.
stats.ttest_ind(a=male_spend, b=female_spend)

Ttest_indResult(statistic=-3.7340738531536797, pvalue=0.00024625461203549315)

In [53]:
# With only two groups, one-way ANOVA is equivalent to the pooled-variance
# t-test above: F equals t squared and the p-values coincide.
stats.f_oneway( male_spend, female_spend )

F_onewayResult(statistic=13.94330754080599, pvalue=0.0002462546120354903)

In [55]:
# Flag region 1 as 'N' and every other region as 'O' in a new column.
cust['NewCol'] = np.where(cust['region'].eq(1), 'N', 'O')

In [56]:
# Display the frame to check the newly added NewCol column (pandas elides the
# middle rows of a long frame).
cust

Unnamed: 0,custid,sex,AqChannel,region,Marital_status,segment,pre_usage,Post_usage_1month,Latest_mon_usage,post_usage_2ndmonth,NewCol
0,70,0,4,1,1,1,57,52,49.2,57.2,N
1,121,1,4,2,1,3,68,59,63.6,64.9,O
2,86,0,4,3,1,1,44,33,64.8,36.3,O
3,141,0,4,3,1,3,63,44,56.4,48.4,O
4,172,0,4,2,1,2,47,52,68.4,57.2,O
...,...,...,...,...,...,...,...,...,...,...,...
195,31,1,2,2,2,1,55,59,62.4,64.9,O
196,145,1,4,2,1,3,42,46,45.6,50.6,O
197,187,1,4,2,2,1,57,41,68.4,45.1,O
198,118,1,4,2,1,1,55,62,69.6,68.2,O
