单因子分析
- 集中趋势
    - 均值
    - 中位数、分位数
    - 众数
- 离中趋势
    - 标准差
    - 方差
- 数据分布
    - 偏态与峰态
        - 偏态:（数据分布相对均值的偏移程度）
        $$S=\frac{\frac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})^3}{\left( \frac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})^2 \right) ^{\frac{3}{2}}}$$
        - 峰态:（数据分布的集中程度）
        $$K=\frac{\frac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})^4}{\left( \frac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})^2 \right)^2}$$
    - 正态分布与三大分布
- 抽样理论
    - 抽样误差
        - 重复抽样平均误差
        $$\mu_x=\sqrt{\frac{\sigma^2}{n}}$$
        - 不重复抽样平均误差
        $$\mu_x=\sqrt{\frac{\sigma^2}{n}\left(\frac{N-n}{N-1}\right)}$$
        - 重复抽样估计总体时抽样数目的确定
        $$n=\frac{Z_{\alpha/2}^2\sigma^2}{\Delta^2}$$
        - 不重复抽样估计总体时抽样数目的确定
        $$n=\frac{NZ_{\alpha/2}^2\sigma^2}{N\Delta^2+Z_{\alpha/2}^2\sigma^2}$$
        （$Z_{\alpha/2}$ 表示置信水平对应几个 $\sigma$，例如 $Z=2$ 约对应 95.4% 的置信度；$\Delta$ 为允许误差）
    - 抽样精度

In [1]:
import pandas as pd
# Load the HR dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/HR.csv')
# Preview the first 10 rows to check the column names and value formats.
df.head(10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
5,0.41,0.5,2,153,3,0,1,0,sales,low
6,0.1,0.77,6,247,4,0,1,0,sales,low
7,0.92,0.85,5,259,5,0,1,0,sales,low
8,0.89,1.0,5,224,5,0,1,0,sales,low
9,0.42,0.53,2,142,3,0,1,0,sales,low


In [2]:
# read_csv returns a DataFrame (a 2-D labelled table).
type(df)

pandas.core.frame.DataFrame

In [3]:
# Selecting a single column by label yields a Series.
type(df["satisfaction_level"])

pandas.core.series.Series

In [4]:
# Mean of every numeric column.
# numeric_only=True skips the string columns (Department, salary); pandas >= 2.0
# no longer drops them silently and would raise TypeError without it.
df.mean(numeric_only=True)

satisfaction_level         0.612834
last_evaluation            0.716102
number_project             3.803054
average_montly_hours     201.050337
time_spend_company         3.498233
Work_accident              0.144610
left                       0.238083
promotion_last_5years      0.021268
dtype: float64

In [5]:
# A column-wise reduction of a DataFrame comes back as a Series indexed by column name.
# numeric_only=True keeps the call working on pandas >= 2.0, where string columns
# are no longer dropped automatically.
type(df.mean(numeric_only=True))

pandas.core.series.Series

In [6]:
# Mean of a single column: reducing a Series gives a scalar.
df["satisfaction_level"].mean()

0.6128335222348156

In [7]:
# Median of every numeric column.
# numeric_only=True skips the string columns, which would raise TypeError on pandas >= 2.0.
df.median(numeric_only=True)

satisfaction_level         0.64
last_evaluation            0.72
number_project             4.00
average_montly_hours     200.00
time_spend_company         3.00
Work_accident              0.00
left                       0.00
promotion_last_5years      0.00
dtype: float64

In [8]:
# Median of a single column (scalar result).
df["satisfaction_level"].median()

0.64

In [9]:
# Quantile of every numeric column; the default q=0.5 is the median.
# numeric_only=True skips the string columns, which would raise TypeError on pandas >= 2.0.
df.quantile(numeric_only=True)

satisfaction_level         0.64
last_evaluation            0.72
number_project             4.00
average_montly_hours     200.00
time_spend_company         3.00
Work_accident              0.00
left                       0.00
promotion_last_5years      0.00
Name: 0.5, dtype: float64

In [10]:
# Lower quartile (25th percentile) of every numeric column.
# numeric_only=True skips the string columns, which would raise TypeError on pandas >= 2.0.
df.quantile(q=0.25, numeric_only=True)

satisfaction_level         0.44
last_evaluation            0.56
number_project             3.00
average_montly_hours     156.00
time_spend_company         3.00
Work_accident              0.00
left                       0.00
promotion_last_5years      0.00
Name: 0.25, dtype: float64

In [11]:
# Mode (most frequent value) of every column, including the string columns.
# A column with several equally frequent values yields several rows; the other
# columns are padded with NaN.
df.mode()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.1,0.55,4.0,135,3.0,0.0,0.0,0.0,sales,low
1,,,,156,,,,,,


In [12]:
# Mode of a numeric column; the result is a Series because ties are possible.
df["satisfaction_level"].mode()

0    0.1
dtype: float64

In [13]:
# Mode also works on a string column: the most common department.
df["Department"].mode()

0    sales
dtype: object

In [14]:
# Standard deviation of every numeric column (pandas uses the sample form, ddof=1, by default).
# numeric_only=True skips the string columns, which would raise TypeError on pandas >= 2.0.
df.std(numeric_only=True)

satisfaction_level        0.248631
last_evaluation           0.171169
number_project            1.232592
average_montly_hours     49.943099
time_spend_company        1.460136
Work_accident             0.351719
left                      0.425924
promotion_last_5years     0.144281
dtype: float64

In [15]:
# Skewness of every numeric column: > 0 means a longer right tail, < 0 a longer left tail.
# pandas returns the bias-corrected estimate, so values differ slightly from the plain 1/n formula.
# numeric_only=True skips the string columns, which would raise TypeError on pandas >= 2.0.
df.skew(numeric_only=True)

satisfaction_level      -0.476360
last_evaluation         -0.026622
number_project           0.337706
average_montly_hours     0.052842
time_spend_company       1.853319
Work_accident            2.021149
left                     1.230043
promotion_last_5years    6.636968
dtype: float64

In [16]:
# Kurtosis of every numeric column.
# pandas reports excess kurtosis (Fisher's definition): a normal distribution scores 0, not 3.
# numeric_only=True skips the string columns, which would raise TypeError on pandas >= 2.0.
df.kurt(numeric_only=True)

satisfaction_level       -0.670859
last_evaluation          -1.239040
number_project           -0.495478
average_montly_hours     -1.134982
time_spend_company        4.773211
Work_accident             2.085320
left                     -0.487060
promotion_last_5years    42.054957
dtype: float64

In [17]:
import scipy.stats as ss

In [18]:
# scipy's normal-distribution object; with default parameters it is the standard normal.
ss.norm

<scipy.stats._continuous_distns.norm_gen at 0x1814537eb8>

In [19]:
# Request the four moments: m = mean, v = variance, s = skewness, k = (excess) kurtosis.
ss.norm.stats(moments = "mvsk")

(array(0.), array(1.), array(0.), array(0.))

In [20]:
ss.norm.pdf(0.0) # probability density of the standard normal at x = 0

0.3989422804014327

In [21]:
ss.norm.ppf(0.9) # inverse CDF: the x below which 90% of the probability lies

1.2815515655446004

In [22]:
ss.norm.cdf(2) # CDF: probability accumulated from -inf up to x = 2

0.9772498680518208

In [23]:
# Probability of falling within two standard deviations of the mean.
ss.norm.cdf(2)-ss.norm.cdf(-2)

0.9544997361036416

In [24]:
# Draw 10 random variates from the standard normal.
# The draw is unseeded, so the values change on every run; pass random_state=<int> to fix them.
ss.norm.rvs(size=10)

array([-1.33270083,  1.64999087, -0.27491281,  0.97645065,  1.39910683,
       -0.34413145,  1.02626996,  0.54872163, -1.18237299, -0.17534995])

$\chi^2$分布

In [25]:
# Chi-squared distribution object.
ss.chi2

<scipy.stats._continuous_distns.chi2_gen at 0x18145a4b00>

$t$分布

In [26]:
# Student's t distribution object.
ss.t

<scipy.stats._continuous_distns.t_gen at 0x112b8c978>

$F$分布

In [27]:
# F distribution object.
ss.f

<scipy.stats._continuous_distns.f_gen at 0x18145b8438>

In [28]:
# Random sample of 10 rows (drawn without replacement by default).
# The draw is unseeded, so the rows change on every run; pass random_state=<int> for a reproducible draw.
df.sample(n=10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
11575,0.49,0.68,3,192,7,0,0,0,management,high
6639,0.77,0.71,3,134,3,0,0,0,marketing,medium
12960,0.96,0.7,3,207,3,0,0,0,IT,high
14304,0.46,0.55,2,129,3,0,1,0,sales,low
5174,0.8,0.97,5,258,3,0,0,0,IT,medium
5851,0.71,0.95,3,251,3,0,0,0,support,low
6507,0.72,0.63,3,223,2,1,0,0,sales,medium
1615,0.1,0.93,6,307,4,0,1,0,sales,low
9307,0.68,0.44,5,165,3,0,0,0,RandD,medium
5311,0.86,0.75,5,157,4,0,0,0,support,low


In [29]:
# Random sample of 0.1% of the rows (a fraction instead of a fixed row count).
# The draw is unseeded; pass random_state=<int> for a reproducible draw.
df.sample(frac=0.001)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
10739,0.69,0.75,5,196,3,0,0,0,support,medium
8596,0.97,0.94,3,160,3,0,0,0,support,low
3606,0.58,0.62,3,169,2,0,0,0,sales,medium
4543,0.76,0.52,2,229,3,0,0,1,technical,medium
2437,0.72,0.86,4,191,2,0,0,0,RandD,low
14679,0.9,0.92,5,245,5,0,1,0,sales,low
13441,0.99,0.84,4,142,10,0,0,0,technical,high
13758,0.42,0.8,4,259,7,1,0,0,sales,medium
8844,0.22,0.8,4,149,5,0,0,0,product_mng,medium
5790,0.59,0.57,4,261,2,0,0,0,IT,high


In [None]:
# (Scratch cell left empty: the stray undefined name that was here raised NameError on Run All.)