In [1]:
import pandas as pd

In [2]:
# Load the abalone data set from a CSV file in the working directory
# and preview its first rows.
data = pd.read_csv('abalone.csv')
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


## Описательная статистика 

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


## Проверка на нормальность

### [Тест Шапиро — Уилка](https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test)

In [6]:
from scipy import stats

# Shapiro-Wilk normality test on the 'Rings' column.
# `shapiro` returns the test statistic and the p-value.
stat, p = stats.shapiro(data['Rings'])
print('Statistics=%.3f, p-value=%.3f' % (stat, p))

# Compare the p-value with the 5% significance level and report the decision.
alpha = 0.05
verdict = (
    'Принять гипотезу о нормальности'
    if p > alpha
    else 'Отклонить гипотезу о нормальности'
)
print(verdict)

Statistics=0.931, p-value=0.000
Отклонить гипотезу о нормальности


## [Критерий Д'Агостино — Пирсона (K²)](https://en.wikipedia.org/wiki/D%27Agostino%27s_K-squared_test)

In [7]:
# Omnibus normality test (`scipy.stats.normaltest`) on the 'Length' column.
# The call returns the test statistic and the p-value.
stat, p = stats.normaltest(data['Length'])
print('Statistics=%.3f, p-value=%.3f' % (stat, p))

# Compare the p-value with the 5% significance level and report the decision.
alpha = 0.05
verdict = (
    'Принять гипотезу о нормальности'
    if p > alpha
    else 'Отклонить гипотезу о нормальности'
)
print(verdict)

Statistics=242.159, p-value=0.000
Отклонить гипотезу о нормальности


## t-test

In [8]:
# Split 'Length' into two non-overlapping samples by row position.
#
# `.iloc` is used instead of `.loc` on purpose: `.loc` slices by label and
# includes BOTH endpoints, so with an even number of rows the row labelled
# `half` would end up in both samples, and a fractional `half` depends on
# float slicing of an integer index.
n_rows = len(data['Length'])
half = n_rows / 2            # (possibly fractional) half-size; kept for any later cell that reads `half`
split = (n_rows + 1) // 2    # integer cut point; the first sample gets the extra row when n_rows is odd
sam1 = data['Length'].iloc[:split]
sam2 = data['Length'].iloc[split:]

# Two-sample independent t-test comparing the means of the two halves.
stats.ttest_ind(sam2, sam1)

Ttest_indResult(statistic=1.5565212835974083, pvalue=0.11965998094160571)

In [9]:
# Degrees of freedom of the two-sample t-test: (n1 - 1) + (n2 - 1),
# taken from the actual sizes of the two samples.
dfs = (len(sam1) - 1) + (len(sam2) - 1)

# Two-sided critical value at the 5% level (upper 97.5% quantile of Student's t).
# Only `stats` is imported in this notebook (`from scipy import stats`), so the
# distribution is reached through `stats.t`; the bare name `scipy` is undefined.
stats.t.ppf(0.975, dfs)

1.9605323551806582

## Линейная регрессия

In [30]:
import statsmodels.formula.api as smf

# Ordinary least squares fit of 'Rings' on 'Diameter' (formula interface).
model = smf.ols(formula='Rings ~ Diameter', data=data)
res = model.fit()

# Display the fitted model's summary report.
res.summary()

0,1,2,3
Dep. Variable:,Rings,R-squared:,0.33
Model:,OLS,Adj. R-squared:,0.33
Method:,Least Squares,F-statistic:,2059.0
Date:,"Fri, 26 Jun 2020",Prob (F-statistic):,0.0
Time:,16:34:53,Log-Likelihood:,-9979.2
No. Observations:,4177,AIC:,19960.0
Df Residuals:,4175,BIC:,19980.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.3186,0.173,13.423,0.000,1.980,2.657
Diameter,18.6699,0.411,45.371,0.000,17.863,19.477

0,1,2,3
Omnibus:,1414.851,Durbin-Watson:,0.967
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4737.552
Skew:,1.714,Prob(JB):,0.0
Kurtosis:,6.933,Cond. No.,11.8


In [31]:
# Residuals of the fitted OLS model, one value per observation.
res.resid

0       5.866905
1      -0.266103
2      -1.159940
3       0.866905
4      -0.079403
          ...   
4172    0.279962
4173   -0.533339
4174   -2.186786
4175   -1.373485
4176   -0.680380
Length: 4177, dtype: float64