<https://www.scipy-lectures.org/packages/statistics/index.html>

In [1]:
import pandas

In [3]:
# Load the brain-size table: fields are ';'-separated and a lone '.' is
# read as a missing value.
data = pandas.read_csv(
    './brain_size.csv',
    sep=';',
    na_values=".",
)

In [4]:
# Preview only the first rows rather than rendering the whole table; the
# full frame has 40 rows, which adds bulk without adding information here.
data.head(10)

Unnamed: 0.1,Unnamed: 0,Gender,FSIQ,VIQ,PIQ,Weight,Height,MRI_Count
0,1,Female,133,132,124,118.0,64.5,816932
1,2,Male,140,150,124,,72.5,1001121
2,3,Male,139,123,150,143.0,73.3,1038437
3,4,Male,133,129,128,172.0,68.8,965353
4,5,Female,137,132,134,147.0,65.0,951545
5,6,Female,99,90,110,146.0,69.0,928799
6,7,Female,138,136,131,138.0,64.5,991305
7,8,Female,92,90,98,175.0,66.0,854258
8,9,Male,89,93,84,134.0,66.3,904858
9,10,Male,133,114,147,172.0,68.8,955466


In [5]:
import numpy as np
# 20 evenly spaced sample points covering [-6, 6], endpoints included,
# together with their sine and cosine.
t = np.linspace(start=-6, stop=6, num=20)
sin_t, cos_t = np.sin(t), np.cos(t)

In [6]:
# Assemble the three equal-length arrays into one table, one named column
# per array.
pandas.DataFrame(dict(t=t, sin=sin_t, cos=cos_t))

Unnamed: 0,cos,sin,t
0,0.96017,0.279415,-6.0
1,0.609977,0.792419,-5.368421
2,0.024451,0.999701,-4.736842
3,-0.570509,0.821291,-4.105263
4,-0.945363,0.326021,-3.473684
5,-0.955488,-0.29503,-2.842105
6,-0.596979,-0.802257,-2.210526
7,-0.008151,-0.999967,-1.578947
8,0.583822,-0.811882,-0.947368
9,0.950551,-0.310567,-0.315789


3.1.1.2.2. Manipulating data

In [7]:
# Dimensions of the table as a (rows, columns) tuple.
data.shape    # 40 rows and 8 columns

(40, 8)

In [8]:
# A column is addressed by its name; attribute access is equivalent to
# the item lookup data['Gender'] for this label.
print(data.Gender)

0     Female
1       Male
2       Male
3       Male
4     Female
5     Female
6     Female
7     Female
8       Male
9       Male
10    Female
11      Male
12      Male
13    Female
14    Female
15    Female
16    Female
17      Male
18    Female
19      Male
20      Male
21      Male
22    Female
23      Male
24    Female
25      Male
26    Female
27      Male
28    Female
29    Female
30    Female
31      Male
32      Male
33      Male
34    Female
35    Female
36      Male
37    Female
38      Male
39      Male
Name: Gender, dtype: object


In [9]:
# Mean VIQ over the rows whose 'Gender' is 'Female': the row mask and the
# column label are handed to a single .loc selection.
data.loc[data['Gender'] == 'Female', 'VIQ'].mean()

109.45

In [10]:
# Split the rows by their 'Gender' label once and keep the grouped object.
groupby_gender = data.groupby('Gender')

# Iterating a grouped column yields (group label, values of that group);
# print each label together with the mean VIQ of its group.
for label, viq_scores in groupby_gender['VIQ']:
    label_and_mean = (label, viq_scores.mean())
    print(label_and_mean)

('Female', 109.45)
('Male', 115.25)


In [11]:
# Per-group mean of every remaining column: one row per 'Gender' value.
groupby_gender.mean()

Unnamed: 0_level_0,Unnamed: 0,FSIQ,VIQ,PIQ,Weight,Height,MRI_Count
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Female,19.65,111.9,109.45,110.45,137.2,65.765,862654.6
Male,21.35,115.0,115.25,111.6,166.444444,71.431579,954855.4


3.1.1.2.3. Plotting data

In [13]:
# `pandas.tools.plotting` was deprecated in pandas 0.20 and removed in later
# releases; the same helpers are exposed from `pandas.plotting`. The local
# name `plotting` is kept so later cells that call it keep working.
from pandas import plotting

# Grid of pairwise scatter plots for the three body / brain measurements.
plotting.scatter_matrix(data[['Weight', 'Height', 'MRI_Count']])



array([[<matplotlib.axes._subplots.AxesSubplot object at 0x10b49d850>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10b5791d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10dae1610>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10db4e350>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10dbcf6d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10db6fc50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10dcc1350>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10de511d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10deb6890>]], dtype=object)

In [14]:
# Same pairwise scatter grid, this time for the three IQ scores.
iq_columns = ['PIQ', 'VIQ', 'FSIQ']
plotting.scatter_matrix(data[iq_columns])

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x10e6ea8d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10e759ad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10e7aa2d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10f0687d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1043d6350>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f34e490>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x10f3d2310>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f435fd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10f4b7e50>]], dtype=object)

In [15]:
from scipy import stats

In [16]:
# One-sample t-test of the VIQ column against a hypothesised mean of 0.
stats.ttest_1samp(data['VIQ'], popmean=0)

Ttest_1sampResult(statistic=30.088099970849328, pvalue=1.3289196468728067e-28)

In [17]:
# VIQ scores of each gender, selected with a row mask plus column label.
female_viq = data.loc[data['Gender'] == 'Female', 'VIQ']
male_viq = data.loc[data['Gender'] == 'Male', 'VIQ']

# Two-sample t-test treating the two groups as independent samples.
stats.ttest_ind(female_viq, male_viq)

Ttest_indResult(statistic=-0.77261617232750113, pvalue=0.44452876778583217)

In [18]:
# Independent two-sample t-test between the FSIQ and PIQ columns.
# NOTE(review): both columns come from the same rows of `data`, so a paired
# test (stats.ttest_rel) may be the more appropriate comparison -- confirm
# whether the unpaired test is intentional here.
stats.ttest_ind(a=data['FSIQ'], b=data['PIQ'])

Ttest_indResult(statistic=0.46563759638096403, pvalue=0.64277250094148408)

In [19]:
import numpy as np
# Seed the global NumPy generator so the simulated noise is reproducible.
np.random.seed(1)

# Regressor: 20 evenly spaced points on [-5, 5].
x = np.linspace(-5, 5, 20)

# Response: the line -5 + 3*x plus normally distributed noise scaled by 4.
noise = 4 * np.random.normal(size=x.shape)
y = -5 + 3*x + noise

# Collect both variables in one frame (this rebinds the name `data`).
data = pandas.DataFrame(dict(x=x, y=y))

In [24]:
from statsmodels.formula.api import ols
# Ordinary least squares fit of y on x (an intercept term is reported in
# the summary), using the columns of `data`.
model = ols(formula="y ~ x", data=data).fit()

In [25]:
# Print the fitted model's summary table (fit statistics and coefficients)
# as plain text.
print(model.summary()) 

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.804
Model:                            OLS   Adj. R-squared:                  0.794
Method:                 Least Squares   F-statistic:                     74.03
Date:                Sat, 01 Oct 2016   Prob (F-statistic):           8.56e-08
Time:                        18:14:25   Log-Likelihood:                -57.988
No. Observations:                  20   AIC:                             120.0
Df Residuals:                      18   BIC:                             122.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     -5.5335      1.036     -5.342      0.0