In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
%matplotlib inline

In [None]:
project = 'kaggle_titanic'

import os.path
import sys
import warnings

# Locate the project root from the current working directory so that data
# paths and project-local imports work from any sub-folder of the project.
current_dir = os.path.abspath('./')
name_start = current_dir.rfind(project)
if name_start == -1:
    # rfind() returns -1 when the name is absent; slicing with that value would
    # silently produce a meaningless prefix, so fall back to the cwd and say so.
    warnings.warn(
        "'{}' not found in '{}'; using the current directory as project_dir".format(
            project, current_dir))
    project_root = current_dir
else:
    # Extend the match to the end of its path component so that directory names
    # which only start with the project name (e.g. a '-master' suffix) stay whole.
    component_end = current_dir.find(os.sep, name_start + len(project))
    project_root = current_dir if component_end == -1 else current_dir[:component_end]

# Guarantee exactly one trailing separator, also when the notebook is started
# in the project root itself, so that `project_dir + 'data/...'` stays valid.
project_dir = os.path.join(project_root, '')
sys.path.insert(0, project_dir)

In [None]:
# Locations of the raw train/test CSV files inside the project.
# os.path.join is used instead of string concatenation so the result is valid
# whether or not project_dir carries a trailing separator.
raw_data_dir = os.path.join(project_dir, 'data', 'raw')
train_path = os.path.join(raw_data_dir, 'train.csv')
test_path = os.path.join(raw_data_dir, 'test.csv')

In [None]:
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 and parse_dates=True reproduces its defaults
# (first column becomes the index).
train_df = pd.read_csv(train_path, index_col=0, parse_dates=True)
test_df = pd.read_csv(test_path, index_col=0, parse_dates=True)

# statistical analysis

In [None]:
# Columns that are treated as discrete levels rather than measurements.
category_columns = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
# astype() with a dict returns a new frame, so `df` is an independent analysis
# copy and `train_df` keeps the dtypes it was loaded with (a plain
# `df = train_df` would alias the frame and mutate it through `df`).
df = train_df.astype({col: 'category' for col in category_columns})

ratio of data points to features (rows / columns): >5

In [None]:
# Number of rows divided by number of columns of the analysis frame.
n_rows, n_cols = df.shape
n_rows / n_cols

## missing data
http://www.stat.ncsu.edu/research/biostat/impute.ps,
http://www.bu.edu/sph/files/2014/05/Marina-tech-report.pdf, http://www.statsmodels.org/dev/imputation.html

check for
- MCAR (missing completely at random)
- MAR (missing at random)
- NMAR (not missing at random)

methods:
- marginal mean imputation
- conditional mean imputation
- Multiple imputation
- Maximum likelihood
- Bayesian simulation
- Hot deck
- if low variance data with few missing values (ex. categorical): mode (but introduces bias)

## univariate analysis

only one variable

check for:
- mean, median, mode
- standard deviation
- kurtosis (tail heaviness; asymmetry is measured by skewness)
- modality (no. peaks)


### univariate visualization

categorical:
- bar chart, frequency distribution
    - http://mlwiki.org/index.php/Bar_Chart#Bivariate_Analysis
    - stacked
    - proportional stacked
    - side-by-side

continuous:
- histogram, kernel density estimation
    - http://mlwiki.org/index.php/Histogram#Bivariate_Analysis
    - plot KDEs for different categories at the same time

In [None]:
# categorical: most frequent level(s), relative frequencies (missing values
# included) and a bar chart of the level counts
column = 'Embarked'
print('mode: ', list(df[column].mode()))

print(df[column].value_counts(normalize=True, dropna=False))

# Name the variable explicitly: recent seaborn releases take `data` as the only
# positional argument, so a bare Series is no longer read as the x variable.
sns.countplot(x=column, data=df)
plt.suptitle('bar chart')

In [None]:
# continuous: summary statistics followed by the distribution plot
column = 'Age'
series = df[column]
print(series.describe())
print('median:', series.median())
print('mode(s):', list(series.mode()))
print('kurtosis:', series.kurtosis())

# Missing values are removed before plotting.
# NOTE(review): distplot is flagged as deprecated by newer seaborn releases
# (histplot/displot are the successors) - confirm against the installed version.
sns.distplot(series.dropna())
plt.suptitle('Histogram with Kernel Density Estimation')

## multivariate analysis

### multivariate visualization

- infer correlations between two or more variables
- dependent variable y

continuous vs. continuous:
- scatterplot (+ boxplot/violin plot on axes)

continuous vs. categorical:
- bivariate bar graph
- boxplot
    - http://mlwiki.org/index.php/Box_Plot#Bivariate_Analysis
    - is distribution compact? symmetric?
    - any outliers?
    - can also accompany axes of scatterplot
- violin plot

categorical vs. categorical:
- contingency tables

In [None]:
# cont vs. cont: scatter of Fare against Age with a fitted regression line.
# Observation: no real correlation between age and fare.
sns.regplot(data=df, x='Age', y='Fare')

In [None]:
# Joint regression plot of Age vs. Fare with the marginal distributions on the axes.
# x, y and data are passed by keyword: recent seaborn releases no longer accept
# them positionally in this order.
sns.jointplot(x='Age', y='Fare', data=df, kind='reg')

In [None]:
# Pairwise Age/Fare relationships coloured by survival.
# Only rows missing one of the plotted variables are dropped; a bare dropna()
# would also discard every row that has a gap in any unrelated column.
pair_columns = ['Age', 'Fare']
sns.pairplot(df.dropna(subset=pair_columns + ['Survived']), vars=pair_columns, hue='Survived', kind='reg', diag_kind='hist')

In [None]:
# cat vs. cont:
# data type categorical has to be set correctly (seaborn uses the dtype to
# decide which axis carries the categories)
# add additional columns by hue
# factorplot was renamed to catplot in seaborn 0.9 and has since been removed;
# `kind` is given explicitly, so the rename does not change the plot.
sns.catplot(x='Fare', y='Survived', data=df, kind='bar')

In [None]:
# boxplot of Fare for each Survived level
# (catplot is the current name of the removed seaborn factorplot)
sns.catplot(x='Fare', y='Survived', data=df, kind='box')

In [None]:
# violin plot of Fare for each Survived level
# add additional information in violin plot by split and hue
# (catplot is the current name of the removed seaborn factorplot)
sns.catplot(x='Fare', y='Survived', data=df, kind='violin')

In [None]:
# very complex example: Age by Survived, one panel per Pclass, with each
# violin split by Sex
# (catplot is the current name of the removed seaborn factorplot)
sns.catplot(x='Survived', y='Age', hue='Sex', col='Pclass', data=df, kind='violin', split=True)

In [None]:
# cat vs. cat: contingency table of Survived x Pclass (margins=True adds the
# 'All' totals) drawn as an annotated heatmap
contingency = pd.crosstab(df['Survived'], df['Pclass'], margins=True)
# heatmap's own parameter is `linewidths`; passing `linewidth` would be forwarded
# to matplotlib alongside heatmap's default linewidths=0.
sns.heatmap(contingency, square=True, annot=True, fmt='d', linewidths=5)

# inferential statistics
-> hypothesis testing:

- $H_0$: no relationship between x and y
- $H_1$: there is a relationship

p-value:
- 5%: 25-50% false positive rate (depending on plausibility of $H_1$)
- ~0.27%: 5% false positive rate

distinguish between parametric (mostly assume normal distribution) and non-parametric tests (no assumptions, weaker). test for normality with QQ-plot
- Q(y) ~ Q(x): (quantitative response y vs quantitative explanatory x) use correlation coefficient
    - ranked data: Goodman-Kruskal $\Gamma$, Spearman Rank Correlation
- Q(y) ~ C(x): 
    - ANOVA F-test: tests whether the means of the different groups differ significantly
- C(y) ~ Q(x):
- C(y) ~ C(x): $\chi^2$-test (normalized: Cramer's V), Fisher odds ratio, Goodman-Kruskal $\tau$

for categorical variables with more than two levels: perform post-hoc tests (do $\chi^2$ for each group individually). other tests:
- Tukey HSD test
- Holm T
- Least Significant Difference

In [None]:
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
import statsmodels.api as sm

In [None]:
# Number of non-missing entries in every column.
df.notnull().sum()

In [None]:
# One-way ANOVA via OLS: does the mean Fare differ between passenger classes?
# The response must be quantitative. 'Survived' is stored as a category in this
# notebook, so it cannot serve as a numeric response; 'Fare' is used instead,
# matching the Fare-by-Pclass post-hoc comparison that follows.
test1 = smf.ols(formula='Fare ~ C(Pclass)', data=df).fit()
print(test1.summary())
# R-squared: percentage of variance in data explained by model
# F-statistic: =1: group means are equal
#              >1: group means differ
# Prob(F-statistic): p-value of test

In [None]:
# post-hoc test of groups: Tukey HSD on Fare, grouped by Pclass
tuckey1 = multi.MultiComparison(data=df['Fare'], groups=df['Pclass'])
res1 = tuckey1.tukeyhsd()
print(res1.summary())

In [None]:
# chi^2 test of independence between Survived and Pclass
import scipy as sp
# A bare `import scipy` does not guarantee that the stats submodule is loaded;
# import it explicitly so `sp.stats` does not depend on another library having
# imported it first.
import scipy.stats

df_cross = pd.crosstab(df['Survived'], df['Pclass'])
chi2, p_value, dof, expected = sp.stats.chi2_contingency(df_cross)
print('chi2={}, p-val={}, DoF={}, expected counts=\n{}'.format(chi2, p_value, dof, expected))