# Comparing Distributions

In [None]:
import numpy as np
import pandas as pd
import scipy
from scipy import stats

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from sklearn.datasets import load_breast_cancer

# Breast cancer dataset: a classic, very easy binary classification dataset.
dataset = load_breast_cancer()

# Build the feature DataFrame, naming the columns after the dataset's features.
columns = dataset.feature_names
df = pd.DataFrame(dataset.data, columns=columns)
df.head()


In [None]:
# Column under study: the "mean radius" feature, as a Series.
mr = df["mean radius"]

In [None]:
# Boxplot of the first column, "mean radius".
sns.boxplot(x="mean radius", data=df)

In [None]:
# Histogram of the raw "mean radius" values.
plt.hist(x=mr)
plt.show()

In [None]:
# Summary statistics of the column (output of Series.describe()).
mr.describe()

In [None]:
# Standardize "mean radius" to zero mean and unit variance.
# StandardScaler works on 2-D input, so the Series is turned into a
# one-column DataFrame first.
df_mr = mr.to_frame()
# Keep the default copy=True: with copy=False the scaler may standardize in
# place and silently overwrite `mr` / `df`, which the earlier cells plot and
# describe (re-running them would then show scaled values).
scaler = StandardScaler()
# fit_transform learns the mean/std and applies them in one step.
mr_scaled = scaler.fit_transform(df_mr)

In [None]:
# The scaler returns a 2-D array: every row is a one-element array.
mr_scaled[1:5, :]

In [None]:
# Collapse the single-column 2-D array into a plain 1-D array.
mr_scaled = mr_scaled.flatten()
mr_scaled

In [None]:
# The data are now standardized (same shape, rescaled x-axis).
plt.hist(x=mr_scaled)
plt.show()

In [None]:
# scipy.stats provides theoretical probability distributions that can be
# fitted to data and sampled from. Here a gamma distribution is fitted to the
# standardized values.
# `stats` is imported explicitly (top import cell) instead of reaching it as
# an attribute of a bare `import scipy`, which is not guaranteed to have the
# submodule loaded.
dist = stats.gamma
# For gamma, fit returns the estimated (shape a, loc, scale).
param = dist.fit(mr_scaled)
param

In [None]:
# Note: the cell above fits a gamma distribution. Fitting a normal distribution (scipy.stats.norm) instead
# gives mean 0 and std 1, because the data were standardized.

### Test 1. Uniform distribution

In [None]:
# Candidate distributions in scipy.stats that can be tried the same way:
# 'beta', 'expon', 'gamma', 'norm', 'uniform'.

# Fit a uniform distribution to the standardized data.
dist = stats.uniform
param = dist.fit(mr_scaled)

# Draw a sample of size len(mr_scaled) from the fitted distribution.
# fit returns (*shape_params, loc, scale). loc and scale are the location and
# scale parameters; they equal the mean and standard deviation only for the
# normal distribution (for uniform the support is [loc, loc + scale]).
# NOTE: despite its name, `normal_dist` holds the *uniform* sample; the name
# is kept because the comparison cell below passes `normal_dist`.
# A fixed random_state makes the sample identical on every run.
normal_dist = dist.rvs(*param[:-2], loc=param[-2], scale=param[-1],
                       size=len(mr_scaled), random_state=42)

In [None]:
def compare_dist(dist_1, dist_2):
    """Compare two samples with a QQ plot and overlaid histograms.

    The QQ plot uses 21 percentiles (0%, 5%, ..., 100%) of each sample and
    draws the identity line as a reference: points on the line mean equal
    quantiles. The second figure overlays the density-normalized histograms.

    Parameters
    ----------
    dist_1, dist_2 : array-like
        1-D samples to compare. Neither input is modified.

    Returns
    -------
    None
        The function only draws the two figures.
    """
    # np.percentile does not need sorted input, so the samples are never
    # sorted in place (doing so would silently reorder the caller's array).
    sample_1 = np.asarray(dist_1)
    sample_2 = np.asarray(dist_2)

    # Percentiles of both samples on a common grid.
    percs = np.linspace(0, 100, 21)
    q_a = np.percentile(sample_1, percs)
    q_b = np.percentile(sample_2, percs)

    # QQ plot with the identity line spanning the joint quantile range.
    plt.plot(q_a, q_b, ls="", marker="o")
    plt.title("QQ plot")
    plt.xlabel("dist_1 quantiles")
    plt.ylabel("dist_2 quantiles")
    lower = min(q_a.min(), q_b.min())
    upper = max(q_a.max(), q_b.max())
    x = np.linspace(lower, upper)
    plt.plot(x, x, color="k", ls="--")
    plt.show()

    # Overlaid density histograms of the two samples.
    plt.hist(sample_1, alpha=.3, density=True, label="dist_1")
    plt.hist(sample_2, alpha=.3, density=True, label="dist_2")
    plt.legend()
    plt.show()


In [None]:
# Compare the data with the sample stored in `normal_dist` (drawn from the
# fitted uniform distribution in the cell above).
compare_dist(mr_scaled, normal_dist)

We see that our distribution has more weight at the beginning of the interval than at the end, unlike the uniform reference distribution.

### Test 2. Test Exponential distribution

In [None]:
# Fit an exponential distribution to the standardized data.
dist = stats.expon
param = dist.fit(mr_scaled)

# Draw a sample of size len(mr_scaled) from the fitted distribution.
# fit returns (*shape_params, loc, scale). loc and scale are the location and
# scale parameters (for expon: the shift of the origin and 1/lambda), not the
# mean and standard deviation.
# A fixed random_state makes the sample identical on every run.
exp_dist = dist.rvs(*param[:-2], loc=param[-2], scale=param[-1],
                    size=len(mr_scaled), random_state=42)
# Sorting is not needed for the percentile/histogram comparison; it is kept so
# that exp_dist stays in ascending order as before.
exp_dist.sort()

In [None]:
# Compare the data with the sample drawn from the fitted exponential.
compare_dist(mr_scaled, exp_dist)

In this case our distribution has less weight at the beginning of the interval than the reference distribution.

### Test 3. The normal distribution

In [None]:
# Fit a normal distribution to the standardized data.
dist = stats.norm
param = dist.fit(mr_scaled)

# Draw a sample of size len(mr_scaled) from the fitted distribution.
# fit returns (*shape_params, loc, scale); for the normal distribution there
# are no shape parameters, and loc / scale are the mean / standard deviation.
# A fixed random_state makes the sample identical on every run.
norm_dist = dist.rvs(*param[:-2], loc=param[-2], scale=param[-1],
                     size=len(mr_scaled), random_state=42)

compare_dist(mr_scaled, norm_dist)


#### Hypothesis test

In [None]:
from scipy import stats

# Run two normality tests on the standardized data and print each result, in
# this order: D'Agostino (normaltest), then Shapiro (shapiro).
for normality_test in (stats.normaltest, stats.shapiro):
    print(normality_test(mr_scaled))

In [None]:
# Kolmogorov-Smirnov tests.
# One-sample: data against "norm" with its default parameters, i.e. the
# standard normal distribution N(0, 1).
ks_vs_std_normal = stats.kstest(mr_scaled, "norm")
print(ks_vs_std_normal)
# Two-sample: data against the sample drawn from the fitted normal.
ks_vs_norm_sample = stats.kstest(mr_scaled, norm_dist)
print(ks_vs_norm_sample)

### Test 4. The Gamma distribution 

In [None]:
# Fit a gamma distribution to the standardized data.
dist = stats.gamma
param = dist.fit(mr_scaled)

# Draw a sample of size len(mr_scaled) from the fitted distribution.
# fit returns (*shape_params, loc, scale): for gamma that is (a, loc, scale),
# so the shape parameter is forwarded positionally. loc and scale are the
# location and scale parameters, not the mean and standard deviation.
# A fixed random_state makes the sample identical on every run.
gamma_dist = dist.rvs(*param[:-2], loc=param[-2], scale=param[-1],
                      size=len(mr_scaled), random_state=42)

compare_dist(mr_scaled, gamma_dist)


Finally, we see that the Gamma distribution fits our empirical data distribution well.

### Hypothesis test

In [None]:
# Kolmogorov-Smirnov test.
#   Null hypothesis: the two distributions are identical, G(x) = F(x).
#   Here F(x) is represented by gamma_dist, the sample drawn from the fitted
#   gamma, so the data are compared against that sample.
from scipy import stats

ks_vs_gamma_sample = stats.kstest(mr_scaled, gamma_dist)
ks_vs_gamma_sample