# Explore Statistics by Data Visualization

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
%matplotlib inline

We will use a small dataset that contains the Physics, Biology and Maths marks of a class of students.

In [3]:
# Load the students' marks; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("data/grades.csv")

Show the first 5 rows of data.

In [4]:
# Preview the top of the table; n=5 is also the default of head().
df.head(n=5)

Unnamed: 0,Name,Physics,Biology,Maths
0,Arnold,80,78,70
1,Bob,60,98,45
2,Cassandra,34,56,65
3,Donovan,56,65,32
4,Emily,98,23,55


Show all the data entries.

In [5]:
# A bare DataFrame as the last expression is rendered as the cell output.
df

Unnamed: 0,Name,Physics,Biology,Maths
0,Arnold,80,78,70
1,Bob,60,98,45
2,Cassandra,34,56,65
3,Donovan,56,65,32
4,Emily,98,23,55
5,Fabian,45,44,60
6,Grom,12,66,80
7,Hilary,60,88,78
8,Ivan,45,97,99
9,John,98,80,84


Describe the dataset with summary statistics such as the number of entries, mean, standard deviation, minimum, quartiles and maximum.

Show only the column you are interested in (here, `Biology`).

In [6]:
# Selecting one column by label yields a Series; head() previews its first rows.
df["Biology"].head()

0    78
1    98
2    56
3    65
4    23
Name: Biology, dtype: int64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Physics,Biology,Maths
count,26.0,26.0,26.0
mean,63.769231,67.038462,69.769231
std,22.742573,21.371908,17.339683
min,12.0,20.0,32.0
25%,46.25,56.0,60.0
50%,63.0,66.0,68.5
75%,79.5,78.0,83.5
max,98.0,98.0,99.0


Show available columns of data.

In [None]:
# Column labels of the DataFrame as a NumPy array.
df.columns.values

Plot a **bar chart** of the grades data.

In [None]:
# Bar chart through the plot accessor: one group of bars per row of the DataFrame.
df.plot.bar()

Plot a **box plot** of the grades data.

In [None]:
# Box-and-whisker plot with one box per numeric column.
df.boxplot()

Plot the **histograms** of the grades data.

In [None]:
# One histogram subplot per numeric column.
df.hist()

Plot the histogram of "Physics" column.

In [None]:
# Histogram of a single column (Series.hist uses 10 bins unless told otherwise).
df["Physics"].hist()

We can draw a distribution plot (a histogram with a density curve) using the **seaborn** module.

In [None]:
# `sns.distplot` is deprecated (seaborn >= 0.11) and scheduled for removal.
# `histplot` with a KDE overlay and density normalisation is the documented
# replacement and draws the same kind of figure: histogram plus density curve.
splot = sns.histplot(df['Physics'], kde=True, stat="density")

In [None]:
# Sample skewness of the Physics marks: 0 means symmetric, a positive value
# a longer right tail, a negative value a longer left tail.
df["Physics"].skew()

In many Machine Learning modeling scenarios, **normality** of the features in a dataset is desirable. Power transforms are a family of parametric, monotonic transformations that aim to map data from any distribution to as close to a **Gaussian distribution** as possible in order to stabilize variance and **minimize skewness**.

In [None]:
# Box-Cox is only defined for strictly positive inputs.
# standardize=False skips the zero-mean / unit-variance step, so the output
# is the raw power-transformed value.
scaler = preprocessing.PowerTransformer(method='box-cox', standardize=False)

In [None]:
# scikit-learn transformers expect 2-D input of shape (n_samples, n_features).
# Double brackets keep the Physics column as a one-column DataFrame instead of
# collapsing it to a 1-D Series.
physics_2d = df[["Physics"]]
# Estimate the Box-Cox lambda from the data and apply the transform in one step.
physics_scaled = scaler.fit_transform(physics_2d)

In [None]:
# Inspect the values returned by fit_transform.
physics_scaled

In [None]:
# Wrap the transformed values in a DataFrame so the pandas plotting and
# statistics helpers can be applied to them, then draw their histogram.
df_new = pd.DataFrame(data=physics_scaled)
df_new.hist()

In [None]:
# `sns.distplot` is deprecated (seaborn >= 0.11) and scheduled for removal.
# `histplot` accepts the DataFrame directly (wide form) and, with a KDE overlay
# and density normalisation, draws the equivalent histogram-plus-curve figure.
splot = sns.histplot(data=df_new, kde=True, stat="density")

In [None]:
# Skewness of the original Physics marks, shown again for comparison with the
# transformed data.
df["Physics"].skew()

In [None]:
# Skewness of each column of df_new (the power-transformed values at this point).
df_new.skew()

In [None]:
# Draw 4 random integers from the half-open range [1, 100): `high` is exclusive.
np.random.randint(low=1, high=100, size=4)

In [None]:
# Draw a single float uniformly from the half-open interval [0.0, 1.0).
np.random.random_sample()

In [None]:
from scipy.stats import skewnorm

In [None]:
# Draw 10,000 samples from a skew-normal distribution with shape parameter a=5
# (a > 0 gives a right-skewed sample). Fixing random_state makes the sample, and
# every skewness figure derived from it below, reproducible across kernel restarts.
rand_vars = skewnorm.rvs(5, size=10000, random_state=42)

In [None]:
# Histogram of the skew-normal sample on a single Axes (subplots() defaults to
# a 1x1 grid), using matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(rand_vars)
plt.show()

In [None]:
# NOTE: this rebinds `df`, replacing the grades DataFrame loaded from the CSV
# with the synthetic sample. Earlier cells that use grade columns such as
# df["Physics"] will fail if re-run after this cell without reloading the data.
df=pd.DataFrame(rand_vars)

In [None]:
# Skewness of the synthetic sample before any transformation.
df.skew()

In [None]:
from sklearn.preprocessing import FunctionTransformer
# Element-wise log(1 + x) transform; validate=True has scikit-learn check the
# input and convert it to a 2-D array before the function is applied.
transformer = FunctionTransformer(func=np.log1p, validate=True)

In [None]:
# Apply log1p to the sample. transform() is called without a prior fit(),
# which relies on FunctionTransformer having no learned state.
# NOTE(review): log1p is not finite for values <= -1; the sample is assumed
# to stay above that bound. Confirm.
data_trans=transformer.transform(df)

In [None]:
# Rebinds df_new (previously the power-transformed Physics marks) to the
# log-transformed synthetic sample.
df_new=pd.DataFrame(data_trans)

In [None]:
# Histogram of the log-transformed sample on a single Axes
# (subplots() defaults to a 1x1 grid).
fig, ax = plt.subplots()
ax.hist(data_trans)
plt.show()

In [None]:
# Skewness after the log1p transform, to compare with the untransformed sample.
df_new.skew()

In [None]:
from sklearn import preprocessing
# Rebinds `scaler` (earlier a Box-Cox PowerTransformer) to a StandardScaler,
# which rescales each feature to zero mean and unit variance.
scaler = preprocessing.StandardScaler()

In [None]:
# Learn the mean and standard deviation from the sample and apply the scaling
# in one step.
df_scaled=scaler.fit_transform(df)

In [None]:
# Rebinds df_new to the standardised sample.
df_new=pd.DataFrame(df_scaled)

In [None]:
# Standardisation is a linear rescaling (shift and positive scale), which does
# not alter skewness, so this is expected to match the unscaled sample's skewness.
df_new.skew()

In [None]:
# Rebinds `scaler` again. With no `method` argument PowerTransformer uses
# Yeo-Johnson, which (unlike Box-Cox) also accepts zero and negative values.
# standardize=False skips the zero-mean / unit-variance step.
scaler = preprocessing.PowerTransformer(standardize=False)

In [None]:
# Estimate the power-transform parameter from the sample and apply it;
# this overwrites the StandardScaler result held in df_scaled.
df_scaled=scaler.fit_transform(df)

In [None]:
# Wrap the power-transformed sample in a DataFrame and report its skewness.
df_new = pd.DataFrame(data=df_scaled)
df_new.skew()

In [None]:
# Filled, density-normalised histogram of the power-transformed sample on a
# single Axes (subplots() defaults to a 1x1 grid).
fig, ax = plt.subplots()
ax.hist(df_scaled, histtype='stepfilled', density=True)
plt.show()