https://towardsdatascience.com/a-starter-pack-to-exploratory-data-analysis-with-python-pandas-seaborn-and-scikit-learn-a77889485baf

# A Starter Pack to Exploratory Data Analysis with Python, pandas, seaborn, and scikit-learn

## 1. What is Data
https://drive.mindmup.com/map/1ibxiwNDkPfGwMRZ0utIjBIoyQoVtMD2m

## 2. Categorical Analysis

In [None]:
# IPython line magic: list the contents of the current working directory.
%ls

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the Titanic training CSV; the path is relative to the working directory.
train_df = pd.read_csv("data/titanic/train.csv")
# Trailing expression: the notebook displays the first rows of the frame.
train_df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_df.describe()

In [None]:
# Per-column dtype and non-null count; useful for spotting columns with missing values.
train_df.info()

In [None]:
def categorical_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', verbose=True):
    '''
    Helper function that gives a quick summary of a given column of categorical data

    Arguments
    =========
    dataframe: pandas dataframe
    x: str. horizontal axis to plot the labels of categorical data, y would be the count
    y: str. vertical axis to plot the labels of categorical data, x would be the count
    hue: str. if you want to compare it another variable (usually the target variable)
    palette: array-like. Colour of the plot
    verbose: bool. if True, also print a separator line and the value counts of the column

    Exactly one of x / y is normally given; if both are given, the printed
    statistics describe the x column.

    Returns
    =======
    None. Prints quick stats of the column and shows the count plot

    Raises
    ======
    ValueError: if neither x nor y is given
    '''
    # Fail early with a clear message instead of an opaque KeyError on dataframe[None].
    if x is None and y is None:
        raise ValueError('categorical_summarized needs a column name in either x or y')

    # x takes precedence when both are supplied.
    column_interested = y if x is None else x
    series = dataframe[column_interested]
    print(series.describe())
    print('mode: ', series.mode())
    if verbose:
        print('='*80)
        print(series.value_counts())

    sns.countplot(x=x, y=y, hue=hue, data=dataframe, palette=palette)
    plt.show()

In [None]:
# Target Variable: Survival
# Two-colour palette; the plotting cells further down reuse it.
c_palette = ['tab:blue', 'tab:orange']
# The column is passed as y, so the categories go on the vertical axis and the counts on the horizontal one.
categorical_summarized(train_df, y = 'Survived', palette=c_palette)

In [None]:
# Feature Variable: Gender
# Count passengers by 'Sex'; hue splits each bar by the 'Survived' value.
categorical_summarized(train_df, y = 'Sex', hue='Survived', palette=c_palette)

In [None]:
def quantitative_summarized(dataframe, x=None, y=None, hue=None, palette='Set1', ax=None, verbose=True, swarm=False):
    '''
    Helper function that prints quick statistics of a quantitative column and draws its box plot

    Arguments
    =========
    dataframe: pandas dataframe
    x: str. categorical column for the horizontal axis (usually the target variable)
    y: str. quantitative column for the vertical axis; the printed statistics describe this column
    hue: str. another categorical column to split each box by (usually the target variable if x is another variable)
    palette: array-like. Colour of the plot
    ax: matplotlib axes to draw on; passed straight through to seaborn
    verbose: bool. if True, also print a separator line and the value counts of the column
    swarm: bool. if True, a swarm plot is overlaid on the box plot

    Returns
    =======
    None. Prints quick stats of the column and shows the box plot
    '''
    column = dataframe[y]
    print(column.describe())
    print('mode: ', column.mode())
    if verbose:
        print('='*80)
        print(column.value_counts())

    # The box plot and the optional swarm overlay take exactly the same arguments.
    plot_kwargs = dict(x=x, y=y, hue=hue, data=dataframe, palette=palette, ax=ax)
    sns.boxplot(**plot_kwargs)
    if swarm:
        sns.swarmplot(**plot_kwargs)

    plt.show()

In [None]:
# univariate analysis
# No x is given, so 'Age' is drawn as a single box (with swarm overlay) over all rows.
quantitative_summarized(dataframe= train_df, y = 'Age', palette=c_palette, verbose=False, swarm=True)

In [None]:
# bivariate analysis with target variable
# One 'Age' box per 'Survived' value.
quantitative_summarized(dataframe= train_df, y = 'Age', x = 'Survived', palette=c_palette, verbose=False, swarm=True)

In [None]:
# multivariate analysis with Embarked variable and Pclass variable
# 'Age' boxes grouped by 'Embarked' and split by 'Pclass'; no palette is passed, so the function default applies.
quantitative_summarized(dataframe= train_df, y = 'Age', x = 'Embarked', hue = 'Pclass', verbose=False, swarm=False)

In [None]:
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [None]:
def simple_preprocessing(dataframe, train=True):
    '''
    Helper function that turns the raw passenger frame into model-ready features

    Arguments
    =========
    dataframe: pandas dataframe. must contain the columns PassengerId, Cabin, Name, Ticket, Age, Embarked and Sex
               (and Survived when train is True)
    train: bool. if True, split the 'Survived' column off as the target

    Returns
    =======
    train=True:  (X, y). X is the feature frame without 'Survived';
                 y is the dummy-encoded 'Alive' / 'Dead' target
    train=False: X only. a 'Survived' column, if present in the input, is left in X
    '''
    # drop() returns a new frame, so the caller's dataframe is not modified.
    features = dataframe.drop(['PassengerId', 'Cabin', 'Name', 'Ticket'], axis=1)

    # Missing ages are filled with the most frequent age.
    most_common_age = features['Age'].mode()[0]
    features['Age'] = features['Age'].fillna(value=most_common_age)

    # Missing ports are filled with the most frequent one, then encoded as integers.
    most_common_port = features['Embarked'].mode()[0]
    embarked_filled = features['Embarked'].fillna(value=most_common_port)
    features['Embarked'] = LabelEncoder().fit_transform(embarked_filled)

    # 'male' -> 1, anything else -> 0.
    features['Sex'] = np.where(features['Sex'] == 'male', 1, 0)

    if not train:
        return features

    features = features.drop(['Survived'], axis=1)
    survival_labels = np.where(dataframe['Survived'] == 1, 'Alive', 'Dead')
    target = pd.get_dummies(survival_labels, columns=['Survived'])
    return features, target

In [None]:
# train=False returns only the feature frame; on that path 'Survived' is not dropped, so it stays in `samples`.
samples = simple_preprocessing(train_df, train = False)

In [None]:
# Preview the preprocessed features.
samples.head()

In [None]:
# Inertia ("elbow") plot
# Inertia measures how tightly the points sit around their cluster centre.
# It generally keeps falling as k grows, so a good k is one that already gives
# a low inertia while still being a small number of clusters.

# Select Scaler
scaler = preprocessing.MinMaxScaler()
# scaler = preprocessing.RobustScaler()
# scaler = preprocessing.StandardScaler()

# NOTE: this rebinds `samples` from the preprocessed frame to the scaler's output.
samples = scaler.fit_transform(samples)

ks = range(1, 10)
inertias = []
for k in ks:
    # fit() returns the estimator itself, so construction and fitting are chained.
    k_mean = KMeans(n_clusters=k).fit(samples)
    inertias.append(k_mean.inertia_)

plt.plot(ks, inertias, '-o', c='b')
# plt.title('Inertia Plot')
plt.xticks(ks)
plt.xlabel('Number of clusters, k')
plt.ylabel('Inertia')
# After the loop `k_mean` is the last model fitted, so only the inertia for the largest k is printed.
print('Inertial of clusters: ', k_mean.inertia_)
plt.show()

In [None]:
# Rebuild the unscaled feature frame; scaling now happens inside the pipeline.
samples = simple_preprocessing(train_df, train=False)
# NOTE(review): with train=False the 'Survived' column stays in `samples`, so the
# clusterer sees the target as one of its features -- confirm this is intended.

# Select Scaler
scaler = preprocessing.MinMaxScaler()
# scaler = preprocessing.RobustScaler()
# scaler = preprocessing.StandardScaler()

# Chain scaling and clustering so the scaler is fitted on the training split only.
k_mean = KMeans(n_clusters=2)
pipeline = make_pipeline(scaler, k_mean)

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
sample_train, sample_val, gt_train, gt_val = train_test_split(
    samples,
    train_df['Survived'],
    test_size=0.2,
    random_state=99,
)

# labels = k_mean.predict(samples)
# centroids = k_mean.cluster_centers_
# centroids_x = centroids[:,0]
# centroids_y = centroids[:,1]

# fit() returns the pipeline itself; cluster labels are then predicted for the held-out rows.
labels = pipeline.fit(sample_train).predict(sample_val)
print('Inertial of clusters: ', k_mean.inertia_)

In [None]:
# Pair the cluster label predicted for each held-out row with its true 'Survived' value.
ct_df = pd.DataFrame({'labels': labels, 'survived': gt_val})
# Cross-tabulate cluster label (rows) against survival (columns).
ct = pd.crosstab(ct_df['labels'], ct_df['survived'])
print(ct)

In [None]:
import matplotlib as mpl

# Scatter the training rows ('Survived' against 'Age'), coloured by the cluster
# the fitted pipeline assigns to each row.
fig = plt.figure(figsize=(8, 10))
# NOTE: rcParams is global, so this default colormap also applies to later plots.
mpl.rcParams['image.cmap'] = 'jet'
# Rebinds `labels`: these are now the cluster labels of the training split.
labels = pipeline.predict(sample_train)
x_label = 'Survived'
y_label = 'Age'

plt.scatter(sample_train[x_label], sample_train[y_label], c=labels, alpha=0.3)
plt.xlabel(x_label)
# One tick per distinct value. Passing the whole column would create one
# duplicate tick (and tick label) for every row of the training split.
plt.xticks(sample_train[x_label].unique())
plt.ylabel(y_label)
plt.show()

In [None]:
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram

In [None]:
# Split the raw frame (not the preprocessed one), so the held-out rows keep their
# 'Name' column; only 5% of the rows are held out.
# This rebinds sample_train, sample_val, gt_train and gt_val from the clustering cell.
sample_train,sample_val, gt_train, gt_val = train_test_split(
    train_df, 
    train_df['Survived'],
    test_size=0.05, 
    random_state=99
)

In [None]:
from sklearn.base import clone

# Preprocess the held-out rows; train=False returns only the feature frame.
sample_val_processed = simple_preprocessing(sample_val, train=False)
# `scaler` is the same object that was handed to make_pipeline, so calling
# fit_transform on it directly would refit the pipeline's scaling step and
# silently change later pipeline.predict() results. Fit an unfitted copy with
# the same parameters instead.
val_scaler = clone(scaler)
sample_val_processed = val_scaler.fit_transform(sample_val_processed)

In [None]:
# Hierarchical clustering of the scaled held-out rows using complete linkage.
mergings = linkage(sample_val_processed, method='complete')

In [None]:
# Draw the merge tree, labelling each leaf with the passenger's name.
fig = plt.figure(figsize = (16,10))
# The labels come from sample_val, the frame the clustered rows were derived from.
dendrogram(mergings,
           labels=np.array(sample_val['Name']),
           leaf_rotation=90,
           leaf_font_size=10)
plt.show()