# *Modern Deep Learning for Tabular Data*, Appendix

This notebook contains the complementary code discussed in the appendix of *Modern Deep Learning for Tabular Data*.

External Kaggle links to datasets used in this notebook:
- None

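No Kaggle datasets are required for this notebook; the example images in the NumPy section are fetched from Wikimedia Commons at run time. (For notebooks that do list datasets, you can download them from Kaggle, or import the notebooks into Kaggle and connect them internally.)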

---

## Imports

In [None]:
# data management
import numpy as np                   # for linear algebra
import pandas as pd                  # for tabular data manipulation and processing

# machine learning
import sklearn                       # for data prep and classical ML
import tensorflow as tf              # for deep learning
from tensorflow import keras         # for deep learning
import keras.layers as L             # for easy NN layer access

# data visualization and graphics
import matplotlib.pyplot as plt      # for visualization fundamentals
import seaborn as sns                # for pretty visualizations
import cv2                           # for image manipulation

# misc
from tqdm.notebook import tqdm       # for progress bars
import math                          # for calculation
import sys                           # for system manipulation
import os                            # for file manipulation
import urllib.request                # for downloading the example images

---

## NumPy Arrays

Developing a visual representation of arrays and indexing:

In [None]:
def show_heatmap(data, figsize, xticklabels='auto', yticklabels='auto'):
    """Draw an annotated heatmap of `data` on the shared 0-99 colour scale.

    Parameters
    ----------
    data : 2-D array
        The (slice of the) array to display.
    figsize : tuple
        (width, height) of the figure in inches.
    xticklabels, yticklabels : 'auto' or list
        Forwarded to `sns.heatmap`. Pass explicit labels when `data` is a
        slice that does not start at position 0, so the ticks show the
        positions in the original array rather than 0..k.
    """
    plt.figure(figsize=figsize, dpi=400)
    ax = sns.heatmap(data, annot=True, vmin=0, vmax=99,
                     xticklabels=xticklabels, yticklabels=yticklabels)
    plt.yticks(rotation=0)
    # Put the column ticks on top so the picture reads like a printed matrix.
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')
    plt.show()


arr = np.arange(100).reshape((10, 10))

# The full 10 x 10 array.
show_heatmap(arr, figsize=(5, 4))

# Row slices: first five rows, then last five rows.
show_heatmap(arr[0:5], figsize=(5, 2))
show_heatmap(arr[5:10], figsize=(5, 2), yticklabels=[5, 6, 7, 8, 9])

# Column slices: first five columns, then last five columns.
show_heatmap(arr[:, 0:5], figsize=(2.5, 4))
show_heatmap(arr[:, 5:10], figsize=(2.5, 4), xticklabels=[5, 6, 7, 8, 9])

# Slicing both axes at once: the top-left 5 x 5 corner.
show_heatmap(arr[0:5, 0:5], figsize=(2.5, 2))

Try this exercise out! What is the shape of the array returned by the following indexing expression?

In [None]:
# Exercise setup: a five-dimensional array with five entries along every axis.
arr = np.zeros((5,) * 5)
# Work out the shape by hand first, then uncomment the line below to check.
# arr[:, 0, 3:, 1:2, 2:4].shape

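Reference semantics: assigning an array to a new name does not copy it, so a change made through one name shows up through the other. Use `np.copy` when you need an independent array.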

In [None]:
# Case 1: plain assignment. `copy` is just a second name for the same array
# object, so the write through `arr` is visible through `copy` as well.
print('\nWithout Copying')

arr = np.arange(10)
copy = arr
arr[0] = 10

print(copy, arr, sep='\n')

# Case 2: an explicit copy owns its own data, so the later write to `arr`
# leaves `copy` untouched.
print('\nWith Copying')

arr = np.arange(10)
copy = arr.copy()
arr[0] = 10

print(copy, arr, sep='\n')

Array casting:

In [None]:
# `astype` returns a new array with the requested dtype; `arr1` keeps its own.
arr1 = np.array([1, 2, 3])
arr2 = arr1.astype('uint8')

# Show each array next to its dtype.
for a in (arr1, arr2):
    print(f'{a}, {a.dtype}')

Function application and vectorization:

In [None]:
def f(x):
    """Scalar piecewise function: x**2 / 25 for negative x, sin(x) * x**2 otherwise."""
    return x**2/25 if x < 0 else np.sin(x) * x**2

# `f` branches on a scalar comparison, so it cannot take an array directly;
# np.vectorize wraps it so that it is applied element by element.
inputs = np.linspace(-5, 5, 100)
vectorized_f = np.vectorize(f)
outputs = vectorized_f(inputs)
print(outputs)

In [None]:
def f(x, y, z):
    """Return True when the three arguments sum to more than 10, else False."""
    return bool(x + y + z > 10)

# Three equal-length arrays; the vectorized function consumes them in lockstep,
# one element from each per call.
x = np.arange(5)
y = np.arange(7, 2, -1)
z = np.arange(-1, 9, 2)
np.vectorize(f)(x, y, z)

Example: image manipulation.

In [None]:
def load_image(url):
    """Download the image at `url` and return it as an RGB uint8 array of shape (H, W, 3).

    Raises
    ------
    ValueError
        If the downloaded bytes cannot be decoded as an image.
    """
    # NOTE(review): an explicit User-Agent is sent because some hosts are known
    # to reject urllib's default one - confirm against Wikimedia's policy.
    request = urllib.request.Request(
        url, headers={'User-Agent': 'Mozilla/5.0 (notebook example)'})
    with urllib.request.urlopen(request) as response:
        raw = np.frombuffer(response.read(), dtype=np.uint8)
    decoded = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    if decoded is None:
        raise ValueError(f'Could not decode an image from {url}')
    # OpenCV decodes to BGR channel order; matplotlib expects RGB.
    return cv2.cvtColor(decoded, cv2.COLOR_BGR2RGB)


def to_uint8(pixels):
    """Clip `pixels` to [0, 255] and cast to uint8.

    Casting without clipping lets out-of-range values wrap around (e.g. 260
    becomes 4), which shows up as speckle in very bright or dark regions.
    """
    return np.clip(pixels, 0, 255).astype(np.uint8)


def show_image(pixels, cmap=None):
    """Display `pixels` in its own 10 x 5 inch figure; `cmap` only affects 2-D input."""
    plt.figure(figsize=(10, 5), dpi=400)
    plt.imshow(pixels, cmap=cmap)
    plt.show()


# Seeded generator so the noise figures are reproducible between runs.
rng = np.random.default_rng(42)

url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/2/2b/NYC_Downtown_Manhattan_Skyline_seen_from_Paulus_Hook_2019-12-20_IMG_7347_FRD_%28cropped%29.jpg/1920px-NYC_Downtown_Manhattan_Skyline_seen_from_Paulus_Hook_2019-12-20_IMG_7347_FRD_%28cropped%29.jpg'
image = load_image(url)

print('Original Image')
show_image(image)


print('\n\n Displaying the color components of the image')

# Channel order is red, green, blue, so the colormaps are listed in that order.
for i, color in enumerate(['Reds', 'Greens', 'Blues']):
    show_image(image[:, :, i], cmap=color)


# Four ways of collapsing the colour axis into a single grey level.
print('\n\nMean grayscale representation')
show_image(np.mean(image, axis=2), cmap='gray')

print('\n\nMedian grayscale representation')
show_image(np.median(image, axis=2), cmap='gray')

print('\n\nMax grayscale representation')
show_image(np.max(image, axis=2), cmap='gray')

print('\n\nMin grayscale representation')
show_image(np.min(image, axis=2), cmap='gray')


# Additive Gaussian noise. The noise takes its shape from the image itself so
# the cell keeps working if the downloaded thumbnail has different dimensions.
noise_vector = rng.normal(0, 40, image.shape)
altered_image = image + noise_vector
display_image = to_uint8(altered_image)

print('\n\nRandomly distributed noise vector')
# NOTE(review): imshow clips float RGB input to [0, 1], so this renders as
# saturated speckle rather than grey levels - confirm that is the intended figure.
show_image(noise_vector, cmap='gray')

print('\n\nRandom noise applied to image')
show_image(display_image)

# The same noise with a positive mean also brightens the image.
for mean in (100, 200):
    noise_vector = rng.normal(mean, 40, image.shape)
    altered_image = image + noise_vector
    display_image = to_uint8(altered_image)

    print(f'\n\nRandom noise w/ mean {mean} applied to image')
    show_image(display_image)


print('\n\nVarious levels of image contrast')

# Scale every pixel by a constant factor; values above 255 saturate to white.
for factor in [0.2, 0.6, 1.5, 3, 8]:
    altered_image = image * factor
    display_image = to_uint8(altered_image)
    show_image(display_image)


print('\n\nOriginal picture of beasts battling')
url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f4/KK_v_G_trailer_%281962%29.png/440px-KK_v_G_trailer_%281962%29.png'
beasts = load_image(url)
show_image(beasts)


print('\n\nMerged image of beasts battling and the New York City skyline')
# cv2.resize takes (width, height); the bitwise AND then combines the two
# equally sized uint8 images pixel by pixel.
merged = cv2.resize(image, (beasts.shape[1], beasts.shape[0])) & beasts
show_image(merged)


---

## Pandas DataFrames

### Constructing Pandas DataFrames

In [None]:
# Column-oriented construction: each dict key becomes a column name and each
# list holds that column's values.
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [4, 5, 6],
    'c': [7, 8, 9],
})
df

In [None]:
# Row-oriented construction: each inner list is one row, and the column names
# are supplied separately.
rows = [[1, 4, 7],
        [2, 5, 8],
        [3, 6, 9]]
df = pd.DataFrame(rows, columns=['a', 'b', 'c'])
df

### Simple Pandas Mechanics

In [None]:
def makeTable(n = 10):
    """Build an n x n multiplication table as a DataFrame.

    Rows and columns are both labelled 1..n, and the cell at row r, column c
    holds r * c. The frame is created without data and filled one cell at a
    time, so the values are stored as Python objects rather than in a compact
    numeric dtype.

    Parameters
    ----------
    n : int
        Side length of the table. n = 0 gives an empty frame.
    """
    table = pd.DataFrame(index=range(1, n+1),
                         columns=range(1, n+1))
    for num1 in table.columns:
        for num2 in table.index:
            # .at writes into `table` itself. The chained form
            # `table[col][row] = ...` assigns to an intermediate Series, which
            # is not guaranteed to update the frame.
            table.at[num2, num1] = num1 * num2
    return table

table = makeTable(n=100)
table

In [None]:
# A list inside plain [] selects columns by label: here columns 5, 10 and 15.
table[[5, 10, 15]]

In [None]:
# .loc with a single list selects rows by index label: here rows 5, 10 and 15.
table.loc[[5, 10, 15]]

In [None]:
# .loc[rows, columns]: select by row labels and column labels at the same time.
table.loc[[5, 10, 15], [5, 10, 15]]

In [None]:
# reset_index() renumbers the rows from 0 and keeps the old row labels as a
# new 'index' column.
table.loc[[5, 10, 15], [5, 10, 15]].reset_index()

In [None]:
# drop=True renumbers the rows from 0 and discards the old row labels.
table.loc[[5, 10, 15], [5, 10, 15]].reset_index(drop=True)

In [None]:
# .loc slices are label-based and include both endpoints: rows 90 through 100,
# and every third column starting at label 5.
table.loc[90:100, 5:100:3]

In [None]:
# Build a random cyclic relabelling of the columns: after shuffling, each label
# maps to the label that follows it, and the last wraps around to the first.
nums = list(range(1, 101))
np.random.shuffle(nums)
newCol = dict(zip(nums, nums[1:] + nums[:1]))

# rename returns a relabelled copy; `table` itself is left unchanged.
table.rename(columns=newCol)

In [None]:
# Table sizes to measure: powers of two from 1 to 1024.
x = [2**k for k in range(11)]
# Size of each table as reported by sys.getsizeof, in KB (bytes / 1000).
y = [sys.getsizeof(makeTable(n=n))/1000 for n in tqdm(x)]

plt.figure(figsize=(10, 5), dpi=400)
plt.plot(x, y, color='black')
plt.grid()
plt.xlabel('$n$')
plt.ylabel('KB')
# Raw string: '\c' is not a valid escape sequence in a normal string literal.
plt.title(r'Storage Size for Pandas $n \cdot n$ Multiplication DataFrames')
plt.show()

In [None]:
def makeHalfTable(n = 10):
    """Build an n x n multiplication table with only the lower triangle filled.

    Rows and columns are labelled 1..n. Cells on or below the diagonal
    (row >= column) hold row * column; cells above it are never written and
    stay NaN.

    Parameters
    ----------
    n : int
        Side length of the table. n = 0 gives an empty frame.
    """
    table = pd.DataFrame(index=range(1, n+1),
                         columns=range(1, n+1))
    for num1 in table.columns:
        # For column num1, start at the diagonal: position num1 - 1 in the
        # index is the row labelled num1.
        for num2 in table.index[num1-1:]:
            # .at writes into `table` itself. The chained form
            # `table[col][row] = ...` assigns to an intermediate Series, which
            # is not guaranteed to update the frame.
            table.at[num2, num1] = num1 * num2
    return table

makeHalfTable(n=100)

In [None]:
# Same measurement as for the full table, now for the half-filled variant.
yHalf = [sys.getsizeof(makeHalfTable(n=n))/1000 for n in tqdm(x)]

plt.figure(figsize=(10, 5), dpi=400)
plt.plot(x, yHalf, color='red', linestyle='--', label='Half-Filled with NaN')
plt.plot(x, y, color='blue', label='Default')
plt.grid()
plt.legend()
plt.xlabel('$n$')
plt.ylabel('KB')
# Raw string: '\c' is not a valid escape sequence in a normal string literal.
plt.title(r'Storage Size for Pandas $n \cdot n$ Multiplication DataFrames')
plt.show()

In [None]:
def makeHalf0Table(n = 10):
    """Build an n x n lower-triangular multiplication table padded with zeros.

    Rows and columns are labelled 1..n. Cells on or below the diagonal
    (row >= column) hold row * column; the remaining cells are filled with 0.
    The result is converted to the best-fitting dtype, so the returned columns
    are integer-typed rather than object-typed.

    Parameters
    ----------
    n : int
        Side length of the table. n = 0 gives an empty frame.
    """
    table = pd.DataFrame(index=range(1, n+1),
                         columns=range(1, n+1))
    for num1 in table.columns:
        # Start at the diagonal: position num1 - 1 is the row labelled num1.
        for num2 in table.index[num1-1:]:
            # .at writes into `table` itself. The chained form
            # `table[col][row] = ...` assigns to an intermediate Series, which
            # is not guaranteed to update the frame.
            table.at[num2, num1] = num1 * num2
    # Older pandas downcast object columns implicitly inside fillna; that
    # behaviour is deprecated, so the conversion is requested explicitly.
    return table.fillna(0).infer_objects()

# Same measurement again, for the half-filled table whose gaps are set to 0.
yHalf0 = [sys.getsizeof(makeHalf0Table(n=n))/1000 for n in tqdm(x)]

plt.figure(figsize=(10, 5), dpi=400)
plt.plot(x, yHalf, color='red', linestyle='--', label='Half-Filled with NaN')
# makeHalf0Table fills the gaps with 0, so the legend says 0 rather than None.
plt.plot(x, yHalf0, color='green', linestyle='-.', label='Half-Filled with 0')
plt.plot(x, y, color='blue', label='Default')
plt.grid()
plt.legend()
plt.xlabel('$n$')
plt.ylabel('KB')
# Raw string: '\c' is not a valid escape sequence in a normal string literal.
plt.title(r'Storage Size for Pandas $n \cdot n$ Multiplication DataFrames')
plt.show()

### Advanced Pandas Mechanics

In [None]:
# Small example frame: two groups ('A' and 'B') of three rows each, a repeating
# I / II / III label, and a running counter from 1 to 6.
construct_dict = {
    'foo': list('AAABBB'),
    'bar': ['I', 'II', 'III', 'I', 'II', 'III'],
    'baz': range(1, 7),
}
dummy_df = pd.DataFrame(data=construct_dict)
dummy_df

Pivoting

In [None]:
# Work on a copy so `dummy_df` stays available unchanged for the later cells.
mod_dummy_df = dummy_df.copy()
# One row per distinct 'foo' value, one column per distinct 'bar' value, with
# the cells taken from 'baz'.
mod_dummy_df.pivot(index='foo', columns='bar', values='baz')

In [None]:
# assign returns a new frame with the extra column, leaving `dummy_df` as is.
mod_dummy_df = dummy_df.assign(baz2=range(101, 107))
# Pivoting on several value columns at once gives two column levels: the value
# column name on top, the 'bar' label underneath.
mod_dummy_df.pivot(index='foo', columns='bar', values=['baz', 'baz2'])

Melting

In [None]:
# Melt the first three rows: 'baz' is kept as the identifier, while 'foo' and
# 'bar' are unpivoted into (variable, value) pairs, one row per pair.
dummy_df[:3].melt(id_vars=['baz'], value_vars=['foo', 'bar'])

Exploding

In [None]:
# Slice first, then copy: the new frame owns its data, so adding a column below
# does not assign onto a slice of another frame (which triggers pandas'
# SettingWithCopyWarning).
dummy_df_explode = dummy_df[:3].copy()
# One entry per row: two lists of different lengths and one plain scalar.
list_contents = [[1, 2],
                 ['a','b','c'],
                 12]
dummy_df_explode['lists'] = list_contents
# explode gives every list element its own row, repeating the other columns and
# the original index label; the scalar entry is kept as a single row.
dummy_df_explode.explode('lists')

Stacking

In [None]:
# stack moves the column labels into an inner index level, giving one value per
# (row, column) pair; wrapping the result in a DataFrame shows it as a table.
pd.DataFrame(dummy_df[['foo', 'bar']].stack())

Unstacking

In [None]:
# unstack(level=-1) moves the innermost index level (the former column names)
# back into columns, undoing the stack.
dummy_df[['foo', 'bar']].stack().unstack(level=-1)

In [None]:
# unstack(level=0) moves the outer index level (the original row labels) into
# columns instead, so the former column names become the rows.
dummy_df[['foo', 'bar']].stack().unstack(level=0)