# Data Transformation using Python Pandas

### Import dependencies

In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# Show every column when a DataFrame is displayed (pandas collapses some
# columns of wide frames unless this option is set).
pd.options.display.max_columns = None

In [3]:
# Load the 2019 season tables for both teams.
# NOTE(review): the paths have no ".csv" extension — confirm the files on disk
# are named exactly like this.
nationals_df = pd.read_csv(filepath_or_buffer="2019_washington_nationals")
astros_df = pd.read_csv(filepath_or_buffer="2019_houston_astros")

In [4]:
# Remove the non-player rows, addressed by their index labels; drop returns a
# new DataFrame and leaves nationals_df untouched.
nationals = nationals_df.drop(index=[8, 22, 52, 53, 54, 55, 56])

In [5]:
# Use the info method to inspect the dtype and non-null count of each column.
# In the output below every column is reported as `object`, which is what the
# dtype conversions later in the notebook address.
nationals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 51
Data columns (total 28 columns):
Rk      50 non-null object
Pos     50 non-null object
Name    50 non-null object
Age     50 non-null object
G       50 non-null object
PA      50 non-null object
AB      50 non-null object
R       50 non-null object
H       50 non-null object
2B      50 non-null object
3B      50 non-null object
HR      50 non-null object
RBI     50 non-null object
SB      50 non-null object
CS      50 non-null object
BB      50 non-null object
SO      50 non-null object
BA      36 non-null object
OBP     36 non-null object
SLG     36 non-null object
OPS     36 non-null object
OPS+    36 non-null object
TB      50 non-null object
GDP     50 non-null object
HBP     50 non-null object
SH      50 non-null object
SF      50 non-null object
IBB     50 non-null object
dtypes: object(28)
memory usage: 11.3+ KB


#### Relabel columns for readability

In [None]:
# Step 1: Create a dictionary in which each key is an original column name and
# each value is the new, more descriptive name.
# NOTE(review): a few labels are misspelled (e.g. "Plate_Apperance",
# "Caugt_Stealing"); they are left as-is because other cells refer to columns
# by these exact names — rename consistently if you change them.
nat_col_names = {
    "Rk": "Rank",
    "Pos": "Position",
    "G": "Games_Played",
    "PA": "Plate_Apperance",
    "AB": "At_Bat",
    "R": "Runs",
    "H": "Hits",
    "SB": "Stolen_Bases",
    "CS": "Caugt_Stealing",
    "BB": "Base_Balls",
    "SO": "Strike_Outs",
    "BA": "Hits_at_Bat",
    "TB": "Total_Bases",
    "HBP": "Hits_by_Pitch",
}

In [None]:
# Step 2: apply the mapping along the column axis. rename returns a new
# DataFrame, so the result is bound back to `nationals`.
nationals = nationals.rename(nat_col_names, axis="columns")

# Memory Reduction

In [None]:
# Inspect the current data type of each column using dtypes
# (this is the state before any conversion is applied).
nationals.dtypes

#### Find memory usage with memory_usage method

In [None]:
# Per-column memory usage before any dtype change. deep=True asks pandas to
# measure the contents of object-dtype columns rather than just the references.
nationals_mem = nationals.memory_usage(deep=True)
# Last expression of the cell, so the notebook displays the resulting Series.
nationals_mem

#### Reduce memory usage by changing datatypes

In [None]:
# Convert the categorical column using the astype method; the converted Series
# replaces the original "Position" column.
nationals["Position"] = nationals["Position"].astype('category')

#### Calling astype on a single column converts only that column. To change multiple columns in one code block, use a for loop (alternatively, `DataFrame.astype` accepts a `{column: dtype}` dictionary).

#### Step1: Create lists of columns by the final data type

In [None]:
# Columns that are categorical or plain strings (never converted to numbers)
non_num_cols = ["Position", "Name"]

In [None]:
# These columns have missing values and decimals, so they must become floats
float_cols = [
    "Hits_at_Bat",
    "OBP",
    "SLG",
    "OPS",
    "OPS+",
]

In [None]:
# Merge the text columns and the float columns into one list of everything
# that must NOT be converted to integers.
non_int_cols = [*non_num_cols, *float_cols]

In [None]:
# Use a list comprehension to generate the list of columns that contain integer
# values: every column of the DataFrame that is not listed in non_int_cols.
# Hand-written alternative, kept for reference:
# int_cols = ["Age","Base_Balls", "Games_Played", "Plate_Apperance","At_Bat","Runs","Hits","2B","Strike_Outs"]
int_cols = [col for col in nationals.columns if col not in non_int_cols]

#### Step2: Use a for loop to change the data types of multiple columns

In [None]:
# Convert every column collected in int_cols to 64-bit integers, one column at
# a time; astype returns a new Series, so it is assigned back to the column.
for col in int_cols:
    nationals[col] = nationals[col].astype('int64')

In [None]:
# Convert the columns listed in float_cols to floats, one column at a time;
# astype returns a new Series, so it is assigned back to the column.
for col in float_cols:
    nationals[col] = nationals[col].astype('float')

In [None]:
# Confirm the datatype changes made by the conversion loops
nationals.dtypes

In [None]:
# Measure memory usage again after the dtype conversions, with the same
# deep=True setting as the first measurement so the two are comparable.
updated_nationals_mem = nationals.memory_usage(deep=True)

In [None]:
# Place the before/after memory Series side by side as two columns.
mem_df = pd.concat(
    [nationals_mem, updated_nationals_mem],
    axis="columns",
)

In [None]:
# Give the two concatenated memory columns descriptive names.
mem_df.columns = ['Pre Change', 'Post Change']

In [None]:
# Absolute change: post minus pre, so a negative value means memory was saved.
mem_df['Absolute Change'] = mem_df['Post Change'].sub(mem_df['Pre Change'])
# Percent change: the same difference expressed as a fraction of the original size.
mem_df["Percent Change"] = mem_df['Absolute Change'].div(mem_df['Pre Change'])

In [None]:
# Render the 'Percent Change' column as a percentage with two decimals.
# The Styler is the last expression of the cell, so the notebook displays it.
format_dict = {'Percent Change': '{:.2%}'}
mem_df.style.format(format_dict)

In [None]:
def color_negative_red(val):
    """Return the CSS colour rule for one cell value.

    Values below zero are rendered in red; everything else in black.
    """
    if val < 0:
        return 'color: red'
    return 'color: black'

# Apply color_negative_red to every cell of mem_df and display the styled table.
# NOTE(review): newer pandas releases rename Styler.applymap to Styler.map —
# confirm against the installed pandas version.
mem_df.style.applymap(color_negative_red)

In [None]:
# Display the plain (unstyled) memory comparison table.
mem_df

#### Data to plot

In [None]:
# Labels: one entry per DataFrame column
labels = nationals.columns.tolist()

# Data values: memory before and after the dtype changes.
# The first entry of each memory Series belongs to the index; it is sliced off
# because the labels list does not include "index".
pre_memory = list(nationals_mem.values[1:])
post_memory = list(updated_nationals_mem.values[1:])

#### Prepare plots

In [None]:
# the label locations: one integer position per entry in labels
x = np.arange(len(labels))

# the width of the bars (each pair of bars is drawn at x - width/2 and x + width/2)
width = 0.35  

In [None]:
# Confirm that the number of labels and values are equal for both series.
# The result is printed explicitly because a notebook only echoes the last
# bare expression of a cell.
print(len(labels) == len(pre_memory) == len(post_memory))
print(len(labels), len(pre_memory), len(post_memory))

In [None]:
# Creates just a figure and only one subplot
fig, ax = plt.subplots()
# Code for the bars, one pair per column: pre-change on the left, post-change on the right
rects1 = ax.bar(x - width/2, pre_memory, width, label='Pre Dtype Change')
rects2 = ax.bar(x + width/2, post_memory, width, label='Post Dtype Change')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Memory (bytes)')
ax.set_title('Impact of Data Types on Memory')
# Put one tick under each pair of bars and label it with the column name;
# rotate the labels so that many column names do not overlap.
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=90)

# To set the legend box outside of the plot
chartBox = ax.get_position()
ax.set_position([chartBox.x0, chartBox.y0, chartBox.width*0.6, chartBox.height])
# bbox_to_anchor anchors the legend box, bbox_to_anchor=(x, y)
ax.legend(loc='upper center', bbox_to_anchor=(1.2, 1), shadow=False, ncol=1)

# NOTE(review): tight_layout may recompute the axes position set above —
# confirm the legend is still fully visible in the rendered figure.
fig.tight_layout()
plt.show()