# Data Cleaning with Python

#### Hosted by Developer Student Club of the University of Florida
#### Tech Leads: Tyler Metzger, Raymond Yu, Sindhu Kandula

In this demo, we will go over some common data cleaning techniques. Specifically, we will discuss how to handle missing or faulty entries and how to apply scaling and normalization to the data.

In [None]:
#Imports necessary libraries
import pandas as pd
import numpy as np

# Load the play-by-play CSV (expected in the current working directory) into a DataFrame
csv_path = "./NFL Play by Play 2009-2016 (v3).csv"
nfl_data = pd.read_csv(csv_path)

## Handling Missing Data

In [None]:
# Preview the first 5 rows to get a feel for the columns and their values
nfl_data.head(n=5)

In [None]:
# Count the missing (NaN) entries in every column
missing_values_count = nfl_data.isna().sum()
# Show the counts for the first 10 columns only
missing_values_count.iloc[:10]

In [None]:
# Calculate the percentage of all cells in the dataset that are missing
# total cells = rows * columns; np.prod is used because the np.product
# alias was removed in NumPy 2.0
total_cells = np.prod(nfl_data.shape)
# sum the per-column missing counts to get the dataset-wide total
total_missing = missing_values_count.sum()
percent_missing = (total_missing / total_cells) * 100
print("%f%% of data is null" % (percent_missing))

### Dropping data

In [None]:
# Drop every row that has at least one missing value; the result is only
# displayed here, nfl_data itself is left unchanged
nfl_data.dropna(axis=0, how="any")

In [None]:
# Drop every column that contains at least one missing value
columns_with_na_dropped = nfl_data.dropna(axis="columns")
# Preview the first 5 rows of what is left
columns_with_na_dropped.head(5)

In [None]:
# Data loss: compare the column counts before and after dropping
original_column_count = len(nfl_data.columns)
remaining_column_count = len(columns_with_na_dropped.columns)
print("Columns in original dataset: %d \n" % original_column_count)
print("Columns without null values: %d" % remaining_column_count)

### Imputation

In [None]:
# Take a small subset: all columns from 'EPA' through 'Season' (label slice,
# both ends included), limited to the first 5 rows
epa_to_season = nfl_data.loc[:, 'EPA':'Season']
subset_nfl_data = epa_to_season.head(5)
subset_nfl_data

In [None]:
# Impute: show the subset with every missing value replaced by 0
# (returns a new DataFrame; subset_nfl_data is not modified)
subset_nfl_data.fillna(value=0)

## Scaling and Normalization

In [None]:
# for Box-Cox Transformation
from scipy import stats

# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt

# Fix NumPy's global random seed so the generated data is the same on every run
np.random.seed(seed=0)

### Scaling

In [None]:
# generate 1000 data points randomly from an exponential distribution
original_data = np.random.exponential(size=1000)

# min-max scale the data: the smallest value maps to 0 and the largest to 1
scaled_data = np.interp(original_data, (original_data.min(), original_data.max()), (0, +1))

# plot both distributions side by side
# sns.distplot is deprecated (since seaborn 0.11) and slated for removal;
# histplot with kde=True and stat="density" draws a comparable
# histogram with a density curve on top
fig, ax = plt.subplots(1, 2)
sns.histplot(original_data, kde=True, stat="density", ax=ax[0])
ax[0].set_title("Original Data")
sns.histplot(scaled_data, kde=True, stat="density", ax=ax[1])
ax[1].set_title("Scaled data")

### Normalization

In [None]:
# normalize the exponential data with a Box-Cox transformation
# stats.boxcox (called without lmbda) returns a tuple:
# (transformed data, fitted lambda), so the data is at index 0
normalized_data = stats.boxcox(original_data)

# plot both distributions side by side
# sns.distplot is deprecated (since seaborn 0.11) and slated for removal;
# histplot with kde=True and stat="density" draws a comparable
# histogram with a density curve on top
fig, ax = plt.subplots(1, 2)
sns.histplot(original_data, kde=True, stat="density", ax=ax[0])
ax[0].set_title("Original Data")
sns.histplot(normalized_data[0], kde=True, stat="density", ax=ax[1])
ax[1].set_title("Normalized data")