# 1. Sourcing and loading
## 1a. Import Packages

In [2]:
import pandas as pd
import numpy as np
from sklearn import tree, metrics
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO
from IPython.display import Image
import pydotplus

## 1b. Load data (tennis.txt)

In [3]:
# Read in the tennis.txt data to a variable called tennis
tennis = pd.read_csv("https://raw.githubusercontent.com/Ben-Liao/MBA6693-Business-Data-Analysis/main/i01-information-based-learning/data/tennis.txt", sep='\t', header = (0))

# Read in the titanic2.txt data to another variable called titanic, as alternative
#titanic = pd.read_csv("https://raw.githubusercontent.com/Ben-Liao/MBA6693-Business-Data-Analysis/main/i01-information-based-learning/data/titanic2.txt", sep='\t', header = (0))


## 1c. Explore the tennis data

In [4]:
# Snap peek the first 8 rows from the data set.
tennis.head(8)

Unnamed: 0,outlook,temperature,humidity,wind,playtennis
0,sunny hot,high,weak,no,
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rain,mild,high,weak,yes
4,rain,cool,normal,weak,yes
5,rain,cool,normal,strong,no
6,overcast,cool,normal,strong,yes
7,sunny,mild,high,weak,no


In [5]:
# Check the size of the data set.
tennis.shape

(14, 5)

The size of the dataset is not large. It contains 14 rows of data and 5 attributes.

In [6]:
# Check the tennis dataset's attributes information.
tennis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   outlook      14 non-null     object
 1   temperature  14 non-null     object
 2   humidity     14 non-null     object
 3   wind         14 non-null     object
 4   playtennis   13 non-null     object
dtypes: object(5)
memory usage: 688.0+ bytes


In [7]:
# Get the relevant summary statistics of the tennis data.
tennis.describe(include = 'all')

Unnamed: 0,outlook,temperature,humidity,wind,playtennis
count,14,14,14,14,13
unique,4,4,3,3,2
top,rain,mild,normal,weak,yes
freq,5,6,7,7,9


# 2. Cleaning, transforming and visualizing
## 2a. Cleaning the data

First, update the column header name of `outlook`  to **weather**, and `playtennis` to `play_tennis`. 

In [8]:
# Call .columns on tennis data to check out the names of tennis data's columns
tennis.columns

Index(['outlook', 'temperature', 'humidity', 'wind', 'playtennis'], dtype='object')

In [9]:
# Using .rename(),  to rename the column.
tennis.rename(columns = {"outlook":"weather", "playtennis":"play_tennis"}, inplace = True)

In [10]:
# Check out the column names 
tennis.columns

Index(['weather', 'temperature', 'humidity', 'wind', 'play_tennis'], dtype='object')

In [12]:
#Verify on the weather column, inspect any value need to changed.
tennis['weather'].describe()

count       14
unique       4
top       rain
freq         5
Name: weather, dtype: object

In [13]:
# Call uniqure() on the "weather" column to check its unique value
tennis['weather'].unique()

array(['sunny   hot', 'sunny', 'overcast', 'rain'], dtype=object)

There is a bunch of inconsistency in weather. such 'sunny hot' and 'sunny'. There already a tempeture column to indicate is it hot or not, will replace `sunny hot` to `sunny`.

In [17]:
#User the .replace() function to replace the 'sunny hot' to 'sunny'
tennis["weather"] = tennis["weather"].replace(["sunny   hot"], "sunny");

# Recheck the result on the weather column
tennis['weather'].unique()

array(['sunny', 'overcast', 'rain'], dtype=object)

In [18]:
# Call uniqure() on the "temperature" column to check its unique value
tennis['temperature'].unique()

array(['high', 'hot', 'mild', 'cool'], dtype=object)

We want to replace `high` to `hot` in the temperature column.

In [19]:
#User the .replace() function to replace the 'high' to 'hot'
tennis["temperature"] = tennis["temperature"].replace(["high"], "hot");

# Recheck the result on the weather column
tennis['temperature'].unique()

array(['hot', 'mild', 'cool'], dtype=object)

In [20]:
# Call uniqure() on the "humidity" column to check its unique value
tennis['humidity'].unique()

array(['weak', 'high', 'normal'], dtype=object)

We want to replace the value `weak` to `low`.

In [21]:
#User the .replace() function to replace the 'high' to 'hot'
tennis["humidity"] = tennis["humidity"].replace(["weak"], "low");

# Recheck the result on the weather column
tennis['humidity'].unique()

array(['low', 'high', 'normal'], dtype=object)

In [22]:
# Call uniqure() on the "wind" column to check its unique value
tennis['wind'].unique()

array(['no', 'strong', 'weak'], dtype=object)

The value on `wind` column looks good, nothing need to change or clean.

In [25]:
# Call uniqure() on the "play_tennis" column to check its unique value
tennis['play_tennis'].unique()

array([nan, 'no', 'yes'], dtype=object)

The value on `play_tennis` column looks good, nothing need to change or clean.

In [26]:
# Check the tennis data information
tennis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   weather      14 non-null     object
 1   temperature  14 non-null     object
 2   humidity     14 non-null     object
 3   wind         14 non-null     object
 4   play_tennis  13 non-null     object
dtypes: object(5)
memory usage: 688.0+ bytes
