In [1]:
# Dependencies
import pandas as pd

# Choosing iris data

In [2]:
# Read the iris CSV from the Resources folder and preview its first five rows
file_path = "Resources/iris.csv"
iris_df = pd.read_csv(filepath_or_buffer=file_path)
iris_df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Remove the "class" label column; unsupervised learning is run on the
# feature columns only, so the label is left out of the prepared data.
new_iris_df = iris_df.drop(columns=["class"])
new_iris_df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
# Write the label-free iris data to CSV, leaving the index out of the file
output_file_path = "Resources/new_iris_data.csv"
new_iris_df.to_csv(path_or_buf=output_file_path, index=False)

# Preprocessing shopping data

## Questions for Data Preparation
### Before we begin, consider these questions:
- What knowledge do we hope to glean from running an unsupervised learning model on this dataset?
- What data is available? What type? What is missing? What can be removed?
- Is the data in a format that can be passed into an unsupervised learning model?
- Can I quickly hand off this data for others to use?


In [5]:
# Read the shopping CSV (decoded as ISO-8859-1) and preview its first five rows
file_path = "Resources/shopping_data.csv"
df_shopping = pd.read_csv(filepath_or_buffer=file_path, encoding="ISO-8859-1")
df_shopping.head(5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [6]:
# Number of rows in the freshly loaded shopping data
len(df_shopping.index)

203

In [7]:
# List the column labels of the shopping data
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [8]:
# Show the dtype pandas assigned to each column
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

## Remove Null Values

In [9]:
# Report how many null values each column contains.
# A space separates the count from "null values" so the message reads
# "has 2 null values." rather than "has 2null values.".
for column in df_shopping.columns:
    print(f"Column {column} has {df_shopping[column].isnull().sum()} null values.")

Column CustomerID has 0null values.
Column Card Member has 2null values.
Column Age has 2null values.
Column Annual Income has 0null values.
Column Spending Score (1-100) has 1null values.


In [10]:
# Discard every row that has a null in any column
df_shopping = df_shopping.dropna(axis=0, how="any")

In [11]:
# Row count after dropping nulls.
# It falls from 203 to 200 rather than lower because the five null values
# reported above sit in only three distinct rows.
len(df_shopping.index)

200

## Removing unnecessary data and converting data

In [12]:
# Count rows that exactly repeat an earlier row
duplicate_count = df_shopping.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


In [13]:
# Remove the CustomerID column.
# The result is assigned back instead of using inplace=True, matching how the
# dropna step rebinds df_shopping and keeping every transformation explicit.
df_shopping = df_shopping.drop(columns=["CustomerID"])
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [14]:
# Encode the Card Member flag numerically: "Yes" -> 1, anything else -> 0
def change_string(member):
    """Return 1 when ``member`` equals "Yes", otherwise 0."""
    return 1 if member == "Yes" else 0
# Replace the Yes/No strings in "Card Member" with their numeric encoding.
# NOTE: run this cell only once per load -- change_string returns 0 for any
# value other than "Yes", so a second run would turn every entry into 0.
df_shopping["Card Member"] = df_shopping["Card Member"].map(change_string)
df_shopping.head(5)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [15]:
# Rescale Annual Income by dividing every value by 1000.
# NOTE: run this cell only once per load -- each run divides again.
df_shopping["Annual Income"] = df_shopping["Annual Income"].div(1000)
df_shopping.head(5)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [18]:
# Column labels as a plain Python list
df_shopping.columns.tolist()

['Card Member', 'Age', 'Annual Income', 'Spending Score (1-100)']

In [19]:
# Reformat column names to remove spaces and numbers.
# rename() returns a new DataFrame and leaves the original untouched, so the
# result is assigned back to df_shopping; without the assignment the new names
# are discarded and the cleaned CSV is written with the old headers.
# "Age" needs no change, so it is not listed in the mapping.
df_shopping = df_shopping.rename(
    columns={
        "Card Member": "CardMember",
        "Annual Income": "AnnualIncome",
        "Spending Score (1-100)": "SpendingScore",
    }
)
df_shopping


Unnamed: 0,CardMember,Age,AnnualIncome,SpendingScore
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
...,...,...,...,...
198,0,35.0,120.0,79.0
199,0,45.0,126.0,28.0
200,1,32.0,126.0,74.0
201,1,32.0,137.0,18.0


## Saving data in a readable format

In [20]:
# Write the cleaned shopping data to CSV, leaving the index out of the file
file_path = "Resources/shopping_data_cleaned.csv"
df_shopping.to_csv(path_or_buf=file_path, index=False)