In [2]:
# Import libraries
import pandas as pd

In [4]:
# Read the iris CSV into a DataFrame and preview its first five rows.
file_path = "Resources/iris.csv"
iris_df = pd.read_csv(filepath_or_buffer=file_path)
iris_df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# The "class" column holds the species label as text; remove it so that only
# the numeric measurement columns remain for the unsupervised model.
new_iris_df = iris_df.drop(columns=["class"])
new_iris_df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Reorder the columns: both length measurements first, then both widths.
new_df = [
    "sepal_length",
    "petal_length",
    "sepal_width",
    "petal_width",
]
# Select every row and the columns in the order listed above.
new_iris_df = new_iris_df.loc[:, new_df]
new_iris_df

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2
...,...,...,...,...
145,6.7,5.2,3.0,2.3
146,6.3,5.0,2.5,1.9
147,6.5,5.2,3.0,2.0
148,6.2,5.4,3.4,2.3


In [54]:
# Save the reordered iris frame as a new CSV; index=False keeps the row index
# out of the file.
output_file_path = "Resources/new_iris_data.csv"
new_iris_df.to_csv(path_or_buf=output_file_path, index=False)

## Preprocessing Data

In [55]:
# Read the raw shopping data and preview its first five rows.
file_path = "Resources/shopping_data.csv"
df_shopping = pd.read_csv(filepath_or_buffer=file_path)
df_shopping.head(n=5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


## Data Selection and Processing

Before we begin, consider these questions:

What knowledge do we hope to glean from running an unsupervised learning model on this dataset?

What data is available? What type? What is missing? What can be removed?

Is the data in a format that can be passed into an unsupervised learning model?

Can I quickly hand off this data for others to use?

In [56]:
# What data is available? Show the column labels of the shopping frame.
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [57]:
# What type is each column? Show the dtype per column; "Card Member" comes back
# as object rather than a numeric dtype, so it needs encoding before modelling.
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [58]:
# What is missing? Count the null entries of every column in one pass, then
# report each column's count on its own line.
for column, null_count in df_shopping.isnull().sum().items():
    print(f"Column {column} has {null_count} null values")

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


In [59]:
# What can be removed? Any row that contains at least one missing value.
df_shopping = df_shopping.dropna(axis="index", how="any")

In [60]:
    # Find duplicate entries
# duplicated() flags each row that repeats an earlier one; summing the flags
# gives the number of repeated rows.
duplicate_count = df_shopping.duplicated().sum()
print(f"Duplicate entries: {duplicate_count}")

Duplicate entries: 0


In [61]:
# Drop the CustomerID column: an identifier gives no insight into customer behaviour.
# Reassign instead of mutating in place so the cell can be re-run safely:
# errors="ignore" turns the drop into a no-op when CustomerID is already gone,
# where the in-place version raised KeyError on a second run.
df_shopping = df_shopping.drop(columns=["CustomerID"], errors="ignore")
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [62]:
# Is the data in a format that can be passed into an unsupervised learning model?
# Not yet: the "Card Member" strings have to be turned into numbers first.
def change_string(member):
    """Encode a card-membership value as an integer flag.

    Args:
        member: A raw membership value such as "Yes" or "No", or a flag that
            this function has already produced (1 or 0).

    Returns:
        1 when ``member`` is "Yes" or is already 1; 0 for every other value.
    """
    # Letting an existing 1 pass through keeps the encoding idempotent:
    # re-running the cell that applies this function no longer turns every
    # previously encoded member into 0.
    if member == "Yes" or member == 1:
        return 1
    return 0
    
# Run change_string over every "Card Member" entry, then write the encoded
# values back over the original column.
card_member_encoded = df_shopping["Card Member"].apply(change_string)
df_shopping["Card Member"] = card_member_encoded
df_shopping.head(n=5)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [63]:
# Annual Income is on a much larger scale than the other columns, so express it in thousands.
# Divide the income column by 1000 and store the result back in place.
# NOTE(review): the column is overwritten, so running this cell a second time
# divides the values by 1000 again.
df_shopping["Annual Income"] = df_shopping["Annual Income"].div(1000)
df_shopping.head(n=5)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [67]:
# Give the columns names without spaces or parentheses.
df_shopping = df_shopping.rename(
    columns={
        "Card Member": "CardMember",
        "Annual Income": "AnnualIncome",
        "Spending Score (1-100)": "SpendingScore",
    }
)
df_shopping

In [70]:
# Can I quickly hand off this data for others to use?
# Yes: write the cleaned frame to its own CSV, leaving the row index out.
file_path = "Resources/shopping_data_cleaned.csv"
df_shopping.to_csv(path_or_buf=file_path, index=False)