In [1]:
# Import Libraries
import pandas as pd

In [2]:
# Read the file into a DataFrame
file_path = "data/iris.csv"
iris_df = pd.read_csv(file_path)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Dataset information
https://archive.ics.uci.edu/ml/datasets/iris

In [3]:
# Let's drop the class column
new_iris_df = iris_df.drop(['class'], axis=1)
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
# Reorder the columns so the sepal and petal lengths are the first two columns and the widths are the last two columns.
new_iris_df = new_iris_df.reindex(columns= ['sepal_width','petal_width','sepal_length','petal_length'])
new_iris_df.head()

Unnamed: 0,sepal_width,petal_width,sepal_length,petal_length
0,3.5,0.2,5.1,1.4
1,3.0,0.2,4.9,1.4
2,3.2,0.2,4.7,1.3
3,3.1,0.2,4.6,1.5
4,3.6,0.2,5.0,1.4


In [5]:
# Save the processed to a csv file
output_file_path = 'data/new_iris_data.csv'
new_iris_df.to_csv(output_file_path, index=False)

In [6]:
# Load new shopping data file
file_path = 'data/shopping_data.csv'
shopping_df = pd.read_csv(file_path, encoding='ISO-8859-1')
shopping_df.head(5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


![image-2.png](attachment:image-2.png)

### What data is available?

In [7]:
# Columns
shopping_df.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

### What type of data is available?

In [8]:
shopping_df.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

### What data is missing?

#### How to handle missing data
https://towardsdatascience.com/how-to-handle-missing-data-8646b18db0d4

In [9]:
# Check how many rows we have in the dataset
shopping_df.shape

(203, 5)

In [10]:
# Check for Null values
for column in shopping_df.columns:
    print(f'Coulumn {column} has {shopping_df[column].isnull().sum()} null values')

Coulumn CustomerID has 0 null values
Coulumn Card Member has 2 null values
Coulumn Age has 2 null values
Coulumn Annual Income has 0 null values
Coulumn Spending Score (1-100) has 1 null values


### What data can be removed?

In [11]:
# According to our count, there are only a few rows with null data so we will remove them
shopping_df = shopping_df.dropna()

In [12]:
# Check how many rows we have in the dataset after removing null values
shopping_df.shape

(200, 5)

In [13]:
# Check for Null values again
for column in shopping_df.columns:
    print(f'Coulumn {column} has {shopping_df[column].isnull().sum()} null values')

Coulumn CustomerID has 0 null values
Coulumn Card Member has 0 null values
Coulumn Age has 0 null values
Coulumn Annual Income has 0 null values
Coulumn Spending Score (1-100) has 0 null values


In [14]:
# Now, lets check for duplicate entries
print(f'Duplicate entries: {shopping_df.duplicated().sum()}')

Duplicate entries: 0


In [15]:
# The CustomerID does not offer any insight into customer shopping habits, lets remove it
shopping_df.drop(columns=['CustomerID'], inplace=True)
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


![image.png](attachment:image.png)

##### Let's return again to our list of questions.
### Is the data in a format that can be passed into an unsupervised learning model?

In [16]:
# Lets convert the Card Member column to a numeric value
def change_string(member):
    if member == "Yes":
        return 1
    else:
        return 0
    
shopping_df["Card Member"] = shopping_df["Card Member"].apply(change_string)
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [17]:
# Scale the annual income as it is much larger than all other values in the dataset
# we'll dive them by 1,000

shopping_df["Annual Income"] = shopping_df["Annual Income"]/1000
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [18]:
# Lets rename those columns with space and/or numbers
shopping_df = shopping_df.rename(columns={'Annual Income': 'AnnualIncome', 'Spending Score (1-100)': 'SpendingScore'})
shopping_df.head()

Unnamed: 0,Card Member,Age,AnnualIncome,SpendingScore
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


##### Let's return once more to our list of questions.
### Can I quickly hand off this data for others to use?

In [19]:
# Now lets saved the trasformed/cleaned data into a new CSV file for future use
output_file_path = 'data/shopping_data_cleaned.csv'
shopping_df.to_csv(output_file_path, index=False)