In [1]:
# import dependencies
import pandas as pd

In [7]:
# Load the raw shopping data from CSV into a DataFrame and preview the first rows.
file_path = "../cryptocurrencies/shopping_data.csv"

# The file is decoded explicitly as ISO-8859-1 (Latin-1).
shopping_df = pd.read_csv(filepath_or_buffer=file_path, encoding="ISO-8859-1")
shopping_df.head()


Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [11]:
# Report how many missing (null) values each column contains.
null_counts = shopping_df.isnull().sum()
for column, null_total in null_counts.items():
    print(f"Column {column} has {null_total} null values")

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


In [12]:
# Discard every row that has at least one missing value
# (axis/how are spelled out here; they match the dropna defaults).
shopping_df = shopping_df.dropna(axis="index", how="any")

In [13]:
# Count rows that are exact duplicates of an earlier row.
duplicate_total = shopping_df.duplicated().sum()
print(f"Duplicate entries: {duplicate_total}")

Duplicate entries: 0


In [14]:
# Drop columns that do not hold value in the learning model
# (CustomerID is presumably just a per-customer identifier).
# Reassign instead of using inplace=True, and pass errors="ignore", so that
# re-running this cell after the column is already gone does not raise KeyError.
shopping_df = shopping_df.drop(columns=["CustomerID"], errors="ignore")
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [16]:
# Encode card membership as a number: "Yes" becomes 1, anything else becomes 0.
def change_string(member):
    """Return 1 when ``member`` is exactly the string "Yes", otherwise 0."""
    return 1 if member == "Yes" else 0
# Apply the Yes/No -> 1/0 encoding to the "Card Member" column.
# Guard on dtype: once the column is numeric it has already been encoded, and
# encoding it again would turn every value into 0 (because 1 != "Yes"), so a
# re-run of this cell must leave the column untouched.
if not pd.api.types.is_numeric_dtype(shopping_df["Card Member"]):
    shopping_df["Card Member"] = shopping_df["Card Member"].apply(change_string)
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [17]:
# Rescale annual income by a factor of 1/1000 so its magnitude is in line
# with the other numeric columns.
# NOTE: the column is overwritten with the scaled values, so running this
# cell a second time divides by 1000 again.
income_scaled = shopping_df["Annual Income"].div(1000)
shopping_df["Annual Income"] = income_scaled
shopping_df.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [18]:
# Rename columns so they contain no spaces or numbers.
column_renames = {
    "Card Member": "Card_Member",
    "Annual Income": "Annual_Income",
    "Spending Score (1-100)": "Spending_Score",
}
shopping_df = shopping_df.rename(columns=column_renames)
shopping_df.head()

Unnamed: 0,Card_Member,Age,Annual_Income,Spending_Score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [19]:
# Write the cleaned DataFrame to CSV without the index column.
file_path2 = "../cryptocurrencies/shopping_data_cleaned.csv"

shopping_df.to_csv(path_or_buf=file_path2, index=False)