In [1]:
#Import Dependencies
import pandas as pd

In [2]:
#Data loading
file_path="shopping_data.csv"
df_shopping=pd.read_csv(file_path, encoding="ISO-8859-1")
df_shopping.head(5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [3]:
#Show columns
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [4]:
#review datatypes...all columns must contain numerical values
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [7]:
#Review if any data is missing, as not to get 
# ValueError: Input contains NaN, infinity or a value tool arge for dtype('float64')

#Find all null values
for column in df_shopping.columns:
    print(f"Column {column} has {df_shopping[column].isnull().sum()}null values")

Column CustomerID has 0null values
Column Card Member has 2null values
Column Age has 2null values
Column Annual Income has 0null values
Column Spending Score (1-100) has 1null values


In [8]:
# Remove rows of data with null values with the dropna() method

#Drop null rows
df_shopping=df_shopping.dropna()

In [11]:
# Check for duplicates
print(f"Duplicate entries:{df_shopping.duplicated().sum()}")

Duplicate entries:0


In [12]:
#Remove the customerID column
df_shopping.drop(columns=["CustomerID"], inplace=True)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [14]:
#Transform card member string to value using .apply

#Transform string column
def change_string(member):
    if member=="Yes":
        return 1
    else:
        return 0
    
df_shopping["Card Member"]=df_shopping["Card Member"].apply(change_string)
df_shopping.head()


Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,0,19.0,15000,39.0
1,0,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [15]:
# Adjust scale to annual income
#Transform annual income
df_shopping["Annual Income"]=df_shopping["Annual Income"]/1000
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,0,19.0,15.0,39.0
1,0,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [16]:
df_shopping.columns

Index(['Card Member', 'Age', 'Annual Income', 'Spending Score (1-100)'], dtype='object')

In [21]:
#Skill Drill 18.2.5
#Reformat the names of the columns so they contain no spaces or numbers.


df_shopping.rename({'Card Member':'Card_Member', 'Age':'Age', 'Annual Income':'Annual_Income', 'Spending Score (1-100)':'Spending_Score'},axis='columns',inplace=True)
df_shopping

Unnamed: 0,Card_Member,Age,Annual_Income,Spending_Score)
0,0,19.0,15.0,39.0
1,0,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
...,...,...,...,...
198,0,35.0,120.0,79.0
199,0,45.0,126.0,28.0
200,0,32.0,126.0,74.0
201,0,32.0,137.0,18.0


In [25]:
# Saving cleaned data
file_path = "shopping_data_cleaned.csv"
df_shopping.to_csv(file_path, index=False)

In [27]:
# Saving cleaned data
file_path = "shopping_data_cleaned.xlsx"
df_shopping.to_excel(file_path, index=False)