In [1]:
import pandas as pd
import os

In [2]:
# load in the csv
file_path = os.path.join("resources", "iris.csv")
iris_df = pd.read_csv(file_path)
display(iris_df.head(1))

# drop class because it is not numeric data
iris_df = iris_df.drop(["class"], axis=1)
display(iris_df.head(1))

# reroder the columns
new_order = [
    "sepal_length",
    "petal_length",
    "sepal_width",
    "petal_width"
]
iris_df = iris_df[new_order]
display(iris_df.head(1))

# export the modified csv
output_file_path = os.path.join("resources", "iris_cleaned.csv")
iris_df.to_csv(output_file_path, index=False)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2


Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2


In [3]:
# load in the csv
file_path = os.path.join("resources", "shopping_data.csv")
shopping_df = pd.read_csv(file_path)
display(shopping_df.head(1))

# print columns
print(shopping_df.columns)

# print data types
print(shopping_df.dtypes)

# find the number of nulls in each column
for column in shopping_df.columns:
    print(f"Column {column} has {shopping_df[column].isnull().sum()} null values")
    
# very few null values so we are just going to drop those and move on
print("Number of Rows Before Dropna: ",len(shopping_df))
shopping_df = shopping_df.dropna()
print("Number of Rows After Dropna: ",len(shopping_df))

# check for duplicate rows
print(f"Duplicate entries: {shopping_df.duplicated().sum()}")

# now remove noisy columns, customer id has no influence on customer behavior so drop it
shopping_df.drop(columns=["CustomerID"], inplace=True)
display(shopping_df.head(1))

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0


Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')
CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object
Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values
Number of Rows Before Dropna:  203
Number of Rows After Dropna:  200
Duplicate entries: 0


Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0


In [4]:
# all value need to be numeric so we need to encode yes and no to 1 and 0
def change_string(member):
    if member == "yes":
        return 1
    else:
        return 0
    
shopping_df["Card Member"] = shopping_df["Card Member"].apply(change_string)
display(shopping_df.head(1))

# to avoid shewing we need to rescale "Annual Income" because it is three orders of magnitude greater than the othe values
shopping_df["Annual Income"] = shopping_df["Annual Income"]/1000
display(shopping_df.head(1))

# rename the columns to remove spaces and numbers
column_names = {
    "Card Member": "card_member",
    "Age": "age",
    "Annual Income": "annual_income",
    "Spending Score (1-100)": "spending_score"
}

shopping_df = shopping_df.rename(columns=column_names)
display(shopping_df.head(1))

# Saving cleaned data
# as a csv
file_path = os.path.join("resources", "shopping_data_cleaned.csv")
shopping_df.to_csv(file_path, index=False)

# as an excel
file_path = os.path.join("resources", "shopping_data_cleaned.xlsx")
shopping_df.to_excel(file_path, index=False)

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,0,19.0,15000,39.0


Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,0,19.0,15.0,39.0


Unnamed: 0,card_member,age,annual_income,spending_score
0,0,19.0,15.0,39.0


In [5]:
def mean(array):
    
    average = sum(array)/len(array)
    
    return average

x = [8,5,7,4]
y = [1,9,3,7]

print(f"({mean(x)}, {mean(y)})")

(6.0, 5.0)
