In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
data=pd.read_csv('data.csv')
data.head(50)

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually
5,6,46,Male,Sneakers,Footwear,20,Wyoming,M,White,Summer,2.9,Yes,Standard,Yes,Yes,14,Venmo,Weekly
6,7,63,Male,Shirt,Clothing,85,Montana,M,Gray,Fall,3.2,Yes,Free Shipping,Yes,Yes,49,Cash,Quarterly
7,8,27,Male,Shorts,Clothing,34,Louisiana,L,Charcoal,Winter,3.2,Yes,Free Shipping,Yes,Yes,19,Credit Card,Weekly
8,9,26,Male,Coat,Outerwear,97,West Virginia,L,Silver,Summer,2.6,Yes,Express,Yes,Yes,8,Venmo,Annually
9,10,57,Male,Handbag,Accessories,31,Missouri,M,Pink,Spring,4.8,Yes,2-Day Shipping,Yes,Yes,4,Cash,Quarterly


In [5]:
data.dtypes

Customer ID                 int64
Age                         int64
Gender                     object
Item Purchased             object
Category                   object
Purchase Amount (USD)       int64
Location                   object
Size                       object
Color                      object
Season                     object
Review Rating             float64
Subscription Status        object
Shipping Type              object
Discount Applied           object
Promo Code Used            object
Previous Purchases          int64
Payment Method             object
Frequency of Purchases     object
dtype: object

In [6]:
#See the row with the missing value:
rows_with_missing_values = data[data.isnull().any(axis=1)]
rows_with_missing_values 

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases


In [7]:
# Drop the  rows with missing values
data.dropna(inplace=True)
data.head(1)

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly


In [8]:
data['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [9]:
#Mapping gender column:
gender_mapping={
    'Male': 1,
    'Female' : 0
}
data['Gender']=data['Gender'].map(gender_mapping)

In [10]:
pd.options.display.max_columns=None
pd.options.display.max_rows=None

In [11]:
# Shuffle the dataset
data= data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head(5)


Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,840,48,1,Shirt,Clothing,31,Illinois,L,Teal,Spring,2.6,Yes,Free Shipping,Yes,Yes,10,Debit Card,Every 3 Months
1,1718,29,1,Hoodie,Clothing,50,Hawaii,L,Orange,Summer,2.9,No,Express,No,No,16,Debit Card,Bi-Weekly
2,322,41,1,Sneakers,Footwear,36,Vermont,L,Orange,Summer,4.7,Yes,Free Shipping,Yes,Yes,48,Debit Card,Quarterly
3,3188,67,0,Blouse,Clothing,72,Texas,L,Indigo,Winter,3.2,No,Free Shipping,No,No,28,Debit Card,Weekly
4,2270,49,1,Skirt,Clothing,38,Kentucky,M,Yellow,Fall,4.4,No,Store Pickup,No,No,27,Credit Card,Annually


In [12]:
data['Item Purchased'].unique()

array(['Shirt', 'Hoodie', 'Sneakers', 'Blouse', 'Skirt', 'Socks', 'Pants',
       'Jeans', 'Coat', 'Belt', 'Gloves', 'Handbag', 'Sunglasses',
       'Sweater', 'Boots', 'Dress', 'Backpack', 'Sandals', 'Hat',
       'Jacket', 'Shorts', 'Shoes', 'Scarf', 'Jewelry', 'T-shirt'],
      dtype=object)

In [13]:
data['Category'].unique()

array(['Clothing', 'Footwear', 'Outerwear', 'Accessories'], dtype=object)

In [14]:
item_mapping = {
    'Shirt': 1,
    'Hoodie': 2,
    'Sneakers': 3,
    'Blouse': 4,
    'Skirt': 5,
    'Socks': 6,
    'Pants': 7,
    'Jeans': 8,
    'Coat': 9,
    'Belt': 10,
    'Gloves': 11,
    'Handbag': 12,
    'Sunglasses': 13,
    'Sweater': 14,
    'Boots': 15,
    'Dress': 16,
    'Backpack': 17,
    'Sandals': 18,
    'Hat': 19,
    'Jacket': 20,
    'Shorts': 21,
    'Shoes': 22,
    'Scarf': 23,
    'Jewelry': 24,
    'T-shirt': 25
}

category_mapping = {
    'Accessories': 1,
    'Clothing': 2,
    'Footwear': 3,
    'Outerwear': 4
}

# Map Item Purchased to numerical values
data['Item Purchased'] = data['Item Purchased'].map(item_mapping)
data['Category']=data['Category'].map(category_mapping)

In [15]:
data.head(10)

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,840,48,1,1,2,31,Illinois,L,Teal,Spring,2.6,Yes,Free Shipping,Yes,Yes,10,Debit Card,Every 3 Months
1,1718,29,1,2,2,50,Hawaii,L,Orange,Summer,2.9,No,Express,No,No,16,Debit Card,Bi-Weekly
2,322,41,1,3,3,36,Vermont,L,Orange,Summer,4.7,Yes,Free Shipping,Yes,Yes,48,Debit Card,Quarterly
3,3188,67,0,4,2,72,Texas,L,Indigo,Winter,3.2,No,Free Shipping,No,No,28,Debit Card,Weekly
4,2270,49,1,5,2,38,Kentucky,M,Yellow,Fall,4.4,No,Store Pickup,No,No,27,Credit Card,Annually
5,367,64,1,6,2,36,Nebraska,M,Turquoise,Spring,3.5,Yes,Store Pickup,Yes,Yes,37,Venmo,Fortnightly
6,2645,25,1,7,2,83,New Mexico,M,Turquoise,Spring,3.3,No,Standard,No,No,4,Bank Transfer,Every 3 Months
7,1750,43,1,2,2,70,Wyoming,L,Pink,Fall,3.2,No,2-Day Shipping,No,No,36,Debit Card,Every 3 Months
8,3606,68,0,8,2,41,Alabama,M,Gold,Fall,4.3,No,Store Pickup,No,No,36,Bank Transfer,Every 3 Months
9,1097,57,1,9,4,46,New Hampshire,S,Beige,Fall,2.6,No,Standard,Yes,Yes,48,Credit Card,Annually


In [16]:
data.rename(columns={'Item Purchased': 'Item(code) Purchased','Category':'Item Category'}, inplace=True)
data.head(3)

Unnamed: 0,Customer ID,Age,Gender,Item(code) Purchased,Item Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,840,48,1,1,2,31,Illinois,L,Teal,Spring,2.6,Yes,Free Shipping,Yes,Yes,10,Debit Card,Every 3 Months
1,1718,29,1,2,2,50,Hawaii,L,Orange,Summer,2.9,No,Express,No,No,16,Debit Card,Bi-Weekly
2,322,41,1,3,3,36,Vermont,L,Orange,Summer,4.7,Yes,Free Shipping,Yes,Yes,48,Debit Card,Quarterly


In [17]:
data['Location'].unique()

array(['Illinois', 'Hawaii', 'Vermont', 'Texas', 'Kentucky', 'Nebraska',
       'New Mexico', 'Wyoming', 'Alabama', 'New Hampshire', 'New York',
       'Utah', 'Kansas', 'Colorado', 'Michigan', 'Montana', 'Indiana',
       'Oklahoma', 'Tennessee', 'Nevada', 'California', 'Maryland',
       'Alaska', 'Wisconsin', 'North Dakota', 'Missouri', 'Delaware',
       'Arkansas', 'Minnesota', 'Louisiana', 'Mississippi', 'Connecticut',
       'Idaho', 'Ohio', 'South Carolina', 'Iowa', 'Georgia', 'Washington',
       'Florida', 'South Dakota', 'Arizona', 'West Virginia', 'Maine',
       'New Jersey', 'Pennsylvania', 'Virginia', 'Rhode Island',
       'Massachusetts', 'North Carolina', 'Oregon'], dtype=object)

In [18]:
# Create a mapping for locations to integers based on the starting letter
location_mapping = {
    'Alabama': 1,
    'Alaska': 2,
    'Arizona': 3,
    'Arkansas': 4,
    'California': 5,
    'Colorado': 6,
    'Connecticut': 7,
    'Delaware': 8,
    'Florida': 9,
    'Georgia': 10,
    'Hawaii': 11,
    'Idaho': 12,
    'Illinois': 13,
    'Indiana': 14,
    'Iowa': 15,
    'Kansas': 16,
    'Kentucky': 17,
    'Louisiana': 18,
    'Maine': 19,
    'Maryland': 20,
    'Massachusetts': 21,
    'Michigan': 22,
    'Minnesota': 23,
    'Mississippi': 24,
    'Missouri': 25,
    'Montana': 26,
    'Nebraska': 27,
    'Nevada': 28,
    'New Hampshire': 29,
    'New Jersey': 30,
    'New Mexico': 31,
    'New York': 32,
    'North Carolina': 33,
    'North Dakota': 34,
    'Ohio': 35,
    'Oklahoma': 36,
    'Oregon': 37,
    'Pennsylvania': 38,
    'Rhode Island': 39,
    'South Carolina': 40,
    'South Dakota': 41,
    'Tennessee': 42,
    'Texas': 43,
    'Utah': 44,
    'Vermont': 45,
    'Virginia': 46,
    'Washington': 47,
    'West Virginia': 48,
    'Wisconsin': 49,
    'Wyoming': 50
}

# Map Location to numerical values
data['Location'] = data['Location'].map(location_mapping)

data.head(50)

Unnamed: 0,Customer ID,Age,Gender,Item(code) Purchased,Item Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,840,48,1,1,2,31,13,L,Teal,Spring,2.6,Yes,Free Shipping,Yes,Yes,10,Debit Card,Every 3 Months
1,1718,29,1,2,2,50,11,L,Orange,Summer,2.9,No,Express,No,No,16,Debit Card,Bi-Weekly
2,322,41,1,3,3,36,45,L,Orange,Summer,4.7,Yes,Free Shipping,Yes,Yes,48,Debit Card,Quarterly
3,3188,67,0,4,2,72,43,L,Indigo,Winter,3.2,No,Free Shipping,No,No,28,Debit Card,Weekly
4,2270,49,1,5,2,38,17,M,Yellow,Fall,4.4,No,Store Pickup,No,No,27,Credit Card,Annually
5,367,64,1,6,2,36,27,M,Turquoise,Spring,3.5,Yes,Store Pickup,Yes,Yes,37,Venmo,Fortnightly
6,2645,25,1,7,2,83,31,M,Turquoise,Spring,3.3,No,Standard,No,No,4,Bank Transfer,Every 3 Months
7,1750,43,1,2,2,70,50,L,Pink,Fall,3.2,No,2-Day Shipping,No,No,36,Debit Card,Every 3 Months
8,3606,68,0,8,2,41,1,M,Gold,Fall,4.3,No,Store Pickup,No,No,36,Bank Transfer,Every 3 Months
9,1097,57,1,9,4,46,29,S,Beige,Fall,2.6,No,Standard,Yes,Yes,48,Credit Card,Annually


In [19]:
data.head(3)

Unnamed: 0,Customer ID,Age,Gender,Item(code) Purchased,Item Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,840,48,1,1,2,31,13,L,Teal,Spring,2.6,Yes,Free Shipping,Yes,Yes,10,Debit Card,Every 3 Months
1,1718,29,1,2,2,50,11,L,Orange,Summer,2.9,No,Express,No,No,16,Debit Card,Bi-Weekly
2,322,41,1,3,3,36,45,L,Orange,Summer,4.7,Yes,Free Shipping,Yes,Yes,48,Debit Card,Quarterly


In [20]:
data['Size'].unique()

array(['L', 'M', 'S', 'XL'], dtype=object)

In [21]:
# Mapping for Size to numerical values
size_mapping = {
    'S': 1,
    'M': 2,
    'L': 3,
    'XL': 4
}

# Map Size to numerical values
data['Size'] = data['Size'].map(size_mapping)
data.head(3)


Unnamed: 0,Customer ID,Age,Gender,Item(code) Purchased,Item Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,840,48,1,1,2,31,13,3,Teal,Spring,2.6,Yes,Free Shipping,Yes,Yes,10,Debit Card,Every 3 Months
1,1718,29,1,2,2,50,11,3,Orange,Summer,2.9,No,Express,No,No,16,Debit Card,Bi-Weekly
2,322,41,1,3,3,36,45,3,Orange,Summer,4.7,Yes,Free Shipping,Yes,Yes,48,Debit Card,Quarterly


In [22]:
data['Color'].unique()

array(['Teal', 'Orange', 'Indigo', 'Yellow', 'Turquoise', 'Pink', 'Gold',
       'Beige', 'Gray', 'Black', 'Charcoal', 'Red', 'Magenta', 'Olive',
       'Cyan', 'Silver', 'Lavender', 'Purple', 'Blue', 'White', 'Maroon',
       'Violet', 'Green', 'Brown', 'Peach'], dtype=object)

In [23]:
# Mapping for Color to numerical values
color_mapping = {
    'Teal': 1,
    'Orange': 2,
    'Indigo': 3,
    'Yellow': 4,
    'Turquoise': 5,
    'Pink': 6,
    'Gold': 7,
    'Beige': 8,
    'Gray': 9,
    'Black': 10,
    'Charcoal': 11,
    'Red': 12,
    'Magenta': 13,
    'Olive': 14,
    'Cyan': 15,
    'Silver': 16,
    'Lavender': 17,
    'Purple': 18,
    'Blue': 19,
    'White': 20,
    'Maroon': 21,
    'Violet': 22,
    'Green': 23,
    'Brown': 24,
    'Peach': 25
}

# Map Color to numerical values
data['Color'] = data['Color'].map(color_mapping)
data.head(10)


Unnamed: 0,Customer ID,Age,Gender,Item(code) Purchased,Item Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,840,48,1,1,2,31,13,3,1,Spring,2.6,Yes,Free Shipping,Yes,Yes,10,Debit Card,Every 3 Months
1,1718,29,1,2,2,50,11,3,2,Summer,2.9,No,Express,No,No,16,Debit Card,Bi-Weekly
2,322,41,1,3,3,36,45,3,2,Summer,4.7,Yes,Free Shipping,Yes,Yes,48,Debit Card,Quarterly
3,3188,67,0,4,2,72,43,3,3,Winter,3.2,No,Free Shipping,No,No,28,Debit Card,Weekly
4,2270,49,1,5,2,38,17,2,4,Fall,4.4,No,Store Pickup,No,No,27,Credit Card,Annually
5,367,64,1,6,2,36,27,2,5,Spring,3.5,Yes,Store Pickup,Yes,Yes,37,Venmo,Fortnightly
6,2645,25,1,7,2,83,31,2,5,Spring,3.3,No,Standard,No,No,4,Bank Transfer,Every 3 Months
7,1750,43,1,2,2,70,50,3,6,Fall,3.2,No,2-Day Shipping,No,No,36,Debit Card,Every 3 Months
8,3606,68,0,8,2,41,1,2,7,Fall,4.3,No,Store Pickup,No,No,36,Bank Transfer,Every 3 Months
9,1097,57,1,9,4,46,29,1,8,Fall,2.6,No,Standard,Yes,Yes,48,Credit Card,Annually


In [24]:
data['Season'].unique()

array(['Spring', 'Summer', 'Winter', 'Fall'], dtype=object)

In [25]:
data['Subscription Status'].unique()

array(['Yes', 'No'], dtype=object)

In [26]:
# Mapping for Season to numerical values
season_mapping = {
    'Spring': 1,
    'Summer': 2,
    'Fall': 3,
    'Winter': 4
}

# Map Season to numerical values
data['Season'] = data['Season'].map(season_mapping)


In [27]:
# Mapping for Subscription Status to numerical values
subscription_status_mapping = {
    'Yes': 1,
    'No': 0
}

# Map Subscription Status to numerical values
data['Subscription Status'] = data['Subscription Status'].map(subscription_status_mapping)
data.head(5)

Unnamed: 0,Customer ID,Age,Gender,Item(code) Purchased,Item Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,840,48,1,1,2,31,13,3,1,1,2.6,1,Free Shipping,Yes,Yes,10,Debit Card,Every 3 Months
1,1718,29,1,2,2,50,11,3,2,2,2.9,0,Express,No,No,16,Debit Card,Bi-Weekly
2,322,41,1,3,3,36,45,3,2,2,4.7,1,Free Shipping,Yes,Yes,48,Debit Card,Quarterly
3,3188,67,0,4,2,72,43,3,3,4,3.2,0,Free Shipping,No,No,28,Debit Card,Weekly
4,2270,49,1,5,2,38,17,2,4,3,4.4,0,Store Pickup,No,No,27,Credit Card,Annually


In [28]:
data['Shipping Type'].unique()

array(['Free Shipping', 'Express', 'Store Pickup', 'Standard',
       '2-Day Shipping', 'Next Day Air'], dtype=object)

In [29]:
data['Discount Applied'].unique()

array(['Yes', 'No'], dtype=object)

In [30]:
data['Promo Code Used'].unique()

array(['Yes', 'No'], dtype=object)

In [31]:
# Mapping for Shipping Type to numerical values based on cost
shipping_type_mapping = {
    'Free Shipping': 0,
    'Standard': 1,
    '2-Day Shipping': 2,
    'Express': 3,
    'Next Day Air': 4,
    'Store Pickup': 5
}

# Map Shipping Type to numerical values
data['Shipping Type'] = data['Shipping Type'].map(shipping_type_mapping)
                                                 


In [32]:
# Mapping for Discount Applied to numerical values
discount_mapping = {
    'Yes': 1,
    'No': 0
}

# Map Discount Applied to numerical values
data['Discount Applied'] = data['Discount Applied'].map(discount_mapping)


In [33]:
# Mapping for Promo Code Used to numerical values
promo_code_mapping = {
    'Yes': 1,
    'No': 0
}

# Map Promo Code Used to numerical values
data['Promo Code Used'] = data['Promo Code Used'].map(promo_code_mapping)
data.head(10)


Unnamed: 0,Customer ID,Age,Gender,Item(code) Purchased,Item Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,840,48,1,1,2,31,13,3,1,1,2.6,1,0,1,1,10,Debit Card,Every 3 Months
1,1718,29,1,2,2,50,11,3,2,2,2.9,0,3,0,0,16,Debit Card,Bi-Weekly
2,322,41,1,3,3,36,45,3,2,2,4.7,1,0,1,1,48,Debit Card,Quarterly
3,3188,67,0,4,2,72,43,3,3,4,3.2,0,0,0,0,28,Debit Card,Weekly
4,2270,49,1,5,2,38,17,2,4,3,4.4,0,5,0,0,27,Credit Card,Annually
5,367,64,1,6,2,36,27,2,5,1,3.5,1,5,1,1,37,Venmo,Fortnightly
6,2645,25,1,7,2,83,31,2,5,1,3.3,0,1,0,0,4,Bank Transfer,Every 3 Months
7,1750,43,1,2,2,70,50,3,6,3,3.2,0,2,0,0,36,Debit Card,Every 3 Months
8,3606,68,0,8,2,41,1,2,7,3,4.3,0,5,0,0,36,Bank Transfer,Every 3 Months
9,1097,57,1,9,4,46,29,1,8,3,2.6,0,1,1,1,48,Credit Card,Annually


In [34]:
data['Payment Method'].unique()

array(['Debit Card', 'Credit Card', 'Venmo', 'Bank Transfer', 'Cash',
       'PayPal'], dtype=object)

In [35]:
data['Frequency of Purchases'].unique()

array(['Every 3 Months', 'Bi-Weekly', 'Quarterly', 'Weekly', 'Annually',
       'Fortnightly', 'Monthly'], dtype=object)

In [36]:
# Mapping for Payment Method to numerical values based on benefits
payment_method_mapping = {
    'Credit Card': 5,      # High benefit (most common and reliable)
    'Debit Card': 4,       # Good benefit (also common)
    'PayPal': 3,           # Moderate benefit (popular for online transactions)
    'Venmo': 2,            # Less common, but gaining popularity
    'Bank Transfer': 1,    # Less convenient for quick purchases
    'Cash': 0              # Least beneficial for online business
}

# Map Payment Method to numerical values
data['Payment Method'] = data['Payment Method'].map(payment_method_mapping)


In [37]:
# Mapping for Frequency of Purchases to numerical values based on benefits
frequency_of_purchases_mapping = {
    'Weekly': 6,           # High benefit (most frequent)
    'Bi-Weekly': 5,        # High benefit (frequent, once every 2 weeks)
    'Fortnightly': 5,      # Same as Bi-Weekly
    'Monthly': 4,          # Good benefit
    'Every 3 Months': 3,   # Moderate benefit
    'Quarterly': 2,        # Less frequent
    'Annually': 1          # Least beneficial for revenue
}

# Map Frequency of Purchases to numerical values
data['Frequency of Purchases'] = data['Frequency of Purchases'].map(frequency_of_purchases_mapping)
data.head(10)


Unnamed: 0,Customer ID,Age,Gender,Item(code) Purchased,Item Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,840,48,1,1,2,31,13,3,1,1,2.6,1,0,1,1,10,4,3
1,1718,29,1,2,2,50,11,3,2,2,2.9,0,3,0,0,16,4,5
2,322,41,1,3,3,36,45,3,2,2,4.7,1,0,1,1,48,4,2
3,3188,67,0,4,2,72,43,3,3,4,3.2,0,0,0,0,28,4,6
4,2270,49,1,5,2,38,17,2,4,3,4.4,0,5,0,0,27,5,1
5,367,64,1,6,2,36,27,2,5,1,3.5,1,5,1,1,37,2,5
6,2645,25,1,7,2,83,31,2,5,1,3.3,0,1,0,0,4,1,3
7,1750,43,1,2,2,70,50,3,6,3,3.2,0,2,0,0,36,4,3
8,3606,68,0,8,2,41,1,2,7,3,4.3,0,5,0,0,36,1,3
9,1097,57,1,9,4,46,29,1,8,3,2.6,0,1,1,1,48,5,1


In [38]:
import numpy as np

# Assuming Previous Purchases indicates the count of purchases before the current transaction.

# Feature: Total Spend (same as before, as it's the total of Purchase Amount)
data['Total Spend'] = data['Purchase Amount (USD)']  # Total spend is just the purchase amount for that transaction

# Feature: Average Spend (this would remain the same, but given the unique rows per customer, it's the same as total)
data['Average Spend'] = data['Total Spend']  # Since one row per customer, avg spend is the same as total

# Feature: Purchase Frequency (number of previous purchases + current one)
data['Purchase Frequency'] = data['Previous Purchases'] + 1  # Add 1 for the current purchase

# Feature: Customer Lifetime Value (CLV) - based only on the current purchase
data['Customer Lifetime Value'] = data['Average Spend'] * data['Purchase Frequency']

# Feature: Loyalty Score (based on frequency of purchases and subscription status)
data['Loyalty Score'] = np.where(data['Subscription Status'] == 1, data['Purchase Frequency'] * 2, data['Purchase Frequency'])
data.head(10)



Unnamed: 0,Customer ID,Age,Gender,Item(code) Purchased,Item Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Total Spend,Average Spend,Purchase Frequency,Customer Lifetime Value,Loyalty Score
0,840,48,1,1,2,31,13,3,1,1,2.6,1,0,1,1,10,4,3,31,31,11,341,22
1,1718,29,1,2,2,50,11,3,2,2,2.9,0,3,0,0,16,4,5,50,50,17,850,17
2,322,41,1,3,3,36,45,3,2,2,4.7,1,0,1,1,48,4,2,36,36,49,1764,98
3,3188,67,0,4,2,72,43,3,3,4,3.2,0,0,0,0,28,4,6,72,72,29,2088,29
4,2270,49,1,5,2,38,17,2,4,3,4.4,0,5,0,0,27,5,1,38,38,28,1064,28
5,367,64,1,6,2,36,27,2,5,1,3.5,1,5,1,1,37,2,5,36,36,38,1368,76
6,2645,25,1,7,2,83,31,2,5,1,3.3,0,1,0,0,4,1,3,83,83,5,415,5
7,1750,43,1,2,2,70,50,3,6,3,3.2,0,2,0,0,36,4,3,70,70,37,2590,37
8,3606,68,0,8,2,41,1,2,7,3,4.3,0,5,0,0,36,1,3,41,41,37,1517,37
9,1097,57,1,9,4,46,29,1,8,3,2.6,0,1,1,1,48,5,1,46,46,49,2254,49


In [39]:
data[data.isnull().any(axis=1)]


Unnamed: 0,Customer ID,Age,Gender,Item(code) Purchased,Item Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Total Spend,Average Spend,Purchase Frequency,Customer Lifetime Value,Loyalty Score


In [40]:
data.to_csv('step_1_Preprocessing.csv', index=False)