In [1]:
import pandas as pd
from datetime import datetime

# --- 1. Load Both Datasets ---
# Read the customer table and the sales table from the working directory.
# Only the two reads sit inside the ``try`` so that nothing else can be
# mistaken for a missing-file problem.
try:
    customers_df = pd.read_csv('AWCustomers.csv')
    sales_df = pd.read_csv('AWSales.csv')
except FileNotFoundError as e:
    print(f"❌ Error: Could not find a file. Make sure both CSVs are in the directory. Details: {e}")
    # Abort with a non-zero status. The bare ``exit()`` helper is injected by
    # the ``site`` module for interactive sessions, is not intended for use in
    # programs, and would report success (status 0) for this failure.
    raise SystemExit(1) from e
else:
    print("✅ Successfully loaded AWCustomers.csv and AWSales.csv")

# --- 2. Create the 'BikeBuyer' column ---
# Get a unique list of customer IDs from the sales data
customers_who_bought = sales_df['CustomerID'].unique()

# Create the 'BikeBuyer' column in the customers dataframe:
# 1 if the customer's ID appears among the buyers, otherwise 0.
# ``isin`` performs the membership test for the whole column at once,
# instead of scanning the buyers array once per customer row, and the
# explicit int64 cast keeps the 0/1 integer encoding.
customers_df['BikeBuyer'] = (
    customers_df['CustomerID'].isin(customers_who_bought).astype('int64')
)
print("✅ 'BikeBuyer' column created successfully.")

# --- 3. Create the 'Age' column from 'BirthDate' ---
# Parse the raw 'BirthDate' values into pandas datetimes so that the
# year/month/day fields can be read from each entry.
customers_df['BirthDate'] = pd.to_datetime(customers_df['BirthDate'])

# Reference moment for the age calculation: the time this script runs.
today = datetime.now()


def _completed_years(born):
    """Return the number of whole years between *born* and ``today``."""
    # True when this year's birthday has not been reached yet; the boolean
    # counts as 1 in the subtraction below.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_pending


customers_df['Age'] = customers_df['BirthDate'].apply(_completed_years)
print("✅ 'Age' column calculated successfully.")


# --- 4. Select the Final, Correct Features ---
# Columns carried into the final frame, in output order. Only columns that
# exist in the data are listed; 'Region' and 'CommuteDistance' were removed.
final_selected_features = [
    'MaritalStatus',
    'Gender',
    'YearlyIncome',
    'TotalChildren',
    'NumberChildrenAtHome',
    'Education',
    'Occupation',
    'HomeOwnerFlag',
    'NumberCarsOwned',
    'Age',
    'BikeBuyer',  # the newly created target variable
]

# Keep just those columns as the final, clean DataFrame.
df = customers_df[final_selected_features]

# --- 5. Display and Save the Clean Data ---
# Show the first rows so the result can be checked by eye.
print("\n--- Success! Final Clean Data Frame is Ready ---")
_preview = df.head()
print(_preview)

# Summary of the frame; DataFrame.info() prints its report itself.
print("\nInformation about the final DataFrame:")
df.info()

# Write the cleaned frame to a new CSV for the next steps; index=False
# leaves the row index out of the file.
df.to_csv(path_or_buf='cleaned_customer_data.csv', index=False)
print("\n✅ Final cleaned data has been saved to 'cleaned_customer_data.csv'")

  

✅ Successfully loaded AWCustomers.csv and AWSales.csv
✅ 'BikeBuyer' column created successfully.
✅ 'Age' column calculated successfully.

--- Success! Final Clean Data Frame is Ready ---
  MaritalStatus Gender  YearlyIncome  TotalChildren  NumberChildrenAtHome  \
0             M      M         81916              1                     0   
1             M      M         81076              2                     1   
2             S      F         86387              0                     0   
3             M      M         61481              2                     1   
4             S      M         51804              0                     0   

         Education      Occupation  HomeOwnerFlag  NumberCarsOwned  Age  \
0        Bachelors        Clerical              1                3   37   
1  Partial College        Clerical              1                2   53   
2        Bachelors        Clerical              0                3   39   
3  Partial College  Skilled Manual              1 