In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

# Load dataset
df = pd.read_csv(r'C:\Users\acer\Learning\Python_Contents\DataSets\retail_data.csv')

In [104]:
print(df.iloc[0])


Transaction_ID                8691788.0
Customer_ID                     37249.0
Name                Michelle Harrington
Email                 Ebony39@gmail.com
Phone                      1414786801.0
Address               3959 Amanda Burgs
City                           Dortmund
State                            Berlin
Zipcode                         77985.0
Country                         Germany
Age                                21.0
Gender                             Male
Income                              Low
Customer_Segment                Regular
Date                          9/18/2023
Year                             2023.0
Month                         September
Time                           22:03:55
Total_Purchases                     3.0
Amount                       108.028757
Total_Amount                  324.08627
Product_Category               Clothing
Product_Brand                      Nike
Product_Type                     Shorts
Feedback                      Excellent


In [106]:
print(df.columns.tolist())


['Transaction_ID', 'Customer_ID', 'Name', 'Email', 'Phone', 'Address', 'City', 'State', 'Zipcode', 'Country', 'Age', 'Gender', 'Income', 'Customer_Segment', 'Date', 'Year', 'Month', 'Time', 'Total_Purchases', 'Amount', 'Total_Amount', 'Product_Category', 'Product_Brand', 'Product_Type', 'Feedback', 'Shipping_Method', 'Payment_Method', 'Order_Status', 'Ratings', 'products']


In [108]:
# Clean and parse dates
def clean_date(date_str):
    try:
        return pd.to_datetime(date_str, format='%m/%d/%Y')
    except:
        return pd.to_datetime(date_str, format='%d-%m-%y')

df['Date'] = df['Date'].apply(clean_date)

In [110]:
# Time features
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['DayOfMonth'] = df['Date'].dt.day
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)
df['Month'] = df['Date'].dt.month
df['Hour'] = pd.to_datetime(df['Time'], format='%H:%M:%S').dt.hour

In [112]:
# Handle categorical data
cat_cols = ['Gender', 'Customer_Segment', 'Product_Category', 'Payment_Method', 'Shipping_Method', 'Order_Status']
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# Convert Income to numeric
df['Income'] = pd.to_numeric(df['Income'], errors='coerce')
df = df.dropna(subset=['Income'])

In [114]:
# Customer-level features
customer_features = df.groupby('Customer_ID').agg({
    'Age': 'first',
    'Income': 'first',
    'Gender': 'first'
}).reset_index()

In [134]:
# Time-series features
ts_data = df.groupby(['Customer_ID', 'Date']).agg({
    'Total_Amount': 'sum',
    'DayOfWeek': 'first',
    'IsWeekend': 'first',
    'Month': 'first',
    'Hour': 'first',
    'Product_Category': lambda x: x.mode()[0],
    'Payment_Method': lambda x: x.mode()[0]
}).reset_index()

KeyError: "Column(s) ['DayOfWeek', 'Hour', 'IsWeekend'] do not exist"

In [130]:
ts_data

Unnamed: 0,Customer_ID,Date,Total_Amount,DayOfWeek,IsWeekend,Month,Hour,Product_Category,Payment_Method,Age_x,Income_x,Gender_x,Age_y,Income_y,Gender_y,Age,Income,Gender


In [124]:
print("🔍 Unique Customer_IDs in customer_features:", customer_features['Customer_ID'].nunique())
print("🔍 Unique Customer_IDs in ts_data:", ts_data['Customer_ID'].nunique())

common_ids = set(ts_data['Customer_ID']).intersection(set(customer_features['Customer_ID']))
print("✅ Number of matching Customer_IDs:", len(common_ids))


🔍 Unique Customer_IDs in customer_features: 0
🔍 Unique Customer_IDs in ts_data: 0
✅ Number of matching Customer_IDs: 0


In [128]:
import pandas as pd

file_path = r'C:\Users\acer\Learning\Python_Contents\DataSets\retail_data.csv'
df = pd.read_csv(file_path)

print("Rows:", len(df))
print(df.sample(3))  # if rows > 0


Rows: 302010
        Transaction_ID  Customer_ID               Name                 Email  \
48716        3909387.0      25917.0     Lauren Collins    Nathan98@gmail.com   
43997        9209374.0      31747.0  Victoria Williams      Gail93@gmail.com   
106270       8286848.0      37352.0        Jacob Gates  Samantha69@gmail.com   

               Phone                       Address       City         State  \
48716   9.902247e+09  5240 Danielle Ville Apt. 408  Cleveland  Pennsylvania   
43997   1.514641e+09             9866 Carlos Trace   Edmonton       Ontario   
106270  8.100041e+09     85364 Stephanie Crossroad   Winnipeg       Ontario   

        Zipcode Country  ...  Total_Amount Product_Category Product_Brand  \
48716   19544.0     USA  ...    269.733309          Grocery         Pepsi   
43997   23353.0  Canada  ...    863.749949            Books  Random House   
106270  72237.0  Canada  ...   1393.217741       Home Decor    Home Depot   

       Product_Type   Feedback  Shipping

In [122]:
# Clean ts_data before merge
ts_data = ts_data.drop(columns=['Age', 'Gender', 'Income'], errors='ignore')

# Ensure Customer_ID is float
ts_data['Customer_ID'] = ts_data['Customer_ID'].astype(float)
customer_features['Customer_ID'] = customer_features['Customer_ID'].astype(float)

# Drop rows with missing critical values
customer_features = customer_features.dropna(subset=['Age', 'Income', 'Gender'])

# Merge safely
ts_data = pd.merge(ts_data, customer_features, on='Customer_ID', how='inner')

# Debug print
print("✅ ts_data shape after merge:", ts_data.shape)

# Define all numeric features
numeric_cols = ['Total_Amount', 'DayOfWeek', 'IsWeekend', 'Month', 'Hour', 'Age', 'Income', 'Product_Category', 'Payment_Method', 'Gender']

# Normalize
scaler = MinMaxScaler()
ts_data[numeric_cols] = scaler.fit_transform(ts_data[numeric_cols])

✅ ts_data shape after merge: (0, 18)


ValueError: Found array with 0 sample(s) (shape=(0, 10)) while a minimum of 1 is required by MinMaxScaler.

In [78]:
# Create sequences
def create_sequences(data, customer_id, seq_length=10):
    customer_data = data[data['Customer_ID'] == customer_id].sort_values('Date')
    if len(customer_data) < seq_length:
        return None, None
    
    features = numeric_cols + ['Product_Category', 'Payment_Method', 'Gender', 'Income']
    X, y = [], []
    for i in range(len(customer_data) - seq_length):
        seq = customer_data.iloc[i:i+seq_length][features]
        target = customer_data.iloc[i+seq_length]['Total_Amount']
        X.append(seq.values)
        y.append(target)
    return np.array(X), np.array(y)

In [98]:
# Prepare data for a specific customer
customer_id = 48453.0  # Michelle Harrington's ID
X, y = create_sequences(ts_data, customer_id)

if X is not None:
    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    model = Sequential([
        Input(shape=(X_train.shape[1], X_train.shape[2])),
        LSTM(64, activation='relu', return_sequences=True),
        Dropout(0.3),
        LSTM(32, activation='relu'),
        Dense(1)
    ])

    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=32,
        validation_data=(X_test, y_test),
        callbacks=[EarlyStopping(patience=10, restore_best_weights=True)]
    )

    y_pred = model.predict(X_test)
    y_test_actual = scaler.inverse_transform(
        np.concatenate([y_test.reshape(-1, 1), np.zeros((len(y_test), len(numeric_cols) - 1))], axis=1)
    )[:, 0]
    y_pred_actual = scaler.inverse_transform(
        np.concatenate([y_pred, np.zeros((len(y_pred), len(numeric_cols) - 1))], axis=1)
    )[:, 0]

    plt.figure(figsize=(12, 6))
    plt.plot(y_test_actual, label='Actual Spending')
    plt.plot(y_pred_actual, label='Predicted Spending', linestyle='--')
    plt.title(f'Spending Prediction for Customer {customer_id}')
    plt.ylabel('Amount ($)')
    plt.xlabel('Days')
    plt.legend()
    plt.show()

    # Forecast next 7 days
    last_sequence = X[-1]
    forecast = []
    for _ in range(7):
        next_pred = model.predict(last_sequence.reshape(1, *last_sequence.shape))
        forecast.append(next_pred[0, 0])

        new_row = last_sequence[-1].copy()
        new_row[0] = next_pred[0, 0]  # Update Total_Amount
        new_row[1] = (new_row[1] + 1) % 7  # Update DayOfWeek
        new_row[2] = 1 if new_row[1] in [5, 6] else 0  # Update IsWeekend
        last_sequence = np.vstack([last_sequence[1:], new_row])

    forecast_amounts = scaler.inverse_transform(
        np.concatenate([np.array(forecast).reshape(-1, 1),
                        np.zeros((7, len(numeric_cols) - 1))], axis=1)
    )[:, 0]

    print("\nNext 7 Days Forecast:")
    for i, amount in enumerate(forecast_amounts):
        print(f"Day {i + 1}: ${amount:.2f}")

else:
    print(f"Not enough data for customer {customer_id} (need at least 30 transactions)")


ValueError: could not convert string to float: 'Medium'

In [68]:
ts_data[ts_data['Customer_ID'] == 37249.0].shape[0]


5

In [70]:
customer_counts = ts_data['Customer_ID'].value_counts()
eligible_customers = customer_counts[customer_counts >= 30].index.tolist()

print(eligible_customers[:5])  # Shows first 5 customers with >= 30 transactions


[]


In [72]:
print(ts_data['Customer_ID'].value_counts().describe())


count    86753.000000
mean         3.430913
std          1.728154
min          1.000000
25%          2.000000
50%          3.000000
75%          4.000000
max         13.000000
Name: count, dtype: float64


In [74]:
eligible_customers = customer_counts[customer_counts >= 10].index.tolist()
print(eligible_customers[:5])


[48453.0, 49274.0, 99355.0, 60341.0, 35276.0]


In [94]:
print(X_train.dtype)
print(y_train.dtype)


object
float64
