This assignment covers data preprocessing with datasets containing overlapping but different features. The goal is to augment, merge, and enhance the data while ensuring consistency in a machine learning pipeline.

TASK 1: Data Augmentation on CSV Files

In [1]:
# Loading the dataset
import pandas as pd

# Location of the raw transactions file, relative to the notebook's working directory.
file_path = '../initial_dataset/customer_transactions.csv'

# Read the CSV into `data`, the frame that the following cells impute and augment.
data = pd.read_csv(file_path)

ModuleNotFoundError: No module named 'pandas'

In [47]:
# Bare last expression: the notebook renders the loaded frame for a visual check.
data

Unnamed: 0,customer_id_legacy,transaction_id,purchase_amount,purchase_date,product_category,customer_rating
0,151,1001,408,2024-01-01,Sports,2.3
1,192,1002,332,2024-01-02,Electronics,4.2
2,114,1003,442,2024-01-03,Electronics,2.1
3,171,1004,256,2024-01-04,Clothing,2.8
4,160,1005,64,2024-01-05,Clothing,1.3
...,...,...,...,...,...,...
145,102,1146,88,2024-05-25,Sports,2.7
146,100,1147,387,2024-05-26,Books,4.6
147,104,1148,409,2024-05-27,Clothing,1.4
148,189,1149,178,2024-05-28,Sports,3.0


In [51]:
# Per-column count of missing values in the raw transactions.
print(data.isna().sum())

customer_id_legacy     0
transaction_id         0
purchase_amount        0
purchase_date          0
product_category       0
customer_rating       10
dtype: int64


In [3]:
from sklearn.impute import SimpleImputer


# Handle missing values
# Numerical columns are filled with the column mean, categorical columns with
# the most frequent value (mode).
num_cols = ['purchase_amount', 'customer_rating']
cat_cols = ['product_category']

# One imputer per column group; the numerical group is processed first.
for columns, strategy in ((num_cols, 'mean'), (cat_cols, 'most_frequent')):
    imputer = SimpleImputer(strategy=strategy)
    data[columns] = imputer.fit_transform(data[columns])

In [5]:
# Data Augmentation Strategies

import numpy as np

# Seeded generator so that "Restart Kernel -> Run All" reproduces exactly the
# same augmented dataset (the original unseeded np.random calls did not).
AUGMENTATION_SEED = 42
rng = np.random.default_rng(AUGMENTATION_SEED)

# This cell overwrites `data` in place. Running it a second time would apply
# log1p twice and double the rows again, so refuse to run on data that already
# contains synthetic rows.
if data['transaction_id'].astype(str).str.endswith('_synth').any():
    raise RuntimeError(
        "`data` already contains synthetic rows; re-run the loading and "
        "imputation cells before running the augmentation cell again."
    )

# Apply multiplicative Gaussian noise to purchase_amount (mean 0, std 5%).
# Note: this perturbs the real rows as well, not only the synthetic ones.
data['purchase_amount'] = data['purchase_amount'] * (1 + rng.normal(0, 0.05, len(data)))

# Apply log transformation to purchase_amount (log(1 + x))
data['purchase_amount'] = np.log1p(data['purchase_amount'])

# Expand data by generating synthetic transactions (slightly modify the existing ones)
synthetic_data = data.copy()

# Add a uniform +/-10% variation to purchase_amount for the synthetic rows.
# The factor is applied to the already log-transformed value; customer_rating
# is copied unchanged.
synthetic_data['purchase_amount'] *= rng.uniform(0.9, 1.1, len(synthetic_data))

# Mark synthetic rows; this turns transaction_id into a string for those rows.
synthetic_data['transaction_id'] = synthetic_data['transaction_id'].astype(str) + "_synth"

# Append the synthetic data to the original dataset
data = pd.concat([data, synthetic_data], ignore_index=True)

In [6]:
# Save the augmented dataset
from pathlib import Path

# Single definition of the output location, used for both the write and the read-back.
augmented_path = Path('../augmented_dataset/customer_transactions_augmented.csv')

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
augmented_path.parent.mkdir(parents=True, exist_ok=True)

print("\nExporting augmented dataset...")
data.to_csv(augmented_path, index=False)
print("Augmented dataset saved as 'customer_transactions_augmented.csv'.")

# Read the file back to confirm that what was written can be loaded again.
augmented_data = pd.read_csv(augmented_path)
print(augmented_data)


Exporting augmented dataset...
Augmented dataset saved as 'customer_transactions_augmented.csv'.
     customer_id_legacy transaction_id  purchase_amount purchase_date  \
0                   151           1001         6.005589    2024-01-01   
1                   192           1002         5.767591    2024-01-02   
2                   114           1003         5.985804    2024-01-03   
3                   171           1004         5.578930    2024-01-04   
4                   160           1005         4.177239    2024-01-05   
..                  ...            ...              ...           ...   
295                 102     1146_synth         4.489214    2024-05-25   
296                 100     1147_synth         6.533912    2024-05-26   
297                 104     1148_synth         6.185791    2024-05-27   
298                 189     1149_synth         5.488780    2024-05-28   
299                 113     1150_synth         5.125929    2024-05-29   

    product_category  cus

In [2]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

ModuleNotFoundError: No module named 'numpy'

In [None]:
# Input files for the merge stage: the augmented transactions written earlier
# plus the two raw lookup tables.
transactions_path = "../augmented_dataset/customer_transactions_augmented.csv"
social_profiles_path = "../initial_dataset/customer_social_profiles.csv"
id_mapping_path = "../initial_dataset/id_mapping.csv"

# Load each file into its own frame, in the order listed.
transactions_df, social_profiles_df, id_mapping_df = (
    pd.read_csv(csv_path)
    for csv_path in (transactions_path, social_profiles_path, id_mapping_path)
)

In [None]:
# Show the column names of all three input tables.
# A notebook only renders the last bare expression of a cell, so listing the
# three `.columns` attributes on separate lines displayed only the last one.
for table_name, table in (
    ("transactions_df", transactions_df),
    ("social_profiles_df", social_profiles_df),
    ("id_mapping_df", id_mapping_df),
):
    print(f"{table_name}: {list(table.columns)}")

Perform a complex merge

In [None]:
# Step 2: Clean ID Mapping
# Flag every row whose legacy id has already appeared higher up in the table,
# then keep only the unflagged rows (i.e. the first mapping per legacy id).
is_repeated_legacy_id = id_mapping_df.duplicated(subset=['customer_id_legacy'], keep='first')
id_mapping_df = id_mapping_df[~is_repeated_legacy_id]

In [None]:
# 1: Attach the new-style customer id to every transaction via the legacy id.
#    Left join: transactions without a mapping are kept.
merged_df = pd.merge(transactions_df, id_mapping_df, how="left", on="customer_id_legacy")

# 2: Bring in the social media profile columns through the new-style id.
final_df = pd.merge(merged_df, social_profiles_df, how="left", on="customer_id_new")

# Defaults for rows that ended up without social profile data.
# The medians are computed here, before any value is filled.
profile_defaults = {
    "social_media_platform": "Unknown",
    "engagement_score": final_df["engagement_score"].median(),
    "purchase_interest_score": final_df["purchase_interest_score"].median(),
    "review_sentiment": "Neutral",
}
final_df = final_df.fillna(profile_defaults)

In [None]:

# Preview the first ten rows of the merged frame.
final_df.head(10)

In [None]:

# Per-column count of values still missing after the merge and default fill.
print(final_df.isna().sum())

In [None]:
# Summary of the merged frame: column dtypes and non-null counts.
final_df.info()

Feature Engineering & Transformation

In [None]:
# Compute Customer Engagement Score
# Weighted blend: 60% engagement_score, 40% purchase_interest_score.
final_df["customer_engagement_score"] = (
    0.6 * final_df["engagement_score"] + 0.4 * final_df["purchase_interest_score"])

final_df["purchase_date"] = pd.to_datetime(final_df["purchase_date"])

# Compute Moving Average of Last 3 Transactions
# rolling() walks the rows in their current order. The frame holds the original
# rows followed by the appended synthetic rows, so it is not chronological per
# customer. Sort by date first (stable sort keeps the existing order for equal
# dates); the assignment aligns on the index, so final_df's own row order is
# left unchanged.
final_df["moving_avg_purchase"] = (
    final_df.sort_values("purchase_date", kind="mergesort")
    .groupby("customer_id_legacy")["purchase_amount"]
    .transform(lambda x: x.rolling(3, min_periods=1).mean())
)

# Aggregate Monthly Spending
final_df["purchase_month"] = final_df["purchase_date"].dt.to_period("M")
monthly_spending = (
    final_df.groupby(["customer_id_legacy", "purchase_month"])["purchase_amount"]
    .agg(["sum", "mean"])
    .reset_index()
    .rename(columns={"sum": "monthly_total_spend", "mean": "monthly_avg_spend"})
)

# Merge Monthly Spending Data
# Drop monthly columns left over from a previous run of this cell so that the
# merge does not produce suffixed `_x` / `_y` duplicates.
final_df = (
    final_df.drop(columns=["monthly_total_spend", "monthly_avg_spend"], errors="ignore")
    .merge(monthly_spending, on=["customer_id_legacy", "purchase_month"], how="left")
)

In [None]:
# One-hot encode product_category: the column is replaced by one indicator
# column per category. Because the source column is consumed, this cell cannot
# be re-run on its own output.
final_df = pd.get_dummies(final_df, columns=['product_category'])

In [None]:
# Fit a TF-IDF vectorizer (at most 10 terms) on review_sentiment, treating
# missing values as empty strings.
# NOTE(review): tfidf_matrix is not referenced by any later cell visible in
# this notebook - confirm whether these features were meant to be joined onto
# final_df before export.
vectorizer = TfidfVectorizer(max_features=10)
tfidf_matrix = vectorizer.fit_transform(final_df['review_sentiment'].fillna(''))

In [None]:

print(final_df.duplicated().sum())  # Count duplicate rows
print(final_df.isnull().sum())  # Count missing values per column
final_df.info()  # Column dtypes and non-null counts

In [None]:
# Remove exact duplicate rows, then expose the rolling mean under the name
# `avg_spent_last_3`.
# .copy() makes the de-duplicated frame independent of the frame it was taken
# from, so adding a column below does not trigger SettingWithCopyWarning.
final_df = final_df.drop_duplicates().copy()
final_df['avg_spent_last_3'] = final_df['moving_avg_purchase']  # Create the column
# Fill any remaining gaps with the column median.
final_df['avg_spent_last_3'] = final_df['avg_spent_last_3'].fillna(final_df['avg_spent_last_3'].median())

In [None]:

# Histogram (50 bins) with a KDE overlay of the purchase_amount column.
ax = sns.histplot(final_df['purchase_amount'], bins=50, kde=True)
ax.set_title("Distribution of Purchase Amounts")
plt.show()

In [None]:
# Correlation is only defined for numeric data, so restrict to numeric columns.
df_numeric = final_df.select_dtypes(include="number")

# Pairwise correlations, drawn as an annotated heatmap.
corr_matrix = df_numeric.corr()
sns.heatmap(data=corr_matrix, cmap="coolwarm", annot=True)
plt.show()

In [None]:

# Export the final feature table (without the index) to the notebook's working directory.
final_df.to_csv("final_customer_data_10.csv", index=False)