# Import Dataset

In [None]:
# DataFrame
import pandas as pd 

In [None]:
# Load the marketing campaign CSV into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer="marketing_campaign2.csv", encoding="latin")

In [None]:
# Display the loaded DataFrame
data

In [None]:
# Print a concise summary of the DataFrame (pandas `DataFrame.info`)
data.info()

# Data Cleaning

In [None]:
# Helper for inspecting the categorical columns of the dataset
def count_unique_values(column_name):
    """Print how often each distinct value occurs in ``data[column_name]``.

    Reads the module-level DataFrame ``data``; prints the result of
    ``value_counts()`` and returns nothing.
    """
    print(data[column_name].value_counts())

# Print the value counts of each categorical column, separated by blank lines
categorical_columns = ("Education", "Marital_Status")
for position, column in enumerate(categorical_columns):
    if position:
        print("\n")
    count_unique_values(column)

## Convert Categorical to Numerical

In [None]:
# Convert the date column to a numerical format
#data["Dt_Customer"] = pd.to_datetime(data["Dt_Customer"], format="%d-%m-%Y").dt.strftime("%d%m%Y").astype(int)

In [None]:
# Display the DataFrame
data

In [None]:
# Integer code for each Education label (codes start at 1, in list order)
education_levels = ['Graduation', 'PhD', 'Master', '2n Cycle', 'Basic']
education_mapping = {
    level: code for code, level in enumerate(education_levels, start=1)
}

# Integer code for each Marital_Status label (codes start at 1, in list order)
marital_statuses = [
    'Married', 'Together', 'Single', 'Divorced',
    'Widow', 'Alone', 'Absurd', 'YOLO',
]
marital_status_mapping = {
    status: code for code, status in enumerate(marital_statuses, start=1)
}

# Swap the text labels of both columns for their integer codes
for column, mapping in (
    ('Education', education_mapping),
    ('Marital_Status', marital_status_mapping),
):
    data[column] = data[column].replace(mapping)

# Display the updated DataFrame
data

## Check Missing Values

In [None]:
# Count the missing values in every column of the dataset
missing_values = data.isna().sum()

# Show the number of missing values per column
print(missing_values)

In [None]:
# Drop every row whose "Income" value is missing
data = data.dropna(axis="index", subset=["Income"])

# Display the dataset after removing the rows with a missing "Income"
data

In [None]:
# Re-count the missing values in every column of the dataset
missing_values = data.isna().sum()

# Show the number of missing values per column
print(missing_values)

# Langkah d

In [None]:
import datetime

## Customer_For

In [None]:
# Parse "Dt_Customer" (day-month-year text) into datetime values
enrolment_dates = pd.to_datetime(data['Dt_Customer'], format='%d-%m-%Y')
data['Dt_Customer'] = enrolment_dates

# Days each customer has been enrolled, measured up to the most recent
# enrolment date found in the records
latest_date = enrolment_dates.max()
days_enrolled = latest_date - enrolment_dates
data['Customer_For'] = days_enrolled.dt.days

In [None]:
# Display the derived "Customer_For" column
data['Customer_For']

## Age

In [None]:
# Customer age derived from the birth year (Year_Birth), relative to the
# calendar year in which this cell is executed
current_year = datetime.date.today().year
data['Age'] = data['Year_Birth'].rsub(current_year)

In [None]:
# Display the derived "Age" column
data['Age']

## Spent

In [None]:
# Total amount each customer spent, summed across all product categories
categories = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
spending_by_category = data[categories]
data['Spent'] = spending_by_category.sum(axis='columns')

In [None]:
# Display the total "Spent" next to the per-category amounts
data[['Spent', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']]

## Living_With

In [None]:
# Marital_Status codes used below:
#   1 Married, 2 Together, 3 Single, 4 Divorced,
#   5 Widow, 6 Alone, 7 Absurd, 8 YOLO
#
# Living_With code -> the Marital_Status codes that fall under it
living_with_groups = {
    0: [3, 4, 5, 6],  # 'Alone'
    1: [1, 2],        # 'With_Partner'
    2: [7, 8],        # 'Absurd'
}

# Fill "Living_With" one group at a time from the marital status code
for living_with_code, status_codes in living_with_groups.items():
    in_group = data['Marital_Status'].isin(status_codes)
    data.loc[in_group, 'Living_With'] = living_with_code

In [None]:
# Display the DataFrame
data

## Children

In [None]:
# Number of children in the household: kids at home plus teenagers at home
data['Children'] = data['Kidhome'].add(data['Teenhome'])

## Family_Size

In [None]:
# Number of people in the household: the adults implied by Living_With plus
# the children. The customer is always counted, so a household is never
# smaller than 1 (previously a single customer without children got 0 and a
# couple without children got 1).
#   Living_With 0 ('Alone')        -> 1 adult
#   Living_With 1 ('With_Partner') -> 2 adults
#   Living_With 2 ('Absurd')       -> 1 adult
adults_by_living_with = {0: 1, 1: 2, 2: 1}
adults = data['Living_With'].map(adults_by_living_with)

# Rows with any other Living_With value stay NaN, as before; the column is
# kept as float so its dtype is unchanged.
data['Family_Size'] = (data['Children'] + adults).astype(float)

In [None]:
# Display the derived "Family_Size" column
data['Family_Size']

## Is_Parent

In [None]:
# Flag whether the customer is a parent: 1 (yes) when there is at least one
# child in the household, otherwise 0 (no).
# The comparison is vectorised and the flag is stored as an integer directly,
# instead of building '1'/'0' strings row by row that must be cast afterwards.
data['Is_Parent'] = (data['Children'] > 0).astype(int)

In [None]:
# Make sure the Is_Parent flag is stored with an integer dtype
is_parent_as_int = data['Is_Parent'].astype(int)
data = data.assign(Is_Parent=is_parent_as_int)

In [None]:
# Display the child count next to the parent flag
data[['Children', 'Is_Parent']]

## Education

In [None]:
# Display the "Education" column
data['Education']

In [None]:
# Education codes used below:
#   1 Graduation, 2 PhD, 3 Master, 4 2n Cycle, 5 Basic
#
# Education_Category code -> the Education codes that fall under it
education_groups = {
    0: [1],        # 'Bachelor'
    1: [2, 3, 4],  # 'Advanced'
    2: [5],        # 'High School'
}

# Fill "Education_Category" one group at a time from the education code
for category_code, education_codes in education_groups.items():
    in_group = data['Education'].isin(education_codes)
    data.loc[in_group, 'Education_Category'] = category_code

In [None]:
# Display the derived "Education_Category" column
data['Education_Category']

## Menghapus fitur-fitur yang redundan:

In [None]:
# Print a concise summary of the DataFrame (pandas `DataFrame.info`)
data.info()

In [None]:
# Columns removed from the dataset as redundant
redundant_features = [
    'ID',
    'Year_Birth',
    'Dt_Customer',
    'Education',
    'Marital_Status',
    'Kidhome',
    'Teenhome',
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
data = data.drop(columns=redundant_features)

In [None]:
# Print a concise summary of the DataFrame (pandas `DataFrame.info`)
data.info()

# Langkah e

In [None]:
# Display descriptive statistics of the DataFrame (pandas `DataFrame.describe`)
data.describe()

# Langkah f

In [None]:
# Import Library
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Draw one box plot per selected column, each in its own 10x6 figure
selected_columns = ["Income", "Recency", "Customer_For", "Age", "Spent", "Is_Parent"]

for column in selected_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=data, y=column, ax=ax)
    ax.set_title(f"Box Plot of {column}")
    plt.show()


# Langkah g

In [None]:
from scipy import stats

# Calculate z-scores for the columns that are screened for outliers
z_scores_income = stats.zscore(data['Income'])
z_scores_age = stats.zscore(data['Age'])
z_scores_spent = stats.zscore(data['Spent'])

print(z_scores_income)

# A row is kept only when it lies strictly within `threshold` standard
# deviations of the mean in every screened column
threshold = 3
within_income = abs(z_scores_income) < threshold
within_age = abs(z_scores_age) < threshold
within_spent = abs(z_scores_spent) < threshold
filtered_data = data[within_income & within_age & within_spent]

# Report how many rows each column flagged as outliers. Each count comes from
# that column's own mask (previously all three figures repeated the overall
# total). A row can be flagged by more than one column, so the per-column
# counts need not add up to the total number of rows removed.
outliers_removed_income = int((~within_income).sum())
outliers_removed_age = int((~within_age).sum())
outliers_removed_spent = int((~within_spent).sum())
print(f"Number of outliers removed from 'Income': {outliers_removed_income}")
print(f"Number of outliers removed from 'Age': {outliers_removed_age}")
print(f"Number of outliers removed from 'Spent': {outliers_removed_spent}")
print(f"Total number of rows removed: {len(data) - len(filtered_data)}")


## Lihat kembali boxplot pada kolom yang diproses

In [None]:
# Re-draw the box plots for the columns that went through outlier removal,
# this time from the filtered data
selected_columns = ["Income", "Age", "Spent"]

for column in selected_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=filtered_data, y=column, ax=ax)
    ax.set_title(f"Box Plot of {column} (Outliers Removed)")
    plt.show()


In [None]:
# Display the data that remains after outlier removal
filtered_data

# Langkah h

In [None]:
import seaborn as sns

# Non-categorical features whose pairwise correlations are inspected
columns = ['Income', 'Recency', 'Customer_For', 'Age', 'Spent']

# Pairwise correlation of the selected columns in the filtered data
correlation_matrix = filtered_data.loc[:, columns].corr()

# Annotated heatmap with the colour scale fixed to the range [-1, 1]
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Matrix of Non-Categorical Features')
plt.show()


# Langkah i

In [None]:
# Print a concise summary of `data` (note: this is `data`, not `filtered_data`)
data.info()

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

# Offer/promotion-related columns left out of the clustering features
campaign_columns = [
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'AcceptedCmp1',
    'AcceptedCmp2',
    'Complain',
    'Response',
]
subset_df = filtered_data.drop(columns=campaign_columns)

# Standardise the remaining features with StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit(subset_df).transform(subset_df)


In [None]:
# Print a concise summary of the feature subset that was scaled
subset_df.info()

# Langkah j

In [None]:
from sklearn.decomposition import PCA

# Perform dimensionality reduction using PCA with 3 components.
# random_state is fixed (same seed as the K-Means runs in this notebook) so
# the projection is reproducible when scikit-learn selects a randomized SVD
# solver; without it the components could differ between runs.
pca = PCA(n_components=3, random_state=42)
reduced_data = pca.fit_transform(scaled_data)

# Create a new dataframe with the reduced data, one column per component
reduced_df = pd.DataFrame(reduced_data, columns=['PC1', 'PC2', 'PC3'])

In [None]:
# Display the three principal-component columns
reduced_df

# Langkah k

## K-Means Clustering

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Fit K-Means for k = 1..9 and record each model's inertia
k_values = range(1, 10)
wcss = []
for k in k_values:
    # `kmeans` stays bound to the last fitted model once the loop has finished
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(reduced_df)
    wcss.append(kmeans.inertia_)

# Elbow curve: inertia against the number of clusters
fig, ax = plt.subplots()
ax.plot(k_values, wcss, 'bx-')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Curve')
plt.show()


In [None]:
# Cluster the PCA-reduced data and plot the assignments in 3D.
# NOTE(review): `kmeans` is not created in this cell. If it is the object left
# over from the elbow loop, it was built with the last k tried there
# (range(1, 10) ends at k=9), not with a k chosen from the elbow curve.
# Confirm the intended number of clusters.
y_kmeans = kmeans.fit_predict(reduced_df)
# Visualize the clustering result using a scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Data points, coloured by their assigned cluster label
ax.scatter(reduced_df['PC1'], reduced_df['PC2'], reduced_df['PC3'], c=y_kmeans)
# Cluster centres drawn as large yellow markers
ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
           kmeans.cluster_centers_[:, 2], s=300, c='yellow', label='Centroids')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.set_title('K-means Clustering')
plt.legend()
plt.show()

### 3D Visualization

In [None]:
# #Install plotly library if not installed
# %pip install plotly
# import plotly.express as px

# # Create a dataframe for the scatter plot
# scatter_df = reduced_df.copy()
# scatter_df['Cluster'] = y_kmeans

# # Create the interactive 3D scatter plot
# fig = px.scatter_3d(scatter_df, x='PC1', y='PC2', z='PC3', color='Cluster',
#                     symbol='Cluster', opacity=0.8)

# # Add centroids to the plot
# centroid_df = pd.DataFrame(kmeans.cluster_centers_, columns=['PC1', 'PC2', 'PC3'])
# centroid_df['Cluster'] = range(kmeans.n_clusters)
# fig.add_trace(px.scatter_3d(centroid_df, x='PC1', y='PC2', z='PC3', color='Cluster',
#                             symbol='Cluster', size_max=10).data[0])

# # Update layout and axis labels
# fig.update_layout(title='K-means Clustering',
#                   scene=dict(xaxis_title='PC1', yaxis_title='PC2', zaxis_title='PC3'))

# # Show the interactive plot
# fig.show()


## Agglomerative Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Build the Ward-linkage hierarchy and plot its dendrogram
linkage_matrix = linkage(reduced_df, method='ward')
dendrogram(linkage_matrix)
plt.title('Dendrogram')
plt.xlabel('Data Points')
plt.ylabel('Distance')

# Column 2 of the linkage matrix holds the distance at which each merge happened
merge_distances = linkage_matrix[:, 2]

# Anchor the labels at the right edge of the x axis. The x axis spans the
# data points, so a merge distance (a y-axis quantity) is not a meaningful
# x position for them.
label_x = plt.xlim()[1]

# Determine the number of clusters for different height thresholds
heights = [0.1, 5, 10, 25, 50, 80]
for height in heights:
    # Cutting the tree at `height` undoes every merge made above it, and each
    # undone merge adds one cluster. Merges are counted individually: de-duplicating
    # their distances would undercount whenever two merges share the same distance.
    num_clusters = int((merge_distances > height).sum()) + 1
    plt.axhline(y=height, color='r', linestyle='--')
    plt.text(label_x, height, f'Clusters: {num_clusters}', ha='left', va='center')

plt.show()


## DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Run DBSCAN for every (epsilon, min_samples) combination and plot each result
eps_values = [0.3, 0.5, 0.7]
min_samples_values = [2, 5, 7, 9, 10, 15]
parameter_grid = [
    (eps, min_samples)
    for eps in eps_values
    for min_samples in min_samples_values
]

for eps, min_samples in parameter_grid:
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(reduced_df)

    # The label -1 marks noise points and is not counted as a cluster
    num_clusters = len(set(labels) - {-1})

    # 3D scatter of the three components, coloured by DBSCAN label
    fig, ax = plt.subplots(subplot_kw={'projection': '3d'})
    ax.scatter(reduced_df['PC1'], reduced_df['PC2'], reduced_df['PC3'], c=labels)
    ax.set(
        xlabel='PC1',
        ylabel='PC2',
        zlabel='PC3',
        title=f'DBSCAN Clustering (epsilon={eps}, min_samples={min_samples})',
    )
    ax.text2D(0.95, 0.05, f'Clusters: {num_clusters}', ha='right', va='center', transform=ax.transAxes)
    plt.show()


### 3D visualization

In [None]:
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.cluster import DBSCAN

# Run DBSCAN for every (epsilon, min_samples) combination and show each
# result as an interactive 3D scatter plot
eps_values = [0.3, 0.5, 0.7]
min_samples_values = [2, 5, 7, 9, 10, 15]
parameter_grid = [
    (eps, min_samples)
    for eps in eps_values
    for min_samples in min_samples_values
]

for eps, min_samples in parameter_grid:
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(reduced_df)

    # The label -1 marks noise points and is not counted as a cluster
    num_clusters = len(set(labels) - {-1})

    # One marker per data point, coloured by its DBSCAN label
    points = go.Scatter3d(
        x=reduced_df['PC1'],
        y=reduced_df['PC2'],
        z=reduced_df['PC3'],
        mode='markers',
        marker={
            'size': 3,
            'color': labels,
            'colorscale': 'Viridis',
            'opacity': 0.8,
        },
    )
    fig = go.Figure(data=points)

    # Title, axis labels and a data-driven aspect ratio for the 3D scene
    fig.update_layout(
        title=f'DBSCAN Clustering (epsilon={eps}, min_samples={min_samples})',
        scene={
            'xaxis_title': 'PC1',
            'yaxis_title': 'PC2',
            'zaxis_title': 'PC3',
            'aspectmode': 'data',
        },
    )

    # Text annotation with the number of clusters found
    fig.add_annotation(
        x=0.95,
        y=0.05,
        text=f'Clusters: {num_clusters}',
        showarrow=False,
        font={'color': 'black'},
    )

    # Show the interactive plot
    pio.show(fig)
