Tugas Besar Data Mining

Data Understanding


1. Read Data

In [32]:
import pandas as pd

# Load the retail supply-chain sales workbook into a DataFrame.
df = pd.read_excel(io='Retail-Supply-Chain-Sales-Dataset.xlsx')


2. Karakteristik Data

In [None]:
df.head(5)  # Preview the first five rows (the same count head() uses by default)


In [None]:
df.info() # Show information about the columns and their data types

In [None]:
df.describe() # Show summary statistics such as count, mean, std, min, max, etc.

In [None]:
df[['Row ID','Sales','Quantity','Discount','Profit']].describe() # Summary statistics for the selected numeric columns (not categorical data). NOTE(review): 'Row ID' looks like an identifier, so its statistics are probably not meaningful - confirm.

In [None]:
# Show both the column index and the (rows, columns) shape.
# A notebook cell only renders its last bare expression, so a bare
# `df.columns` followed by `df.shape` would silently discard the column list.
print(df.columns)
print(df.shape)

3. Premodeling

3.1 Labelling Data

In [69]:
# Separate the 'Sales' column (y) from all remaining columns (x).
y = df['Sales']
x = df.drop(columns=['Sales'])

In [None]:
# Count the rows flagged by DataFrame.duplicated() (default settings).
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print('Duplikat yang ditemukan:\n',duplicates)

In [None]:
# Number of missing values in each column.
data_null = df.isnull().sum()
print(data_null)

In [None]:
import matplotlib.pyplot as plt

# Keep only the numeric columns; each one is drawn as a horizontal box.
numerik_data = df.select_dtypes(include='number')

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(numerik_data.values, labels=numerik_data.columns, vert=False)
ax.set_title('Boxplot Semua Kolom Outlier')
ax.set_xlabel('Nilai')

plt.show()

**Data Preparation**

In [None]:
from sklearn.preprocessing import LabelEncoder

# --- Data-quality report: duplicated rows and missing values -----------------
data_duplikasi = df[df.duplicated()]
jumlah_duplikasi = df.duplicated().sum()
print("Data yang duplikat yaitu:\n", data_duplikasi)
print("Jumlah data duplikat adalah", jumlah_duplikasi)

missingvaluesum = df.isnull().sum()
print("Jumlah nilai null pada data untuk masing-masing variabel yaitu\n", missingvaluesum)

# --- Column selection ---------------------------------------------------------
# Columns that are not used as clustering inputs.
columns_to_drop = [
    "Order Date", "Ship Date", "Order ID",
    "Customer ID", "Customer Name", "Country",
    "Postal Code", "Retail Sales People",
    "Product ID"]

df_cleaned = df.drop(columns=columns_to_drop)
print(df_cleaned.head())

# --- Integer encoding of the two categorical clustering inputs ----------------
# Use one encoder per column. Refitting a single LabelEncoder on the second
# column overwrites its classes_, which would make it impossible to map the
# 'Sub-Category (Numeric)' codes back to their names later.
subcategory_encoder = LabelEncoder()
state_encoder = LabelEncoder()
df_cleaned['Sub-Category (Numeric)'] = subcategory_encoder.fit_transform(df_cleaned['Sub-Category'])
df_cleaned['State (Numeric)'] = state_encoder.fit_transform(df_cleaned['State'])

# Backward-compatible alias: the original single encoder ended up fitted on 'State'.
label_encoder = state_encoder

# Feature matrices for the two clustering study cases.
X_Produk = df_cleaned[['Sub-Category (Numeric)', 'Profit']]
X_State = df_cleaned[['State (Numeric)', 'Sales']]



**Studycase 1 Segmentasi Produk**

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: record the K-Means inertia for k = 1..10 on the product features.
k_values = range(1, 11)
inertia = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_Produk)
    inertia.append(kmeans.inertia_)

# Plot inertia against k.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, inertia, marker='o')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal k')
plt.show()

In [None]:
# Fit the product segmentation model with the chosen number of clusters.
optimal_k = 3
kmeans_produk = KMeans(n_clusters=optimal_k, random_state=42)
kmeans_produk.fit(X_Produk)

# Store each row's cluster label next to the data.
df_cleaned['Cluster_Produk'] = kmeans_produk.predict(X_Produk)

# One scatter series per cluster, then the centroids on top.
fig, ax = plt.subplots(figsize=(8, 6))
for cluster in range(optimal_k):
    in_cluster = df_cleaned['Cluster_Produk'] == cluster
    cluster_data = df_cleaned[in_cluster]
    ax.scatter(
        cluster_data['Sub-Category (Numeric)'],
        cluster_data['Profit'],
        alpha=0.7,
        label=f'Cluster {cluster}',
    )

centers_produk = kmeans_produk.cluster_centers_
ax.scatter(
    centers_produk[:, 0],
    centers_produk[:, 1],
    c='red',
    s=200,
    alpha=0.9,
    label='Centroids',
)

ax.set_title('K-Means Clustering (Sub-Category vs Profit)')
ax.set_xlabel('Sub-Category (Numeric)')
ax.set_ylabel('Profit')
ax.legend()
plt.show()


Evaluasi

In [None]:
from sklearn.metrics import silhouette_score
# Mean silhouette coefficient of the fitted product clustering.
product_labels = kmeans_produk.labels_
silhouette_avg = silhouette_score(X_Produk, product_labels)
print(f"Average Silhouette Score for k={optimal_k}: {silhouette_avg}")

Hyperparameter Tuning

In [None]:
# Search space for the manual grid search over K-Means settings.
param_grid = {
    'n_clusters': range(2, 8),
    'init': ['k-means++', 'random'],
    'max_iter': [300, 500, 1000],
}

# Best configuration seen so far; -1 is the lowest possible silhouette score.
best_params, best_score, best_model = None, -1, None

for n_clusters in param_grid['n_clusters']:
    for init in param_grid['init']:
        for max_iter in param_grid['max_iter']:
            kmeans = KMeans(
                n_clusters=n_clusters,
                init=init,
                max_iter=max_iter,
                random_state=42,
            ).fit(X_Produk)
            labels = kmeans.labels_
            score = silhouette_score(X_Produk, labels)
            print(f"n_clusters={n_clusters}, init={init}, max_iter={max_iter}, silhouette_score={score:.3f}")

            # Keep the first configuration that reaches a strictly higher score.
            if score > best_score:
                best_params = dict(n_clusters=n_clusters, init=init, max_iter=max_iter)
                best_score, best_model = score, kmeans

print("\nBest Parameters:", best_params)
print("Best Silhouette Score:", best_score)

In [None]:
# Refit the product clustering with a hard-coded configuration
# (n_clusters=2, init='k-means++', max_iter=300).
# NOTE(review): presumably the grid-search winner - confirm against the printed best parameters.
kmeans_produk_tuning = KMeans(random_state=42, max_iter=300, init='k-means++', n_clusters=2)
kmeans_produk_tuning.fit(X_Produk)

df_cleaned['Cluster_Produk_Tuning'] = kmeans_produk_tuning.predict(X_Produk)

# Silhouette score of the refitted model.
silhouette_avg = silhouette_score(X_Produk, df_cleaned['Cluster_Produk_Tuning'])
print(f"Silhouette Score: {silhouette_avg:.3f}")

# One scatter series per cluster label (0 and 1), then the centroids.
fig, ax = plt.subplots(figsize=(8, 6))
for cluster in (0, 1):
    in_cluster = df_cleaned['Cluster_Produk_Tuning'] == cluster
    cluster_data = df_cleaned[in_cluster]
    ax.scatter(
        cluster_data['Sub-Category (Numeric)'],
        cluster_data['Profit'],
        alpha=0.7,
        label=f'Cluster {cluster}',
    )

centers_produk = kmeans_produk_tuning.cluster_centers_
ax.scatter(
    centers_produk[:, 0],
    centers_produk[:, 1],
    c='red',
    s=200,
    alpha=0.9,
    label='Centroids',
)

ax.set_title('Clustering with Best Parameters')
ax.set_xlabel('Sub-Category')
ax.set_ylabel('Profit')
ax.legend()
plt.show()


In [None]:
# Export product name, sub-category and 'Cluster_Produk' label, ordered by cluster.
export_columns = ['Product Name', 'Sub-Category', 'Cluster_Produk']
result_table = df_cleaned[export_columns]
result_table_sorted = result_table.sort_values(by='Cluster_Produk')
result_table_sorted.to_excel(excel_writer='result_table_cluster_produk.xlsx', index=False)
print("File berhasil disimpan sebagai 'result_table_cluster_produk.xlsx'")

**Studycase 2. Clustering State Pelanggan**

In [None]:
# Elbow method: record the K-Means inertia for k = 1..10 on the state features.
k_values = range(1, 11)
inertia = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_State)
    inertia.append(kmeans.inertia_)

# Plot inertia against k.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, inertia, marker='o')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal k')
plt.show()


In [None]:
# Fit the state clustering model with the chosen number of clusters.
optimal_k = 3
kmeans_state = KMeans(n_clusters=optimal_k, random_state=42)
kmeans_state.fit(X_State)

# Store each row's cluster label next to the data.
df_cleaned['Cluster_State'] = kmeans_state.predict(X_State)

# One scatter series per cluster, then the centroids on top.
fig, ax = plt.subplots(figsize=(10, 6))
for cluster in range(optimal_k):
    in_cluster = df_cleaned['Cluster_State'] == cluster
    cluster_data = df_cleaned[in_cluster]
    ax.scatter(
        cluster_data['State (Numeric)'],
        cluster_data['Sales'],
        alpha=0.7,
        label=f'Cluster {cluster}',
    )

centers_state = kmeans_state.cluster_centers_
ax.scatter(
    centers_state[:, 0],
    centers_state[:, 1],
    c='red',
    s=200,
    alpha=0.9,
    label='Centroids',
)

ax.set_title('K-Means Clustering (State vs Sales)')
ax.set_xlabel('State (Encoded)')
ax.set_ylabel('Sales')
ax.legend()
plt.show()


In [None]:
# Mean silhouette coefficient of the fitted state clustering.
state_labels = kmeans_state.labels_
silhouette_avg = silhouette_score(X_State, state_labels)
print(f"Average Silhouette Score for k={optimal_k}: {silhouette_avg}")


In [None]:
# Search space for the manual grid search over K-Means settings.
param_grid = {
    'n_clusters': range(2, 8),
    'init': ['k-means++', 'random'],
    'max_iter': [300, 500, 1000],
}

# Best configuration seen so far; -1 is the lowest possible silhouette score.
best_params, best_score, best_model = None, -1, None

for n_clusters in param_grid['n_clusters']:
    for init in param_grid['init']:
        for max_iter in param_grid['max_iter']:
            kmeans = KMeans(
                n_clusters=n_clusters,
                init=init,
                max_iter=max_iter,
                random_state=42,
            ).fit(X_State)
            labels = kmeans.labels_
            score = silhouette_score(X_State, labels)
            print(f"n_clusters={n_clusters}, init={init}, max_iter={max_iter}, silhouette_score={score:.3f}")

            # Keep the first configuration that reaches a strictly higher score.
            if score > best_score:
                best_params = dict(n_clusters=n_clusters, init=init, max_iter=max_iter)
                best_score, best_model = score, kmeans

print("\nBest Parameters:", best_params)
print("Best Silhouette Score:", best_score)


In [None]:
# Refit the state clustering with a hard-coded configuration
# (n_clusters=2, init='random', max_iter=300).
# NOTE(review): presumably the grid-search winner - confirm against the printed best parameters.
kmeans_state_tuning = KMeans(n_clusters=2, init='random', max_iter=300, random_state=42)
kmeans_state_tuning.fit(X_State)

df_cleaned['Cluster_State_Tuning'] = kmeans_state_tuning.predict(X_State)

plt.figure(figsize=(10, 6))
# Iterate over the clusters the fitted model actually has. A hard-coded count
# larger than n_clusters would add empty "Cluster 2"/"Cluster 3" legend entries.
for cluster in range(kmeans_state_tuning.n_clusters):
    cluster_data = df_cleaned[df_cleaned['Cluster_State_Tuning'] == cluster]
    plt.scatter(
        cluster_data['State (Numeric)'],
        cluster_data['Sales'],
        label=f'Cluster {cluster}',
        alpha=0.7
    )

# Overlay the cluster centres (column 0 = encoded state, column 1 = sales,
# matching the column order of X_State).
centers = kmeans_state_tuning.cluster_centers_
plt.scatter(
    centers[:, 0],
    centers[:, 1],
    c='red',
    s=200,
    alpha=0.9,
    label='Centroids'
)
plt.title('K-Means Clustering (State vs Sales)')
plt.xlabel('State (Encoded)')
plt.ylabel('Sales')
plt.legend()
plt.show()


In [None]:
# Silhouette score of the refitted state clustering, using the labels stored in df_cleaned.
tuned_state_labels = df_cleaned['Cluster_State_Tuning']
silhouette_avg = silhouette_score(X_State, tuned_state_labels)
print(f"Silhouette Score: {silhouette_avg:.3f}")

In [None]:
# Export state, sales and 'Cluster_State' label, ordered by cluster.
export_columns = ['State', 'Sales', 'Cluster_State']
result_table = df_cleaned[export_columns]
result_table_sorted = result_table.sort_values(by='Cluster_State')
result_table_sorted.to_excel(excel_writer='result_table_cluster_state.xlsx', index=False)
print("File berhasil disimpan sebagai 'result_table_cluster_state.xlsx'")

**Studycase 3. Prediksi Pengembalian Produk oleh Pelanggan**

**MODELING**

1. Melakukan transformasi data One-Hot Encoding (pd.get_dummies) untuk kolom kategorikal.

In [13]:
import pandas as pd
import numpy as np
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score ,recall_score

In [None]:
# Columns excluded before building the return-prediction dataset.
columns_to_drop = [
    "Row ID",
    "Order Date",
    "Ship Date",
    "Order ID",
    "Customer ID",
    "Customer Name",
    "Country",
    "Postal Code",
    "Retail Sales People",
    "Product ID",
    "Product Name",
]

# Rebuild df_cleaned from the raw frame (this replaces the clustering version).
df_cleaned = df.drop(columns_to_drop, axis=1)
print(df_cleaned.head())

In [None]:
# One-hot encode every categorical column, including the 'Returned' target.
df_encoded = pd.get_dummies(df_cleaned, columns=['Ship Mode', 'Segment', 'City', 'State', 'Region', 'Category', 'Sub-Category', 'Returned'])
# Preview only the first rows. Rendering the entire encoded frame as a text
# table prints every row and every dummy column, which floods the notebook.
print(tabulate(df_encoded.head(), headers='keys', tablefmt='pretty'))

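2. Melakukan pemilihan fitur untuk model: variabel independen (x) adalah seluruh kolom pada df_encoded selain kolom Returned (termasuk Sales, Quantity, Discount, Profit, serta hasil encoding Ship Mode, Segment, City, State, Region, Category, dan Sub-Category), sedangkan variabel dependen (y) adalah Returned.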

In [None]:
# Features: every encoded column except the two one-hot target columns.
# Target: both 'Returned_*' indicator columns, in the order Not, Yes.
target_columns = ['Returned_Not', 'Returned_Yes']
x = df_encoded.drop(columns=target_columns)
y = df_encoded[target_columns]
df_encoded.head()

3. Membagi data independen dan dependen menjadi data latih, data simulasi, dan data uji, dengan pembagian 80% train, 10% simulasi, dan 10% test.

In [None]:
# Share of the held-out 20% that becomes the test set; the other half is the simulation set.
temp_size = 0.5  # 10% / (10% + 10%)

# Stage 1: 80% training, 20% held out.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, random_state=42, test_size=0.2
)
# Stage 2: split the held-out part into simulation and test sets.
x_simulation, x_test, y_simulation, y_test = train_test_split(
    x_temp, y_temp, random_state=42, test_size=temp_size
)

print("Training set size:", x_train.shape)
print("Validation set size:", x_simulation.shape)
print("Test set size:", x_test.shape)

4. Melatih model dengan algoritma Random Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Train the random forest on the training split (fit() returns the estimator itself).
rf_model = RandomForestClassifier(random_state=42).fit(x_train, y_train)
rf_model

**EVALUATION**

Melakukan uji metrik terhadap model, menggunakan accuracy, precision, recall, F1-score, dan confusion matrix.


1. Accuracy mengukur seberapa banyak prediksi model yang benar dibandingkan dengan total jumlah prediksi yang dilakukan.

In [None]:
# Predict on the test split and score against the two-column indicator target.
y_pred = rf_model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy = {accuracy}')

2. Precision mengukur seberapa akurat model dalam memprediksi kelas positif (misalnya, produk yang dikembalikan).

In [None]:
# Micro-averaged precision over the target indicator columns.
prec = precision_score(y_true=y_test, y_pred=y_pred, average='micro')
print(f'Precision Score = {prec}')

3. Recall mengukur seberapa baik model dalam mendeteksi kelas positif. 

In [None]:
# Micro-averaged recall over the target indicator columns.
rec = recall_score(y_true=y_test, y_pred=y_pred, average='micro')
print(f'Recall Score = {rec}')

4. F1-Score adalah rata-rata harmonis antara precision dan recall, dan memberikan gambaran yang lebih seimbang antara keduanya.

In [None]:
# Micro-averaged F1 over the target indicator columns.
f1score = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
print(f'f1_score = {f1score}')

5. Confusion matrix adalah tabel yang menggambarkan hasil prediksi model dengan membandingkan nilai yang diprediksi dan nilai yang sebenarnya.

In [None]:
# Reduce the two-column target to its 'Returned_Yes' column so that a
# single binary confusion matrix can be computed.
y_test_binary = y_test.loc[:, 'Returned_Yes']
# Prediction column 1 lines up with 'Returned_Yes', the second column of y.
y_pred_binary = y_pred[:, 1]
conf_mat = confusion_matrix(y_true=y_test_binary, y_pred=y_pred_binary)
print(f'Confusion Matrix = {conf_mat}')

Evaluation ke Data Simulasi

In [None]:
# Score the random forest on the simulation split.
y_simulation_pred = rf_model.predict(x_simulation)
simulation_accuracy = accuracy_score(y_true=y_simulation, y_pred=y_simulation_pred)
print(f'Simulation Accuracy = {simulation_accuracy}')

**Support Vector Machine**

In [None]:
from sklearn.svm import SVC
# Rebuild the features and use a single binary target column for the SVM.
y = df_encoded['Returned_Yes']
x = df_encoded.drop(columns=['Returned_Yes', 'Returned_Not'])

# Same two-stage split as before: 80% training, then the held-out 20% is halved.
temp_size = 0.5
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, random_state=42, test_size=0.2
)
x_simulation, x_test, y_simulation, y_test = train_test_split(
    x_temp, y_temp, random_state=42, test_size=temp_size
)

print("Training set size:", x_train.shape)
print("Validation set size:", x_simulation.shape)
print("Test set size:", x_test.shape)

In [16]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply them
# to both the training and the test features.
# NOTE(review): x_simulation is not transformed here.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Train a linear-kernel SVM with probability estimates enabled (fit() returns the estimator).
svc_model = SVC(random_state=42, probability=True, kernel='linear').fit(x_train, y_train)
svc_model

In [None]:
# Evaluate the SVM on the scaled test split. Every metric below, including
# the confusion matrix, is computed from the same hard predictions.
y_pred = svc_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy = {accuracy}')

prec = precision_score(y_test, y_pred, average='binary')
print(f'Precision Score = {prec}')

rec = recall_score(y_test, y_pred, average='binary')
print(f'Recall Score = {rec}')

f1score = f1_score(y_test, y_pred, average='binary')
print(f'f1_score = {f1score}')

# Use the predict() output for the confusion matrix as well. Thresholding
# predict_proba() at 0.5 is avoided because SVC probability estimates come from
# a separate calibration step and may be inconsistent with predict(), which
# would make the matrix disagree with the metrics printed above.
y_pred_binary = y_pred
conf_mat = confusion_matrix(y_test, y_pred_binary)
print(f'Confusion Matrix = {conf_mat}')

**Naive-Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB
# Rebuild the unscaled features and the single binary target for Naive Bayes.
y = df_encoded['Returned_Yes']
x = df_encoded.drop(columns=['Returned_Yes', 'Returned_Not'])

# Same two-stage split as before: 80% training, then the held-out 20% is halved.
temp_size = 0.5
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, random_state=42, test_size=0.2
)
x_simulation, x_test, y_simulation, y_test = train_test_split(
    x_temp, y_temp, random_state=42, test_size=temp_size
)

print("Training set size:", x_train.shape)
print("Validation set size:", x_simulation.shape)
print("Test set size:", x_test.shape)

In [None]:
# Train the Gaussian Naive Bayes classifier (fit() returns the estimator itself).
nb_model = GaussianNB().fit(x_train, y_train)
nb_model

In [None]:
# Evaluate Naive Bayes on the test split; all metrics share the same predictions.
y_pred = nb_model.predict(x_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy = {accuracy}')

prec = precision_score(y_true=y_test, y_pred=y_pred, average='binary')
print(f'Precision Score = {prec}')

rec = recall_score(y_true=y_test, y_pred=y_pred, average='binary')
print(f'Recall Score = {rec}')

f1score = f1_score(y_true=y_test, y_pred=y_pred, average='binary')
print(f'f1_score = {f1score}')

# The predictions are already class labels, so they feed the matrix directly.
y_pred_binary = y_pred
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred_binary)
print(f'Confusion Matrix = {conf_mat}')

**DEPLOYMENT**

Import library yang digunakan

In [26]:
import streamlit as st
import pickle

from PIL import Image

Menyimpan Model di Pickle

In [27]:
# Serialize the three fitted models, one pickle file per model.
models_to_save = {
    'kmeansprodukmodel.pkl': kmeans_produk,
    'kmeansstatemodel.pkl': kmeans_state,
    'rfmodel.pkl': rf_model,
}

for filename, model in models_to_save.items():
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

Inisiasi Variable

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# LabelEncoder and KMeans are used below; import them here so this cell does
# not depend on imports made by earlier cells.
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import pickle
import pandas as pd

# Columns excluded before building the deployment datasets.
columns_to_drop = [
    "Row ID", "Order Date", "Ship Date", "Order ID",
    "Customer ID", "Customer Name", "Country",
    "Postal Code", "Retail Sales People",
    "Product ID", "Product Name"
]


df_cleaned = df.drop(columns=columns_to_drop)

# One encoder per column so that each fitted mapping stays available;
# refitting a single encoder would overwrite the Sub-Category classes.
subcategory_encoder = LabelEncoder()
state_encoder = LabelEncoder()
df_cleaned['Sub-Category (Numeric)'] = subcategory_encoder.fit_transform(df_cleaned['Sub-Category'])
df_cleaned['State (Numeric)'] = state_encoder.fit_transform(df_cleaned['State'])
# Backward-compatible alias: the original single encoder ended up fitted on 'State'.
label_encoder = state_encoder

# Feature matrices for the two clustering study cases.
X_Produk= df_cleaned[['Sub-Category (Numeric)', 'Profit']]
X_State = df_cleaned[['State (Numeric)', 'Sales']]

# Single-step pipeline wrapping the product K-Means model.
kmeansproduk_pipeline = Pipeline(steps=[
    ('kmeansprodukmodel', KMeans(n_clusters=3, random_state=42))
])

kmeansproduk_pipeline.fit(X_Produk)
df_cleaned['Cluster_Produk'] = kmeansproduk_pipeline.named_steps['kmeansprodukmodel'].predict(X_Produk)

with open('kmeansproduk_pipeline.pkl', 'wb') as file:
    pickle.dump(kmeansproduk_pipeline, file)

print("Pipeline untuk K-Means telah dilatih dan disimpan.")

# Categorical and numeric input columns of the classifier.
categorical_columns = ['Ship Mode', 'Segment', 'City', 'State', 'Region', 'Category', 'Sub-Category']
numeric_columns = ['Sales', 'Quantity', 'Discount', 'Profit']

# Separate features and label.
# Select the model inputs explicitly. Dropping only 'Returned' would keep the
# helper columns added above ('Sub-Category (Numeric)', 'State (Numeric)',
# 'Cluster_Produk'); with remainder='passthrough' they would become features
# of the saved pipeline, which would then require them at prediction time.
X = df_cleaned[categorical_columns + numeric_columns]
y = df_cleaned['Returned']

# Split the data into training, testing and simulation sets (80% / 10% / 10%).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
temp_size = 0.5  # Share of the held-out data that becomes the test set
X_simulation, X_test, y_simulation, y_test = train_test_split(X_temp, y_temp, test_size=temp_size, random_state=42)

# Preprocessing: one-hot encode the categorical columns (categories unseen
# during fit are ignored) and standardize the numeric columns.
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', StandardScaler(), numeric_columns)
    ], remainder='passthrough'
)

# Full pipeline: preprocessing followed by the random forest classifier.
rfmodel_pipeline = Pipeline(steps=[
    ('preprocessor', column_transformer),
    ('rfmodel', RandomForestClassifier(random_state=42))
])

# Train the pipeline on the training split.
rfmodel_pipeline.fit(X_train, y_train)

# Save the trained pipeline.
with open('rfmodel_pipeline.pkl', 'wb') as file:
    pickle.dump(rfmodel_pipeline, file)

print("Model telah dilatih dan disimpan.")
