#Import Library

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import Basic and most-used Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from google.colab import auth
import gspread
from google.auth import default

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import RobustScaler, OneHotEncoder

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

#Import Dataset

In [None]:
# Authenticate the Colab user with Google, then authorize a gspread client
# with the default credentials of that session
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [None]:
# Open the training spreadsheet by URL and select its 'Sheet2' worksheet
sh = gc.open_by_url("...")
ws = sh.worksheet('Sheet2')

In [None]:
# Load every record of the worksheet into a DataFrame and preview the first rows.
# NOTE(review): gspread's get_all_records() returns blank cells as '' by default, so
# missing values may not show up in isna() later on — confirm against the sheet.
df = pd.DataFrame(ws.get_all_records())
pd.set_option("display.max_columns", None)  # display all columns instead of truncating
df.head()

In [None]:
# Column dtypes and non-null counts of the raw data
df.info()

In [None]:
# Remove the identifier / bookkeeping columns that are not used as features
unused_columns = ['customer_id', 'dari segmen mana', '...']
df = df.drop(columns=unused_columns)

... account dikeluarkan dari dataset karena semua nilainya 0

In [None]:
# Display the frame after dropping the unused columns, with every column visible
pd.set_option("display.max_columns", None)
df

In [None]:
# (rows, columns) of the working frame
df.shape

In [None]:
# Column dtypes and non-null counts after dropping the unused columns
df.info()

In [None]:
# Summary statistics of the frame
df.describe()

In [None]:
# Distinct values of the target column
df['label'].unique()

In [None]:
# Number of rows per target class (class balance check)
df['label'].value_counts()

In [None]:
# Numeric feature columns (used by the box plots and the outlier filter)
num_columns = [
    '...',
]

# Categorical feature columns
cat_columns = [
    '...',
]

#EDA

##Correlation Heatmap

In [None]:
# Annotated correlation heatmap of the numeric columns.
# numeric_only=True restricts df.corr() to numeric columns: the frame still holds
# string columns at this point (e.g. the not-yet-encoded 'label'), and pandas >= 2.0
# raises on them instead of silently dropping them.
plt.figure(figsize=(20,15))
heatmap = sns.heatmap(df.corr(numeric_only=True), vmin=-1, vmax=1, annot=True, fmt=".2f", linewidth=.5, cmap="crest")
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12)

##Attributes Histogram

In [None]:
# Histogram (30 bins) of every numeric column in the frame
df.hist(bins=30, figsize=(15,15))
plt.show()

##BoxPlot

In [None]:
# One box plot per numeric column on a grid that is 5 plots wide.
# The number of rows is derived from the number of columns (never fewer than the
# original 5), so the grid no longer overflows when there are more than 25 columns.
n_grid_cols = 5
n_grid_rows = max(5, -(-len(num_columns) // n_grid_cols))  # ceiling division

plt.figure(figsize=(20,8))
for position, column in enumerate(num_columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    sns.boxplot(x=df[column])

In [None]:
# Summary statistics of the numeric columns
df[num_columns].describe()

In [None]:
# Summary (count / unique / top / freq) of the categorical columns
df[cat_columns].describe()

#Data Cleansing

##Missing Data

In [None]:
# Number of NULL entries in each column
df.isna().sum()

##Data Duplicate

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows, reporting the duplicate count before and after
dup_before = df.duplicated().sum()
print(f'Jumlah row duplicated sebelum dihapus {dup_before}')

# keep='first' is the default, so the first occurrence of each row survives
df.drop_duplicates(inplace=True)

dup_after = df.duplicated().sum()
print(f'Jumlah row duplicated SETELAH dihapus {dup_after}')

##Outlier Detection

In [None]:
from scipy import stats

In [None]:
# Drop rows that are outliers in any numeric column, i.e. whose value lies
# 3 or more standard deviations from the column mean (|z-score| >= 3).
print(f'Jumlah baris sebelum memfilter outlier: {len(df)}')

filtered_entries = np.ones(len(df), dtype=bool)

for col in num_columns:
    # nan_policy='omit' computes mean/std from the non-missing values only. With the
    # default ('propagate') a single NaN turns every z-score of the column into NaN,
    # and the `< 3` test below would then silently discard every row.
    zscore = np.abs(np.asarray(stats.zscore(df[col], nan_policy='omit'), dtype=float))
    # A missing value is not an outlier: keep the row so it can be imputed later.
    filtered_entries &= (zscore < 3) | np.isnan(zscore)

# Keep only the non-outlier rows; .copy() makes the result an independent frame so
# later column assignments do not trigger SettingWithCopyWarning.
df = df[filtered_entries].copy()

print(f'Jumlah baris setelah memfilter outlier: {len(df)}')

#Feature Engineering

##Label Encoding

The `label` column contains strings, so we need to convert its values into numeric form.

In [None]:
# Encode the string target as integer class codes; the fitted encoder is kept
# so the codes can be mapped back to the original labels
labelencoder = LabelEncoder()
labelencoder.fit(df['label'])
df["label"] = labelencoder.transform(df['label'])

df.head(400)

In [None]:
# Column dtypes and non-null counts after cleaning and label encoding
df.info()

In [None]:
# Number of rows per encoded target class
df['label'].value_counts()

#Data Splitting

In [None]:
# Separate the target from the features
y = df['label']
X = df.drop(columns=['label'])

In [None]:
# 80/20 train/test split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = 42)

#Pipeline

In [None]:
# Builds one preprocessing + estimator pipeline per candidate model.
from sklearn.base import clone

# Based on EDA there are several columns that we choose for Machine Learning Modelling
num_columns = [
    '...'
]

cat_columns = ['...']

# Seed for the stochastic estimators (same value as the train/test split) so that
# the reported baseline metrics are reproducible between runs
RANDOM_STATE = 42

# Numeric features: impute missing values with the mean, then scale with RobustScaler
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler()),
    #('poly', PolynomialFeatures(degree=3, include_bias=False)),
    #('power', PowerTransformer(method='yeo-johnson'))
])

# Categorical features: impute with the most frequent value, then one-hot encode;
# categories unseen during fit are ignored at transform time
categoric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown="ignore"))
])

# Template preprocessor; only the listed columns are passed on to the model
preprocessor = ColumnTransformer([
    ('numeric', numeric_pipeline, num_columns),
    ('categoric', categoric_pipeline, cat_columns)
])

# Pipeline does not copy its steps, so every pipeline gets its own clone of the
# preprocessor. Sharing one instance would let fitting any pipeline overwrite the
# fitted preprocessing state used by all the others (including a pickled one).
pipeSVM = Pipeline([
    ("prep", clone(preprocessor)),
    ("algo", SVC(max_iter=400, probability=True, random_state=RANDOM_STATE))
])

pipeLR = Pipeline([
    ("prep", clone(preprocessor)),
    ("algo", LogisticRegression())
])

pipeKNN = Pipeline([
    ("prep", clone(preprocessor)),
    ("algo", KNeighborsClassifier())
])

pipeDT = Pipeline([
    ("prep", clone(preprocessor)),
    ("algo", DecisionTreeClassifier(random_state=RANDOM_STATE))
])

pipeRF = Pipeline([
    ("prep", clone(preprocessor)),
    ("algo", RandomForestClassifier(random_state=RANDOM_STATE))
])

pipeXG = Pipeline([
    ("prep", clone(preprocessor)),
    ("algo", xgb.XGBClassifier(random_state=RANDOM_STATE))
])

#Base Model (KNN)

In [None]:
# Fit the baseline KNN pipeline (preprocessing + classifier) on the training split
pipeKNN.fit(X_train, y_train)

In [None]:
# KNN predictions on the training split
y_KNN_train = pipeKNN.predict(X_train)

In [None]:
# KNN predictions on the held-out test split
y_KNN_test = pipeKNN.predict(X_test)

In [None]:
# Baseline KNN accuracy on the training split
accuracy_train_KNN_base = accuracy_score(y_train, y_KNN_train)
accuracy_train_KNN_base

In [None]:
# Baseline KNN accuracy on the test split
accuracy_test_KNN_base = accuracy_score(y_test, y_KNN_test)
accuracy_test_KNN_base

In [None]:
# Weighted-average F1 of the baseline KNN on the training split
f1score_train_KNN_base = f1_score(y_train, y_KNN_train, average='weighted')
f1score_train_KNN_base

In [None]:
# Weighted-average F1 of the baseline KNN on the test split
f1score_test_KNN_base = f1_score(y_test, y_KNN_test, average='weighted')
f1score_test_KNN_base

In [None]:
# Per-class precision / recall / F1 of the baseline KNN on the test split
print(classification_report(y_test, y_KNN_test))

In [None]:
# Confusion matrix of the baseline KNN on the test split, drawn as an annotated
# heatmap with the test accuracy in the title
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_KNN_test)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='.0f', linewidths=.5, square=True, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('Actual labels')
ax.set_title(f'Accuracy: {round(accuracy_test_KNN_base, 2)}')
plt.show()

#Base Model Logistic Regression

In [None]:
# Fit the baseline Logistic Regression pipeline on the training split
pipeLR.fit(X_train, y_train)

In [None]:
# Logistic Regression predictions on the training split
y_LR_train = pipeLR.predict(X_train)

In [None]:
# Logistic Regression predictions on the held-out test split
y_LR_test = pipeLR.predict(X_test)

In [None]:
# Baseline Logistic Regression accuracy on the training split
accuracy_train_LR_base = accuracy_score(y_train, y_LR_train)
accuracy_train_LR_base

In [None]:
# Baseline Logistic Regression accuracy on the test split
accuracy_test_LR_base = accuracy_score(y_test, y_LR_test)
accuracy_test_LR_base

In [None]:
# Weighted-average F1 of the baseline Logistic Regression on the training split
f1score_train_LR_base = f1_score(y_train, y_LR_train, average='weighted')
f1score_train_LR_base

In [None]:
# Weighted-average F1 of the baseline Logistic Regression on the test split
f1score_test_LR_base = f1_score(y_test, y_LR_test, average='weighted')
f1score_test_LR_base

In [None]:
# Per-class precision / recall / F1 of the baseline Logistic Regression on the test split
print(classification_report(y_test, y_LR_test))

In [None]:
# Confusion matrix of the baseline Logistic Regression on the test split, drawn as
# an annotated heatmap with the test accuracy in the title
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_LR_test)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='.0f', linewidths=.5, square=True, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('Actual labels')
ax.set_title(f'Accuracy: {round(accuracy_test_LR_base, 2)}')
plt.show()

#Base Model SVM

In [None]:
# Fit the baseline SVM pipeline on the training split.
# NOTE(review): the SVC in this pipeline is created with max_iter=400, so the solver
# may stop before converging — confirm the cap is intended.
pipeSVM.fit(X_train, y_train)

In [None]:
# SVM predictions on the training split
y_SVM_train = pipeSVM.predict(X_train)

In [None]:
# SVM predictions on the held-out test split
y_SVM_test = pipeSVM.predict(X_test)

In [None]:
# Baseline SVM accuracy on the training split
accuracy_train_SVM_base = accuracy_score(y_train, y_SVM_train)
accuracy_train_SVM_base

In [None]:
# Baseline SVM accuracy on the test split
accuracy_test_SVM_base = accuracy_score(y_test, y_SVM_test)
accuracy_test_SVM_base

In [None]:
# Weighted-average F1 of the baseline SVM on the training split
f1score_train_SVM_base = f1_score(y_train, y_SVM_train, average='weighted')
f1score_train_SVM_base

In [None]:
# Weighted-average F1 of the baseline SVM on the test split
f1score_test_SVM_base = f1_score(y_test, y_SVM_test, average='weighted')
f1score_test_SVM_base

In [None]:
# Per-class precision / recall / F1 of the baseline SVM on the test split
print(classification_report(y_test, y_SVM_test))

In [None]:
# Confusion matrix of the baseline SVM on the test split, drawn as an annotated
# heatmap with the test accuracy in the title
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_SVM_test)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='.0f', linewidths=.5, square=True, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('Actual labels')
ax.set_title(f'Accuracy: {round(accuracy_test_SVM_base, 2)}')
plt.show()

#Base Model (DecisionTreeClassifier)

In [None]:
# Fit the baseline Decision Tree pipeline on the training split
pipeDT.fit(X_train, y_train)

In [None]:
# Decision Tree predictions on the training split
y_DT_train = pipeDT.predict(X_train)

In [None]:
# Decision Tree predictions on the held-out test split
y_DT_test = pipeDT.predict(X_test)

In [None]:
# Baseline Decision Tree accuracy on the training split
accuracy_train_DT_base = accuracy_score(y_train, y_DT_train)
accuracy_train_DT_base

In [None]:
# Baseline Decision Tree accuracy on the test split
accuracy_test_DT_base = accuracy_score(y_test, y_DT_test)
accuracy_test_DT_base

In [None]:
# Weighted-average F1 of the baseline Decision Tree on the training split
f1score_train_DT_base = f1_score(y_train, y_DT_train, average='weighted')
f1score_train_DT_base

In [None]:
# Weighted-average F1 of the baseline Decision Tree on the test split
f1score_test_DT_base = f1_score(y_test, y_DT_test, average='weighted')
f1score_test_DT_base

In [None]:
# Per-class precision / recall / F1 of the baseline Decision Tree on the test split
print(classification_report(y_test, y_DT_test))

In [None]:
# Confusion matrix of the baseline Decision Tree on the test split, drawn as an
# annotated heatmap with the test accuracy in the title
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_DT_test)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='.0f', linewidths=.5, square=True, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('Actual labels')
ax.set_title(f'Accuracy: {round(accuracy_test_DT_base, 2)}')
plt.show()

#Base Model (Random Forest Classifier)

In [None]:
# Fit the baseline Random Forest pipeline on the training split
pipeRF.fit(X_train, y_train)

In [None]:
# Random Forest predictions on the training split
y_RF_train = pipeRF.predict(X_train)

In [None]:
# Random Forest predictions on the held-out test split
y_RF_test = pipeRF.predict(X_test)

In [None]:
# Baseline Random Forest accuracy on the training split
accuracy_train_RF_base = accuracy_score(y_train, y_RF_train)
accuracy_train_RF_base

In [None]:
# Baseline Random Forest accuracy on the test split
accuracy_test_RF_base = accuracy_score(y_test, y_RF_test)
accuracy_test_RF_base

In [None]:
# Weighted-average F1 of the baseline Random Forest on the training split
f1score_train_RF_base = f1_score(y_train, y_RF_train, average='weighted')
f1score_train_RF_base

In [None]:
# Weighted-average F1 of the baseline Random Forest on the test split
f1score_test_RF_base = f1_score(y_test, y_RF_test, average='weighted')
f1score_test_RF_base

In [None]:
# Per-class precision / recall / F1 of the baseline Random Forest on the test split
print(classification_report(y_test, y_RF_test))

In [None]:
# Confusion matrix of the baseline Random Forest on the test split, drawn as an
# annotated heatmap with the test accuracy in the title
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_RF_test)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='.0f', linewidths=.5, square=True, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('Actual labels')
ax.set_title(f'Accuracy: {round(accuracy_test_RF_base, 2)}')
plt.show()

#Base Model (XGBoost Classifier)

In [None]:
# Fit the baseline XGBoost pipeline on the training split
pipeXG.fit(X_train, y_train)

In [None]:
# XGBoost predictions on the training split
y_XG_train = pipeXG.predict(X_train)

In [None]:
# XGBoost predictions on the held-out test split
y_XG_test = pipeXG.predict(X_test)

In [None]:
# Baseline XGBoost accuracy on the training split
accuracy_train_XG_base = accuracy_score(y_train, y_XG_train)
accuracy_train_XG_base

In [None]:
# Baseline XGBoost accuracy on the test split
accuracy_test_XG_base = accuracy_score(y_test, y_XG_test)
accuracy_test_XG_base

In [None]:
# Weighted-average F1 of the baseline XGBoost on the training split
f1score_train_XG_base = f1_score(y_train, y_XG_train, average='weighted')
f1score_train_XG_base

In [None]:
# Weighted-average F1 of the baseline XGBoost on the test split
f1score_test_XG_base = f1_score(y_test, y_XG_test, average='weighted')
f1score_test_XG_base

In [None]:
# Per-class precision / recall / F1 of the baseline XGBoost on the test split
print(classification_report(y_test, y_XG_test))

In [None]:
# Confusion matrix of the baseline XGBoost on the test split, drawn as an
# annotated heatmap with the test accuracy in the title
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_XG_test)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='.0f', linewidths=.5, square=True, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('Actual labels')
ax.set_title(f'Accuracy: {round(accuracy_test_XG_base, 2)}')
plt.show()

#Predict

##Save Model

In [None]:
import pickle

In [None]:
# Save the fitted Logistic Regression pipeline to a file
with open('pipeLR.pkl', 'wb') as file:
    pickle.dump(pipeLR, file)

In [None]:
# Load the pipeline back from the file.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created or trust.
with open('pipeLR.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [None]:
# Open the spreadsheet and worksheet that hold the new data to score
sh1 = gc.open_by_url("...")
ws1 = sh1.worksheet('...')

In [None]:
# Load every record of the new-data worksheet into a DataFrame and preview the first rows
X_new = pd.DataFrame(ws1.get_all_records())
pd.set_option("display.max_columns", None)  # display all columns instead of truncating
X_new.head()

##Model Prediction

In [None]:
# Predict the class of each new row with the reloaded pipeline
predictions = loaded_model.predict(X_new)

In [None]:
# Display the predicted classes
predictions

In [None]:
# Attach the predicted class to each new row.
# NOTE(review): the model was trained on the LabelEncoder-encoded target, so these are
# integer codes; labelencoder.inverse_transform(predictions) would presumably recover
# the original label names — confirm which form the export should contain.
X_new['predictions'] = predictions

In [None]:
# Class-membership probabilities for each new row.
# NOTE(review): X_new already carries the extra 'predictions' column at this point;
# presumably the ColumnTransformer only selects num_columns/cat_columns and ignores it — confirm.
proba = loaded_model.predict_proba(X_new)

In [None]:
# Display the probability matrix (one row per new record, one column per class)
proba

In [None]:
# Combine the class probabilities with the scored dataset.
# The probabilities are wrapped in their own DataFrame and joined column-wise with
# pd.concat: np.concatenate on a mixed-type frame would coerce every column to
# `object`. The probability column names are derived from the number of columns
# in `proba` instead of assuming exactly four classes.
proba_columns = [f'Probabilitas Kelas {i}' for i in range(1, proba.shape[1] + 1)]
proba_df = pd.DataFrame(proba, columns=proba_columns)

# Positional join on a fresh 0..n-1 index, matching the row order of X_new
df_combined = pd.concat([X_new.reset_index(drop=True), proba_df], axis=1)

# Output column names: the dataset columns followed by the probability columns.
# Raises ValueError if the list length does not match the combined frame.
columns = ['...'] + proba_columns
df_combined.columns = columns

# Show the combined dataframe
print(df_combined)

In [None]:
# Display the combined frame (features, prediction and class probabilities)
df_combined

##Export ke google drive

In [None]:
# Save the DataFrame to an Excel file (without the index column).
# NOTE(review): the path is relative, so the file is written to the current working
# directory rather than under the Drive mount at /content/drive — confirm whether it
# should be saved to Google Drive as the section title says.
df_combined.to_excel('predicted.xlsx', index=False)

In [None]:
# Signal that the notebook ran to the end
print("Done")

Done
