# 1. Data Preparation

In [None]:

import pandas as pd 
import numpy as np 
import os 
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from pandas and scikit-learn.
warnings.filterwarnings('ignore')
# Display at most 20 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 20) 
from sklearn import svm 
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

# Locations of the training and testing CSV files, relative to the working directory.
datapath = "data training.csv"
datapath_test = "data testing.csv"

# Tell the user whether the training file is present.
print(
    f"data '{datapath}' sudah diupload"
    if os.path.exists(datapath)
    else f"data '{datapath}' belum diupload"
)

# Same check for the testing file.
print(
    f"data '{datapath_test}' sudah diupload"
    if os.path.exists(datapath_test)
    else f"data '{datapath_test}' belum diupload"
)

In [None]:

# Read the training CSV located at `datapath` and preview its first rows.
data = pd.read_csv(datapath)
data.head()

In [None]:

# Size of the training DataFrame as (rows, columns).
data.shape

In [None]:

# Count missing values per column and report every column that has at least one.
# Iterating over the filtered Series covers ALL columns (the previous
# range(len(missing) - 1) loop silently skipped the last one) and avoids
# positional integer lookups on a label-indexed Series, which pandas deprecates.
missing = data.isnull().sum()
for column, n_missing in missing[missing > 0].items():
    print('pada kolom', column, 'terdapat missing value sebanyak', n_missing)

In [None]:

# Keep only the rows that have a 'Gerakan' label and renumber the index from 0.
# reset_index(drop=True) discards the old index directly, so it also works when
# the frame already contains a column named 'index'.
data = data.dropna(subset=['Gerakan']).reset_index(drop=True)

# Re-check every column (including the last one) for remaining missing values.
missing = data.isnull().sum()
for column, n_missing in missing[missing > 0].items():
    print('pada kolom', column, 'terdapat missing value sebanyak', n_missing)

print(data.shape)

In [None]:

# Print the column names, non-null counts and dtypes of the training DataFrame.
data.info()

In [None]:
# 6. Check the data distribution
# Summary statistics (count, mean, std, quartiles, ...) of the training DataFrame.
data.describe()

In [None]:

# Number of rows per 'Gerakan' value, i.e. the class balance of the target column.
data['Gerakan'].value_counts()

In [None]:

# Separate the target column from the feature columns.
y = data['Gerakan']
X = data.drop('Gerakan', axis=1)

# Encode the class labels as integers; `le` is kept so the original class
# names can be recovered later (e.g. for confusion-matrix tick labels).
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Reproducible 80/20 split into training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.20,
    random_state=42,
)

# 2. Model Training

In [None]:

# Candidate estimators and the hyper-parameter grid searched for each one.
# An entry without "params" is fitted with its default settings only.
models = {
    "SVM": {
        "model": svm.LinearSVC(penalty='l2', random_state=42),
        "params": {"C": [1, 2]},
    },
    "Naive Bayes": {
        "model": GaussianNB(),
    },
    "Random Forest": {
        "model": RandomForestClassifier(class_weight='balanced', random_state=42, max_depth=10),
        "params": {"n_estimators": [100, 200]},
    },
}

# One dict of validation metrics per model, filled in by the loop below.
scores = []
for model_name, model_params in models.items():
    model = model_params["model"]
    params = model_params["params"] if "params" in model_params else {}

    print("===" * 15)
    print(f"Training {model_name}")

    # 3-fold grid search on the training part; fit() returns the search object itself.
    clf = GridSearchCV(model, params, cv=3, n_jobs=-1).fit(X_train, y_train)
    print(f"Best params {model_name} : {clf.best_params_}")

    # Score the refitted best estimator on the validation part.
    y_pred = clf.predict(X_val)
    cm = confusion_matrix(y_val, y_pred)
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')

    scores.append({"model": model_name, "accuracy": accuracy, "precision": precision, "recall": recall})
    print(f"Accuracy: {np.round(accuracy,2)}")
    print(f"Precision: {np.round(precision,2)}")
    print(f"Sensitivity: {np.round(recall,2)}")

    # Confusion matrix labelled with the original class names.
    fig, ax = plt.subplots()
    ax.set_title(f"{model_name} - Confusion Matrix")
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
    plt.show()

# 3. Model Evaluation

In [None]:

# Turn the list of per-model metric dicts into a table for side-by-side comparison.
scores_df = pd.DataFrame(scores)
scores_df

Karena model terbaik adalah Random Forest, kita akan membuat model Random Forest untuk diuji pada data test.

In [None]:
# Re-import the estimator sources used in this cell.
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Refit each model type on the complete training data (X, y_encoded) with fixed
# hyper-parameters. fit() returns the estimator, so creation and fitting are chained.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
).fit(X, y_encoded)

nb = GaussianNB().fit(X, y_encoded)

# NOTE(review): the next line rebinds the name `svm` from the sklearn module to
# a LinearSVC instance. Later cells call `svm.predict`, so the name is kept, but
# any cell that needs `svm.LinearSVC` must re-import the module first.
svm = svm.LinearSVC(penalty='l2', random_state=42, C=1)
svm.fit(X, y_encoded)

In [None]:
# Read the testing CSV located at `datapath_test`.
data_test = pd.read_csv(datapath_test)

In [None]:

# Keep only the test rows that have a 'Gerakan' label and renumber the index from 0.
# reset_index(drop=True) discards the old index directly, so it also works when
# the frame already contains a column named 'index'.
data_test = data_test.dropna(subset=['Gerakan']).reset_index(drop=True)

# Report every column (including the last one, which the previous
# range(len(missing) - 1) loop skipped) that still has missing values.
missing = data_test.isnull().sum()
for column, n_missing in missing[missing > 0].items():
    print('pada kolom', column, 'terdapat missing value sebanyak', n_missing)

print(data_test.shape)

In [None]:

# Separate the test features from the test labels.
X_test = data_test.drop(columns=['Gerakan'])
y_test = data_test['Gerakan']

# Encode the test labels with the encoder that was fitted on the TRAINING labels.
# Using transform() (not fit_transform()) keeps the integer codes identical to
# the ones the models were trained on and leaves `le.classes_` untouched; a
# label that never appeared in training raises a ValueError instead of being
# silently mapped to a wrong class.
y_test_encoded = le.transform(y_test)

In [None]:
# Evaluate the Random Forest model on the test set.
# The label is set explicitly so this cell does not depend on whatever value the
# training loop's `model_name` variable happened to hold last.
model_name = 'Random Forest'
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)


# Weighted averages account for the number of samples in each class.
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='weighted')
recall = recall_score(y_test_encoded, y_pred, average='weighted')

cm = confusion_matrix(y_test_encoded, y_pred)

print("==="*15)
print(f"Testing {model_name}")
print(f"Accuracy: {np.round(accuracy,2)}")
print(f"Precision: {np.round(precision,2)}")
print(f"Recall: {np.round(recall,2)}")



# Confusion matrix labelled with the class names stored in the label encoder.
plt.figure()
plt.title(f"{model_name} - Confusion Matrix")
sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_)
plt.show()

In [None]:
# Evaluate the linear SVM on the test set.
model_name  = 'SVM'
y_pred = svm.predict(X_test)

# LinearSVC has no predict_proba; its signed distances to the decision boundary
# are the continuous scores that ROC AUC needs.
y_score = svm.decision_function(X_test)


# Weighted averages account for the number of samples in each class.
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='weighted')
recall = recall_score(y_test_encoded, y_pred, average='weighted')

# ROC AUC must be computed from continuous scores: passing hard predicted labels
# collapses the ROC curve to a single operating point.
if y_score.ndim == 1:
    # Binary problem: one score per sample, for the class with the greater label.
    auc = roc_auc_score(y_test_encoded, y_score)
else:
    # Multiclass problem: macro-averaged one-vs-rest AUC, one column per class.
    auc = np.mean([
        roc_auc_score(y_test_encoded == class_label, y_score[:, column])
        for column, class_label in enumerate(svm.classes_)
    ])
cm = confusion_matrix(y_test_encoded, y_pred)

print("==="*15)
print(f"Testing {model_name}")
print(f"Accuracy: {np.round(accuracy,2)}")
print(f"Precision: {np.round(precision,2)}")
print(f"Recall: {np.round(recall,2)}")
print(f"AUC: {np.round(auc,2)}\n")


# Confusion matrix labelled with the class names stored in the label encoder.
plt.figure()
plt.title(f"{model_name} - Confusion Matrix")
sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_)
plt.show()

In [None]:
# Evaluate the Gaussian Naive Bayes model on the test set.
model_name  = 'Naive Bayes'
y_pred = nb.predict(X_test)
y_pred_proba = nb.predict_proba(X_test)


# Weighted averages account for the number of samples in each class.
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred, average='weighted')
recall = recall_score(y_test_encoded, y_pred, average='weighted')

# ROC AUC must be computed from the predicted probabilities: passing hard
# predicted labels collapses the ROC curve to a single operating point.
if y_pred_proba.shape[1] == 2:
    # Binary problem: use the probability of the class with the greater label.
    auc = roc_auc_score(y_test_encoded, y_pred_proba[:, 1])
else:
    # Multiclass problem: one-vs-rest AUC; `labels` ties each probability
    # column to the class it belongs to.
    auc = roc_auc_score(y_test_encoded, y_pred_proba, multi_class='ovr', labels=nb.classes_)
cm = confusion_matrix(y_test_encoded, y_pred)

print("==="*15)
print(f"Testing {model_name}")
print(f"Accuracy: {np.round(accuracy,2)}")
print(f"Precision: {np.round(precision,2)}")
print(f"Recall: {np.round(recall,2)}")
print(f"AUC: {np.round(auc,2)}\n")


# Confusion matrix labelled with the class names stored in the label encoder.
plt.figure()
plt.title(f"{model_name} - Confusion Matrix")
sns.heatmap(cm, annot=True, fmt="d", xticklabels=le.classes_, yticklabels=le.classes_)
plt.show()