In [1]:
import pandas as pd

column_names = ['Image', 'Attention']

df1 = pd.read_csv('attention.csv', header=None, names=column_names)
df2 = pd.read_csv('./featurebycolor/color_stats.csv')
df3 = pd.read_csv('./featurebycolor/contrast_features.csv')
df4 = pd.read_csv('./featurebyshape/shape_features.csv')

In [2]:
import os

df1.rename(columns={'Image': 'image_name'}, inplace=True)
df1['image_name'] = df1['image_name'].apply(lambda x: int(os.path.splitext(x.split('/')[-1])[0]))
df4['image_name'] = df4['image_name'].apply(lambda x: int(os.path.splitext(x.split('/')[-1])[0]))


In [3]:
# Menggabungkan df1 dan df2
merged_df = pd.merge(df1, df2, on='image_name', how='inner')

# Menggabungkan hasilnya dengan df3
merged_df = pd.merge(merged_df, df3, on='image_name', how='inner')

# Menggabungkan hasilnya dengan df4
merged_df = pd.merge(merged_df, df4, on='image_name', how='inner')


In [4]:
merged_df = merged_df.dropna()

In [5]:
import ast

def expand_list_column(df, list_column):
    # Konversi string menjadi list
    df[list_column] = df[list_column].apply(ast.literal_eval)
    
    # Pisahkan list menjadi kolom terpisah dan tambahkan ke dataframe
    for i in range(8):  # Asumsi list panjangnya 8
        df[f'{list_column}_col_{i+1}'] = df[list_column].apply(lambda x: x[i] if len(x) > i else None)

    # Hapus kolom asli jika tidak diperlukan
    df = df.drop(columns=[list_column])
    
    return df

# Terapkan fungsi
merged_df = expand_list_column(merged_df, 'edge_orientation_histogram')
merged_df = merged_df.drop(columns=['image_name'])


In [6]:
merged_df = merged_df.dropna()

In [7]:
from sklearn.preprocessing import LabelEncoder

# Kolom kategorikal yang perlu diproses
categorical_columns = ['dominant_text_position', 'font_variety_estimation', 'readability']

# Label Encoding untuk kolom yang memiliki urutan
label_encoder = LabelEncoder()

# Lakukan Label Encoding pada kolom kategorikal dan ubah menjadi tipe int
for col in categorical_columns:
    merged_df[col] = label_encoder.fit_transform(merged_df[col].astype(str))

# Pastikan tipe data kolom menjadi int
merged_df[categorical_columns] = merged_df[categorical_columns].astype(int)


In [8]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Pisahkan fitur dan target
X = merged_df.drop(columns=['Attention'])
y = merged_df['Attention']

# Membagi data menjadi data latih dan data uji (80% latih, 20% uji)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Distribusi kelas:", y_train.value_counts())


Distribusi kelas: Attention
4    83
1    49
2    44
3    37
0    25
Name: count, dtype: int64


In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Inisialisasi model XGBoost
xgb = XGBClassifier(random_state=42, enable_categorical=False)  # disable categorical mode

# Tentukan distribusi hyperparameters yang ingin diuji
param_dist = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [3, 6, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.3, 0.5],
    'min_child_weight': [1, 3, 5],
    'scale_pos_weight': [1, 5, 10],  # Menyeimbangkan kelas yang tidak seimbang
}

# Gunakan RandomizedSearchCV untuk mencari kombinasi hyperparameter terbaik
random_search = RandomizedSearchCV(
    estimator=xgb, 
    param_distributions=param_dist, 
    n_iter=50,  # Jumlah iterasi pencarian
    cv=5,  # Cross-validation
    n_jobs=-1,  # Gunakan semua core CPU
    scoring='accuracy', 
    verbose=2,
    random_state=42
)

# Latih model dengan RandomizedSearchCV
random_search.fit(X_train, y_train)

# Tampilkan hyperparameter terbaik yang ditemukan
print("Best Hyperparameters:", random_search.best_params_)

# Gunakan model terbaik yang ditemukan untuk prediksi
best_xgb_random = random_search.best_estimator_
y_pred = best_xgb_random.predict(X_test)

# Evaluasi model
print("Akurasi:", accuracy_score(y_test, y_pred))
print("Laporan Klasifikasi:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Hyperparameters: {'subsample': 0.9, 'scale_pos_weight': 5, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 15, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Akurasi: 0.4166666666666667
Laporan Klasifikasi:
               precision    recall  f1-score   support

           0       0.50      0.25      0.33         4
           1       0.09      0.17      0.12         6
           2       0.33      0.30      0.32        10
           3       0.14      0.10      0.12        10
           4       0.61      0.63      0.62        30

    accuracy                           0.42        60
   macro avg       0.34      0.29      0.30        60
weighted avg       0.43      0.42      0.42        60



In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Inisialisasi model Random Forest
rf = RandomForestClassifier(random_state=42)

# Tentukan distribusi hyperparameters yang ingin diuji
param_dist = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced']  # Menyeimbangkan kelas jika tidak seimbang
}

# Gunakan RandomizedSearchCV untuk mencari kombinasi hyperparameter terbaik
random_search = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_dist, 
    n_iter=50,  # Jumlah iterasi pencarian
    cv=5,  # Cross-validation
    n_jobs=-1,  # Gunakan semua core CPU
    scoring='accuracy', 
    verbose=2,
    random_state=42
)

# Latih model dengan RandomizedSearchCV
random_search.fit(X_train, y_train)

# Tampilkan hyperparameter terbaik yang ditemukan
print("Best Hyperparameters:", random_search.best_params_)

# Gunakan model terbaik yang ditemukan untuk prediksi
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Evaluasi model
print("Akurasi:", accuracy_score(y_test, y_pred))
print("Laporan Klasifikasi:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 30, 'class_weight': None, 'bootstrap': True}
Akurasi: 0.5166666666666667
Laporan Klasifikasi:
               precision    recall  f1-score   support

           0       0.50      0.25      0.33         4
           1       0.33      0.33      0.33         6
           2       0.36      0.40      0.38        10
           3       0.00      0.00      0.00        10
           4       0.59      0.80      0.68        30

    accuracy                           0.52        60
   macro avg       0.36      0.36      0.34        60
weighted avg       0.42      0.52      0.46        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Inisialisasi model KNN
knn = KNeighborsClassifier()

# Tentukan distribusi hyperparameters
param_dist = {
    'n_neighbors': list(range(1, 31)),  # Jumlah tetangga
    'weights': ['uniform', 'distance'],  # Bobot
    'metric': ['euclidean', 'manhattan', 'minkowski']  # Metode pengukuran jarak
}

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    verbose=2,
    random_state=42
)

# Latih model
random_search.fit(X_train, y_train)

# Tampilkan hyperparameter terbaik
print("Best Hyperparameters:", random_search.best_params_)

# Gunakan model terbaik untuk prediksi
best_knn = random_search.best_estimator_
y_pred = best_knn.predict(X_test)

# Evaluasi
print("Akurasi:", accuracy_score(y_test, y_pred))
print("Laporan Klasifikasi:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'weights': 'distance', 'n_neighbors': 25, 'metric': 'manhattan'}
Akurasi: 0.43333333333333335
Laporan Klasifikasi:
               precision    recall  f1-score   support

           0       0.17      0.25      0.20         4
           1       0.14      0.17      0.15         6
           2       0.10      0.10      0.10        10
           3       0.00      0.00      0.00        10
           4       0.66      0.77      0.71        30

    accuracy                           0.43        60
   macro avg       0.21      0.26      0.23        60
weighted avg       0.37      0.43      0.40        60



In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Inisialisasi model MLPClassifier
mlp = MLPClassifier(random_state=42)

# Tentukan distribusi hyperparameters yang ingin diuji
param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (150,), (200,), (500), (1000)],  # Jumlah neuron di setiap layer
    'activation': ['relu', 'tanh'],  # Fungsi aktivasi
    'solver': ['adam', 'sgd'],  # Optimizer yang digunakan
    'alpha': [0.0001, 0.001, 0.01],  # Regularisasi
    'learning_rate': ['constant', 'invscaling', 'adaptive'],  # Laju pembelajaran
    'max_iter': [200, 300, 500, 1000, 2000]  # Iterasi maksimum untuk pelatihan
}

# Gunakan RandomizedSearchCV untuk mencari kombinasi hyperparameter terbaik
random_search = RandomizedSearchCV(
    estimator=mlp, 
    param_distributions=param_dist, 
    n_iter=50,  # Jumlah iterasi pencarian
    cv=5,  # Cross-validation
    n_jobs=-1,  # Gunakan semua core CPU
    scoring='accuracy', 
    verbose=2,
    random_state=42
)

# Latih model dengan RandomizedSearchCV
random_search.fit(X_train, y_train)

# Tampilkan hyperparameter terbaik yang ditemukan
print("Best Hyperparameters:", random_search.best_params_)

# Gunakan model terbaik yang ditemukan untuk prediksi
best_mlp_random = random_search.best_estimator_
y_pred = best_mlp_random.predict(X_test)

# Evaluasi model
print("Akurasi:", accuracy_score(y_test, y_pred))
print("Laporan Klasifikasi:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'solver': 'sgd', 'max_iter': 300, 'learning_rate': 'constant', 'hidden_layer_sizes': (50,), 'alpha': 0.0001, 'activation': 'tanh'}
Akurasi: 0.48333333333333334
Laporan Klasifikasi:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00        10
           3       0.00      0.00      0.00        10
           4       0.49      0.97      0.65        30

    accuracy                           0.48        60
   macro avg       0.10      0.19      0.13        60
weighted avg       0.25      0.48      0.33        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
