# Eksperimen dengan dataset publik

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the dataset and relabel its columns with the names used below.
data = 'dataset/adult.csv'
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income',
]
df = pd.read_csv(data)
# NOTE(review): read_csv uses the first row of the file as a header before
# it is overwritten here -- confirm the CSV really starts with a header row.
df.columns = columns

# Preprocessing: one-hot encode the non-numeric columns.
categorical_columns = [
    'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'native_country',
]
df = pd.get_dummies(df, columns=categorical_columns)

# Discard the column that is not needed.
df = df.drop(columns=['fnlwgt'])

# Features fed to the model; all of them are original numeric columns, so
# none of the one-hot columns created above are part of X.
selected_features = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# Only run the experiment when the target column survived preprocessing.
if 'income' not in df.columns:
    print("Column 'income' is not present in the DataFrame after preprocessing. Check preprocessing steps.")
else:
    X = df[selected_features]
    y = df['income']

    # Repeat the train/evaluate cycle for several hold-out proportions.
    results = []
    test_sizes = [0.2, 0.3, 0.5, 0.4, 0.6]
    for test_size in test_sizes:
        # Fixed random_state keeps every split reproducible between runs.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Fit Gaussian Naive Bayes on the training part.
        model = GaussianNB().fit(X_train, y_train)

        # Score the predictions on the held-out part.
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        error = 1 - accuracy

        results.append((test_size, accuracy, error))

    # Report accuracy as a percentage and error as a fraction, one row per
    # test size, in the order the sizes were tried.
    print("Test_size, Akurasi, Error")
    for size, acc, err in results:
        print("{}, {:.2f}%, {:.2f}".format(size, acc * 100, err))


Test_size, Akurasi, Error
0.2, 79.99%, 0.20
0.3, 80.01%, 0.20
0.5, 79.85%, 0.20
0.4, 79.89%, 0.20
0.6, 79.67%, 0.20


# Tingkat Akurasi Terbaik

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the dataset and relabel its columns with the names used below.
data = 'dataset/adult.csv'
df = pd.read_csv(data)
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
          'marital_status', 'occupation', 'relationship', 'race', 'sex',
          'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
          'income']
# NOTE(review): read_csv uses the first row of the file as a header before
# it is overwritten here -- confirm the CSV really starts with a header row.
df.columns = columns  # Set column names

# Preprocessing
# Convert the non-numeric columns to numeric dummy variables.
df = pd.get_dummies(df, columns=['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])

# Drop the column that is not needed.
df.drop(['fnlwgt'], axis=1, inplace=True)

# Features fed to the model (all original numeric columns).
selected_features = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# Only proceed when the target column is present after preprocessing.
if 'income' in df.columns:
    X = df[selected_features]
    y = df['income']

    # Score every candidate hold-out proportion so the "best" result is
    # actually selected from the data rather than hard-coded.
    test_sizes = [0.2, 0.3, 0.5, 0.4, 0.6]
    scores = {}
    for candidate in test_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=candidate, random_state=42
        )
        model = GaussianNB()
        model.fit(X_train, y_train)
        scores[candidate] = accuracy_score(y_test, model.predict(X_test))

    # On a tie, max() keeps the candidate that appears first in test_sizes.
    test_size = max(scores, key=scores.get)

    # Rebuild the winning split (same random_state, so it is the same split
    # that was scored above) so that model, y_pred, accuracy and error all
    # describe the best configuration once the cell finishes.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Initialise and train the Naive Bayes model.
    model = GaussianNB()
    model.fit(X_train, y_train)

    # Predict on the test data.
    y_pred = model.predict(X_test)

    # Compute accuracy and the complementary error rate.
    accuracy = accuracy_score(y_test, y_pred)
    error = 1 - accuracy

    # Store the best result as (test_size, accuracy, error).
    best_result = (test_size, accuracy, error)

    # Show the best result: accuracy as a percentage, error as a fraction.
    print("Test_size, Akurasi, Error")
    print("{}, {:.2f}%, {:.2f}".format(best_result[0], best_result[1] * 100, best_result[2]))
else:
    print("Column 'income' is not present in the DataFrame after preprocessing. Check preprocessing steps.")


Test_size, Akurasi, Error
0.3, 80.01%, 0.20
