In [1]:
import numpy as np
import pandas as pd

train_data = pd.read_csv("train_set.csv", index_col=None)
test_data = pd.read_csv("test_set.csv", index_col=None)

X_train = train_data.drop(columns="truth")
y_train = train_data[['truth']]

X_test = test_data.drop(columns="truth")
y_test = test_data[['truth']]

X_train.drop(columns=['Unnamed: 0', 'enroll_id'], inplace=True)
X_test.drop(columns=['Unnamed: 0', 'enroll_id'], inplace=True)

In [2]:
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Xác định các cột phân loại và cột số
categorical_columns = ['gender', 'education']  # Các cột phân loại
numerical_columns = [col for col in X_train.columns if col not in categorical_columns]  # Các cột số

# Mã hóa các cột phân loại
for col in categorical_columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])

for col in categorical_columns:
    le = LabelEncoder()
    X_test[col] = le.fit_transform(X_test[col])

# Chuẩn hóa các cột số
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.fit_transform(X_test[numerical_columns])

In [3]:
X_train['education'].unique()

array([0, 4, 1, 2, 3, 5, 6], dtype=int64)

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate
from tensorflow.keras.models import Model

# Kích thước embedding cho các đặc trưng phân loại
embedding_size_gender = 2  
embedding_size_education = 7 

# Đầu vào cho các đặc trưng phân loại
input_gender = Input(shape=(1,), name='gender_input')
input_education = Input(shape=(1,), name='education_input')

# Embedding layers cho các đặc trưng phân loại
embedding_gender = Embedding(input_dim=2, output_dim=embedding_size_gender)(input_gender)
embedding_education = Embedding(input_dim=7, output_dim=embedding_size_education)(input_education)

# Làm phẳng các vector nhúng
flat_gender = Flatten()(embedding_gender)
flat_education = Flatten()(embedding_education)

# Đầu vào cho các đặc trưng số
input_numerical = Input(shape=(len(numerical_columns),), name='numerical_input')

# Kết hợp tất cả các đầu vào
combined = Concatenate()([flat_gender, flat_education, input_numerical])

# Xây dựng DNN
x = Dense(64, activation='relu')(combined)
x = Dense(32, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)  # Sigmoid cho bài toán nhị phân

# Tạo mô hình
model = Model(inputs=[input_gender, input_education, input_numerical], outputs=output)

# Compile mô hình
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# In cấu trúc mô hình
model.summary()

In [5]:
X_train[numerical_columns]

Unnamed: 0,age,prev_dropout_num,previous_enroll_num,previous_dropout_user_num,duration,session_num,video_num,courseware_num
0,1.064198,-0.386941,-0.960571,-1.012355,0.162092,-0.484773,-0.124202,-0.440631
1,0.888556,-0.386941,-0.971948,-1.019790,-0.966892,-0.484773,-0.160236,-0.416798
2,0.712914,-0.386941,-0.973844,-1.019790,-0.134035,-0.484773,-0.128902,-0.392965
3,-0.516583,-0.386941,-0.983326,-1.032181,-0.134035,-0.484773,-0.058401,-0.428714
4,-0.516583,-0.386941,-0.970052,-1.022268,0.162092,-0.484773,-0.161803,-0.440631
...,...,...,...,...,...,...,...,...
157938,1.415483,0.195211,-0.283611,-0.211897,-0.892860,1.801034,1.304617,0.524605
157939,5.630899,3.979200,-0.287403,-0.211897,-0.892860,3.705873,0.410038,1.549423
157940,0.537271,-0.386941,-0.268441,-0.199506,-0.892860,2.562970,1.287384,5.887026
157941,0.361629,-0.386941,-0.264648,-0.199506,-0.892860,4.467809,0.408472,3.956554


In [7]:
# Huấn luyện mô hình
history = model.fit(
    [X_train['gender'], X_train['education'], X_train[numerical_columns]],  # Đầu vào
    y_train.values.ravel(),  # Nhãn
    epochs=20,
    batch_size=32,
    validation_split=0.2
)

# Đánh giá mô hình trên tập kiểm tra


Epoch 1/20
[1m3949/3949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 898us/step - accuracy: 0.8654 - loss: 0.3369 - val_accuracy: 0.8102 - val_loss: 0.4851
Epoch 2/20
[1m3949/3949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 854us/step - accuracy: 0.8669 - loss: 0.3355 - val_accuracy: 0.8112 - val_loss: 0.4666
Epoch 3/20
[1m3949/3949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 846us/step - accuracy: 0.8661 - loss: 0.3361 - val_accuracy: 0.8088 - val_loss: 0.4973
Epoch 4/20
[1m3949/3949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 816us/step - accuracy: 0.8680 - loss: 0.3339 - val_accuracy: 0.8096 - val_loss: 0.4846
Epoch 5/20
[1m3949/3949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 803us/step - accuracy: 0.8688 - loss: 0.3312 - val_accuracy: 0.8096 - val_loss: 0.4703
Epoch 6/20
[1m3949/3949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 828us/step - accuracy: 0.8674 - loss: 0.3334 - val_accuracy: 0.8099 - val_loss: 0.4805
Epoc

In [8]:
results = model.evaluate([X_test['gender'], X_test['education'], X_test[numerical_columns]], y_test.values.ravel())
print("Test Loss, Test Accuracy:", results)

[1m2116/2116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 596us/step - accuracy: 0.8672 - loss: 0.3370
Test Loss, Test Accuracy: [0.371275395154953, 0.8506329655647278]
