# Import thư viện

In [2]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve,precision_recall_curve,precision_score,recall_score,f1_score

In [4]:

from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek

# Đọc dữ liệu đầu vào

## Dữ liệu train

In [5]:
data_path = '../../dataset'

In [6]:
with open(f'{data_path}/train.json') as f:
    data= json.load(f)

## Dữ liệu test

In [7]:
with open(f'{data_path}/test.json') as f:
    test_data= json.load(f)

# EAC dữ liệu

In [8]:
data= pd.DataFrame(data)
test_data= pd.DataFrame(test_data)

In [9]:
data.info()
EDA_data=data.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1195 entries, 0 to 1194
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   audio_embedding                  1195 non-null   object
 1   is_turkey                        1195 non-null   int64 
 2   vid_id                           1195 non-null   object
 3   end_time_seconds_youtube_clip    1195 non-null   int64 
 4   start_time_seconds_youtube_clip  1195 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 46.8+ KB


In [10]:
EDA_data['period']=EDA_data['end_time_seconds_youtube_clip']-EDA_data['start_time_seconds_youtube_clip']

In [11]:
EDA_data['is_turkey'].value_counts()

is_turkey
0    704
1    491
Name: count, dtype: int64

Dữ liệu có một chút thiên lệch

# Tiền xử lý dữ liệu

Dữ liệu đang bị thiên lệch, thử SMOTE

In [12]:
train_data, valid_data= train_test_split(data, test_size=0.2, random_state=42)

X_train= train_data.drop(columns=['is_turkey'])
y_train= train_data['is_turkey']


In [13]:


X_train_mean = np.stack(X_train['audio_embedding'].apply(lambda x: np.mean(x, axis=0)))

adasyn_sampler = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn_sampler.fit_resample(X_train_mean, y_train)


# smote_tomek_sampler = SMOTETomek(random_state=42)
# X_train_resampled, y_train_resampled = smote_tomek_sampler.fit_resample(X_train_mean, y_train)

print(f"Kích thước X_train_mean ban đầu: {X_train_mean.shape}")
print(f"Kích thước y_train ban đầu: {y_train.shape}")
print(f"Kích thước X_train_resampled sau ADASYN: {X_train_resampled.shape}")
print(f"Kích thước y_train_resampled sau ADASYN: {y_train_resampled.shape}")
print("\nSố lượng mẫu mỗi lớp sau ADASYN:")
print(y_train_resampled.value_counts())

Kích thước X_train_mean ban đầu: (956, 128)
Kích thước y_train ban đầu: (956,)
Kích thước X_train_resampled sau ADASYN: (1171, 128)
Kích thước y_train_resampled sau ADASYN: (1171,)

Số lượng mẫu mỗi lớp sau ADASYN:
is_turkey
1    590
0    581
Name: count, dtype: int64


In [14]:
y_train.value_counts()

is_turkey
0    581
1    375
Name: count, dtype: int64

In [15]:
y_train_resampled.value_counts()

is_turkey
1    590
0    581
Name: count, dtype: int64

SMOTE thành công

In [16]:
X_train_resampled = pd.DataFrame(X_train_resampled)
y_train_resampled = pd.DataFrame(y_train_resampled)

# Huấn luyện mô hình

In [20]:
valid_idx_test = test_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
test_X = np.stack(test_data.loc[valid_idx_test, 'audio_embedding'].apply(lambda x: np.mean(x, axis=0)))


valid_idx_val = valid_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
val_X = np.stack(valid_data.loc[valid_idx_val, 'audio_embedding'].apply(lambda x: np.mean(x, axis=0))) 

scaler = StandardScaler()
Z = scaler.fit_transform(X_train_resampled)
train_Z = Z 


val_Z = scaler.transform(val_X) 
test_Z = scaler.transform(test_X) 


train_Y = y_train_resampled 
val_Y = valid_data.loc[valid_idx_val, 'is_turkey'] 


model = RandomForestClassifier(
    n_estimators=600,
    max_depth=25,
    min_samples_split=3,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

model.fit(train_Z, train_Y)


print("--- Đánh giá trên tập Validation ---")
y_pred_prob = model.predict_proba(val_Z)[:, 1]
y_pred = model.predict(val_Z)

print("AUC Score   :", roc_auc_score(val_Y, y_pred_prob))
print("Accuracy    :", accuracy_score(val_Y, y_pred))
print("Precision   :", precision_score(val_Y, y_pred))
print("Recall      :", recall_score(val_Y, y_pred))
print("F1 Score    :", f1_score(val_Y, y_pred))

test_pred_prob = model.predict_proba(test_Z)[:, 1]


test_data['is_turkey'] = np.nan
test_data.loc[valid_idx_test, 'is_turkey'] = test_pred_prob
test_data[['vid_id', 'is_turkey']].to_csv('result.csv', index=False)


  return fit_method(estimator, *args, **kwargs)


--- Đánh giá trên tập Validation ---
AUC Score   : 0.9821979254275301
Accuracy    : 0.9205020920502092
Precision   : 0.944954128440367
Recall      : 0.8879310344827587
F1 Score    : 0.9155555555555556
