In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, roc_curve,precision_recall_curve,precision_score,recall_score,f1_score

In [3]:

from imblearn.over_sampling import ADASYN
from imblearn.combine import SMOTETomek

# Đọc dữ liệu đầu vào

In [4]:
data_path = '../../dataset'

In [5]:
with open(f'{data_path}/train.json') as f:
    data= json.load(f)

## Dữ liệu test

In [6]:
with open(f'{data_path}/test.json') as f:
    test_data= json.load(f)

In [7]:
data= pd.DataFrame(data)
test_data= pd.DataFrame(test_data)

In [8]:
data.info()
EDA_data=data.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1195 entries, 0 to 1194
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   audio_embedding                  1195 non-null   object
 1   is_turkey                        1195 non-null   int64 
 2   vid_id                           1195 non-null   object
 3   end_time_seconds_youtube_clip    1195 non-null   int64 
 4   start_time_seconds_youtube_clip  1195 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 46.8+ KB


In [9]:
EDA_data['period']=EDA_data['end_time_seconds_youtube_clip']-EDA_data['start_time_seconds_youtube_clip']

In [10]:
EDA_data['is_turkey'].value_counts()

is_turkey
0    704
1    491
Name: count, dtype: int64

# Tiền xử lý dữ liệu

In [11]:
trimed_data=data.copy()

In [12]:
trimed_data['frames']=trimed_data['audio_embedding'].str.len()

In [None]:
def trim_audio_embedding(row, only_is_turkey, min_length, start_trim, end_trim):
    if row['frames'] > min_length and (not only_is_turkey or row['is_turkey'] == 1):
        trimmed = row['audio_embedding'][start_trim:len(row['audio_embedding'])-end_trim]
        return trimmed
    return row['audio_embedding']

# Example usage: only trim rows where is_turkey==1 and frames > 5, trim from second 1 to second l-1

start_trim=1
end_trim=1 
min_length=5
only_is_turkey=True

trimed_data['audio_embedding'] = trimed_data.apply(
    lambda row: trim_audio_embedding(row, only_is_turkey, min_length, start_trim, end_trim), axis=1
)

Cắt bỏ giây đầu và giây cuối của video có độ dài lớn hơn 5

### Tách frames

In [14]:
def expand_audio_embeddings(data):
    expanded_rows = []
    for idx, row in data.iterrows():
        embeddings = row['audio_embedding']
        for emb in embeddings:
            new_row = row.copy()
            new_row['audio_embedding'] = emb
            expanded_rows.append(new_row)
    expanded_data = pd.DataFrame(expanded_rows)
    expanded_data.reset_index(drop=True, inplace=True)
    return expanded_data


In [15]:
train_data=expand_audio_embeddings(trimed_data)

In [16]:
train_data.shape

(10821, 6)

In [17]:
train_data.head(12)

Unnamed: 0,audio_embedding,is_turkey,vid_id,end_time_seconds_youtube_clip,start_time_seconds_youtube_clip,frames
0,"[172, 34, 216, 110, 208, 46, 95, 66, 161, 125,...",0,kDCk3hLIVXo,70,60,10
1,"[171, 39, 199, 121, 238, 62, 59, 61, 170, 146,...",0,kDCk3hLIVXo,70,60,10
2,"[169, 33, 200, 97, 210, 22, 73, 51, 169, 129, ...",0,kDCk3hLIVXo,70,60,10
3,"[180, 39, 218, 118, 213, 73, 80, 43, 160, 147,...",0,kDCk3hLIVXo,70,60,10
4,"[166, 31, 204, 134, 211, 59, 62, 73, 187, 167,...",0,kDCk3hLIVXo,70,60,10
5,"[160, 20, 186, 121, 211, 43, 44, 91, 235, 161,...",0,kDCk3hLIVXo,70,60,10
6,"[160, 16, 194, 88, 208, 65, 95, 80, 211, 129, ...",0,kDCk3hLIVXo,70,60,10
7,"[164, 27, 201, 96, 215, 51, 89, 71, 189, 135, ...",0,kDCk3hLIVXo,70,60,10
8,"[159, 21, 178, 86, 204, 62, 69, 48, 210, 168, ...",0,kDCk3hLIVXo,70,60,10
9,"[165, 33, 188, 93, 234, 92, 107, 66, 193, 190,...",0,kDCk3hLIVXo,70,60,10


In [18]:
test_data=expand_audio_embeddings(test_data)  # No trimming for test data

In [19]:
# Chỉ chọn test_data có embedding hợp lệ
valid_idx = test_data['audio_embedding'].apply(lambda x: isinstance(x, list) and len(x) > 0)
train_X = np.vstack(train_data['audio_embedding'].values)
train_Y = train_data['is_turkey'].values
test_X = np.vstack(test_data.loc[valid_idx, 'audio_embedding'].values)

scaler = StandardScaler()
Z = scaler.fit_transform(train_X)
test_Z = scaler.transform(test_X)

train_Z, val_Z, train_Y, val_Y = train_test_split(Z, train_Y, test_size=0.3, random_state=42)

model = RandomForestClassifier(
    n_estimators=600,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

model.fit(train_Z, train_Y)

y_pred_prob = model.predict_proba(val_Z)[:, 1]
y_pred = model.predict(val_Z)

print("AUC Score   :", roc_auc_score(val_Y, y_pred_prob))
print("Accuracy    :", accuracy_score(val_Y, y_pred))
print("Precision   :", precision_score(val_Y, y_pred))
print("Recall      :", recall_score(val_Y, y_pred))
print("F1 Score    :", f1_score(val_Y, y_pred))

test_pred_prob = model.predict_proba(test_Z)[:, 1]

test_data['is_turkey'] = np.nan
test_data.loc[valid_idx, 'is_turkey'] = test_pred_prob

test_data[['vid_id', 'is_turkey']].to_csv('result.csv', index=False)


AUC Score   : 0.9639487485160597
Accuracy    : 0.8983677240529719
Precision   : 0.89058039961941
Recall      : 0.8132059079061685
F1 Score    : 0.8501362397820164
