In [1]:
import os
import PIL
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
import pandas as pd
import librosa
import librosa.display

# Data locations, relative to the notebook's working directory.
# "train" folders hold the labelled samples; "test" folders hold the samples
# that predictions are generated for.
IMAGE_TRAIN_PATH = "./data/train/image"
VOICE_TRAIN_PATH = "./data/train/voice"
# The trailing slash on the test paths is harmless: file paths are built with
# os.path.join, which copes with either form.
IMAGE_TEST_PATH = "./data/test/image/"
VOICE_TEST_PATH = "./data/test/voice/"

def read_image(path):
    """Load the image stored at ``path`` as an 8-bit grayscale PIL image.

    Parameters
    ----------
    path : str
        Path of an image file that Pillow can open.

    Returns
    -------
    PIL.Image.Image
        A new image in mode ``'L'`` that no longer depends on the open file.
    """
    # Import the submodule explicitly: a bare ``import PIL`` does not load
    # ``PIL.Image``, so ``PIL.Image.open`` only works when some other library
    # happens to have imported it first.
    from PIL import Image

    # ``Image.open`` is lazy and keeps the file open; the context manager
    # guarantees the handle is released once the pixels have been converted.
    with Image.open(path) as image:
        return image.convert('L')
def extract_image_features(image_folder):
    """Build a table of flattened grayscale pixel vectors for a folder of images.

    Parameters
    ----------
    image_folder : str
        Directory in which every entry is an image readable by ``read_image``.

    Returns
    -------
    pandas.DataFrame
        One row per file with two columns: ``index`` (the file name without
        its extension) and ``image_vector`` (the image's pixels flattened to a
        1-D array). Rows are ordered by file name.
    """
    indices = []
    image_vectors = []
    # os.listdir returns entries in arbitrary, platform-dependent order. Sort
    # them so the row order -- and therefore the seeded train/test split built
    # from this table -- is the same on every run and every machine.
    for image_name in sorted(os.listdir(image_folder)):
        indices.append(os.path.splitext(image_name)[0])
        image = read_image(os.path.join(image_folder, image_name))
        # Flatten the 2-D pixel grid into a single feature vector.
        image_vectors.append(np.array(image).reshape(-1))
    return pd.DataFrame({
        'index': indices,
        'image_vector': image_vectors,
    })
def extract_voice_features(voice_folder, numcep=13, hop_length=256):
    """Build a table of time-averaged MFCC vectors for a folder of recordings.

    Parameters
    ----------
    voice_folder : str
        Directory in which every entry is an audio file readable by
        ``load_wav``.
    numcep : int, optional
        Number of MFCC coefficients to compute per frame (``n_mfcc``).
    hop_length : int, optional
        Hop length, in samples, between successive analysis frames.

    Returns
    -------
    pandas.DataFrame
        One row per file with two columns: ``index`` (the file name without
        its extension) and ``voice_vector`` (the mean of each MFCC coefficient
        over all frames). Rows are ordered by file name.
    """
    indices = []
    voice_vectors = []
    # Sort the listing: os.listdir order is arbitrary, and a stable row order
    # keeps downstream results reproducible.
    for record_name in sorted(os.listdir(voice_folder)):
        indices.append(os.path.splitext(record_name)[0])
        signal, sr = load_wav(os.path.join(voice_folder, record_name))
        # The waveform must be passed as ``y=``: librosa >= 0.10 accepts audio
        # arguments by keyword only, and the keyword form also works on older
        # releases.
        mfcc_features = librosa.feature.mfcc(
            y=signal, sr=sr, n_mfcc=numcep, hop_length=hop_length
        )
        # Collapse the frame axis so every recording yields a fixed-length
        # vector regardless of its duration.
        voice_vectors.append(mfcc_features.mean(axis=1))
    return pd.DataFrame({
        'index': indices,
        'voice_vector': voice_vectors,
    })


def load_wav(path, sr=8000):
    """Load an audio file through ``librosa.load`` at a fixed sample rate.

    Parameters
    ----------
    path : str
        Path of the audio file to read.
    sr : int, optional
        Target sample rate handed to ``librosa.load``. Defaults to 8000, the
        rate this notebook has always used.

    Returns
    -------
    tuple
        Whatever ``librosa.load`` returns; callers unpack it as
        ``(signal, sample_rate)``.
    """
    return librosa.load(path, sr=sr)

In [None]:
# !unzip ./data/test.zip -d ./data/test
# !unzip ./data/train.zip -d ./data/train

In [2]:
# --- Labelled training data ---------------------------------------------
# Extract each modality separately, then pair rows that share a file stem.
voice_features = extract_voice_features(VOICE_TRAIN_PATH)
image_features = extract_image_features(IMAGE_TRAIN_PATH)
dataset = pd.merge(image_features, voice_features, on='index')
# The class label is read from the first character of the file stem.
dataset["target"] = dataset["index"].map(lambda stem: int(stem[0]))

# --- Unlabelled submission data -----------------------------------------
submit_image_features = extract_image_features(IMAGE_TEST_PATH)
submit_voice_features = extract_voice_features(VOICE_TEST_PATH)
submit_dataset = pd.merge(
    submit_image_features, submit_voice_features, on="index"
)
# File stems are converted to integers so the ids compare numerically.
submit_dataset["index"] = submit_dataset["index"].map(int)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [4]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# shuffle repeatable for a given row order.
train_data, test_data = train_test_split(dataset, random_state=42, test_size=0.2)

In [5]:
# Feature matrix and labels for ALL labelled rows: voice features first, then
# image pixels, one row per sample.
X = np.hstack((
    np.vstack(dataset["voice_vector"]),
    np.vstack(dataset["image_vector"])
))
# Only Y is bound here. y_test is deliberately left alone: that name belongs
# to the hold-out labels, and rebinding it to the full label column would make
# the hold-out scoring fail (length mismatch) whenever this cell is re-run
# after the split matrices have been built.
Y = dataset["target"]

In [6]:
# Training split: stack each modality into a 2-D array, then join them
# column-wise (voice features first, image pixels second).
X_train = np.concatenate(
    [np.vstack(train_data[column]) for column in ("voice_vector", "image_vector")],
    axis=1,
)
y_train = train_data["target"]

# Hold-out split, assembled with the same column layout.
X_test = np.concatenate(
    [np.vstack(test_data[column]) for column in ("voice_vector", "image_vector")],
    axis=1,
)
y_test = test_data["target"]

In [7]:
# Fit gradient-boosted trees on the training split only, then report accuracy
# on the training and hold-out splits (in that order).
mdl = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.15,
    random_state=123,
)
mdl.fit(X_train, y_train)
print(mdl.score(X_train, y_train), mdl.score(X_test, y_test))

1.0 0.96


In [8]:
# Feature matrix for the submission rows, using the same column layout as the
# training matrices (voice features first, image pixels second).
Z = np.concatenate(
    [np.vstack(submit_dataset[column]) for column in ("voice_vector", "image_vector")],
    axis=1,
)

In [9]:
# Refit the same configuration on every labelled row, then predict a class for
# each submission row.
mdl = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.15,
    random_state=123,
)
mdl.fit(X, Y)
pred_mdl = mdl.predict(Z)

In [10]:
# Attach the predicted class to each submission row. This adds (or, on a
# re-run, overwrites) the "target" column of submit_dataset in place.
submit_dataset["target"] =pred_mdl

In [11]:
# Write the submission file: one (index, target) row per sample, ordered by
# index, without the DataFrame's own row labels.
(
    submit_dataset
    .loc[:, ["index", "target"]]
    .sort_values(by="index")
    .to_csv("hw.csv", index=False)
)