In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import os
import PIL

IMAGE_TRAIN_PATH = "./train/image"
VOICE_TRAIN_PATH = "./train/voice"
IMAGE_TEST_PATH = "./test/image/"
VOICE_TEST_PATH = "./test/voice/"

def read_image(path):
    return PIL.Image.open(path).convert('L')

In [3]:
# делаю таблицу
import pandas as pd

In [4]:
# картнки
def extract_image_features(image_folder):
    image_vectors = []
    indices = []
    for image_name in os.listdir(image_folder):
        index = os.path.splitext(image_name)[0]
        indices.append(index)
        image = read_image(os.path.join(image_folder, image_name))
        image_vector = np.array(image).reshape(-1)
        image_vectors.append(image_vector)
    return pd.DataFrame({
        'index': indices,
        'image_vector': image_vectors,
    })

In [5]:
image_features = extract_image_features(IMAGE_TRAIN_PATH)
image_features["target"] = image_features["index"].apply(lambda x: int(x[0]))
image_features = image_features.sort_values(by='target')

In [6]:
# голос
import librosa
def load_wav(path):
    return librosa.load(path, sr=8000)

def extract_voice_features(voice_folder, numcep=13, hop_length=256):
    voice_vectors = []
    indices = []
    for record_name in os.listdir(voice_folder):
        index = os.path.splitext(record_name)[0]
        indices.append(index)
        signal, sr = load_wav(
            os.path.join(voice_folder, record_name)
        )
        mfcc_features = librosa.feature.mfcc(
            signal, sr=sr, n_mfcc=numcep, hop_length=hop_length
        )
        voice_vector = mfcc_features.mean(axis=1)
        voice_vectors.append(voice_vector)
    return pd.DataFrame({
        'index': indices,
        'voice_vector': voice_vectors,
    })

In [7]:
voice_features = extract_voice_features(VOICE_TRAIN_PATH)
voice_features["target"] = voice_features['index'].apply(lambda x: int(x[0]))
voice_features = voice_features.sort_values(by='target')

In [8]:
image_features.head()

Unnamed: 0,index,image_vector,target
499,0_51,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
540,0_117,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
802,0_98,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
959,0_74,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1256,0_12,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [9]:
voice_features.head()

Unnamed: 0,index,voice_vector,target
361,0_69,"[-257.0925, 75.63635, -7.967024, -13.043361, -...",0
486,0_25,"[-343.9921, 73.030975, -19.955416, -7.9696946,...",0
484,0_31,"[-210.23732, 63.847366, 39.9681, 1.6842186, 1....",0
168,0_84,"[-190.51178, 60.189014, 30.737442, -4.575465, ...",0
169,0_90,"[-182.91739, 101.5228, 8.934057, 17.446121, -2...",0


In [10]:
dataset = pd.merge(image_features, voice_features, on='target', how='outer')

In [11]:
dataset

Unnamed: 0,index_x,image_vector,target,index_y,voice_vector
0,0_51,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0_69,"[-257.0925, 75.63635, -7.967024, -13.043361, -..."
1,0_51,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0_25,"[-343.9921, 73.030975, -19.955416, -7.9696946,..."
2,0_51,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0_31,"[-210.23732, 63.847366, 39.9681, 1.6842186, 1...."
3,0_51,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0_84,"[-190.51178, 60.189014, 30.737442, -4.575465, ..."
4,0_51,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0_90,"[-182.91739, 101.5228, 8.934057, 17.446121, -2..."
...,...,...,...,...,...
225121,9_96,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,9_84,"[-331.8819, 79.107414, -20.253765, -20.558043,..."
225122,9_96,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,9_146,"[-194.57497, 63.1638, -11.537321, -4.760928, 2..."
225123,9_96,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,9_90,"[-193.1624, 92.93113, 0.64686507, -22.801498, ..."
225124,9_96,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,9_132,"[-160.88858, 57.00657, 5.003337, -4.3809357, -..."


In [12]:
# тренировка
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [13]:
# train_data, test_data = train_test_split(
#     dataset, test_size=0.8, random_state=1
# )

In [14]:
X = np.hstack((
    np.vstack(dataset["voice_vector"]),
    np.vstack(dataset["image_vector"])
))
y = dataset['target']

In [15]:
# clf = RandomForestClassifier(n_estimators=500, max_depth = 9)

In [16]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(n_estimators=250, max_depth = 9)
scores = cross_val_score(
    clf, X, y, cv=5, scoring='accuracy')
print(scores)

[0.93719742 0.94583009 0.94300944 0.94971682 0.9546656 ]


In [17]:
# предсказание
X_image = extract_image_features(IMAGE_TEST_PATH)
X_voice = extract_voice_features(VOICE_TEST_PATH)
df = X_image.merge(
    X_voice, 
    on="index"
)
df["index"] = df["index"].apply(int)

In [18]:
df = df.sort_values(by='index')

In [19]:
X_test_finish = np.hstack((
    np.vstack(df["image_vector"]),
    np.vstack(df["voice_vector"])
))

In [20]:
clf = RandomForestClassifier(n_estimators=250, max_depth = 9)
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
y_pred = clf.predict(X_test_finish)

In [22]:
otvet = [['index', 'target']]
for i in range(len(y_pred)):
    otvet.append([i, y_pred[i]])

In [23]:
import csv
with open('output.csv', 'w') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerows(otvet)