In [38]:
# FaceMesh landmark indices, grouped by the facial trait each group describes.
pack_identify = dict(
    lips=[13, 14, 78, 191, 17, 84, 87, 88],   # full / thin lips
    mouth=[61, 291, 13, 14],                  # large / small mouth
    nose=[1, 6, 168, 97, 326],                # long/wide vs. short/narrow nose
)


In [39]:
import cv2
import pandas as pd
import numpy as np
import mediapipe as mp
import os
import re

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import joblib

# Dataset root directories (relative paths)
DATASET_BINARY_DATA = "../binary_data/"
DATASET_ALL_DATA = "../all_data/"

# One shared MediaPipe FaceMesh detector, configured for still images
# and at most one face per image
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=1,
    static_image_mode=True,
)

# Landmark indices used as features, grouped by facial trait
pack_identify = {
    "lips": [13, 14, 78, 191, 17, 84, 87, 88],   # full / thin lips
    "mouth": [61, 291, 13, 14],                  # large / small mouth
    "nose": [1, 6, 168, 97, 326],                # long/wide vs. short/narrow nose
}

# Union of every index above, de-duplicated and in ascending order
all_indices = sorted(
    {idx for group in ("lips", "mouth", "nose") for idx in pack_identify[group]}
)

# Column names: an x/y pair per landmark, in the same order as all_indices
columns = [f"{axis}{idx}" for idx in all_indices for axis in ("x", "y")]

def numeric_key(name):
    """Return a natural-sort key for ``name``.

    The name is split on runs of digits; digit runs become ``int`` so that
    e.g. ``"9"`` sorts before ``"10"``, while everything else stays a string.

    Args:
        name: The string (here: a directory name) to build a key for.

    Returns:
        A list alternating ``str`` and ``int`` items, starting with a
        (possibly empty) string.
    """
    tokens = re.split(r'(\d+)', name)
    # With a capturing group, re.split puts the captured digit runs at the odd
    # positions. Deciding by position instead of str.isdigit() avoids calling
    # int() on characters such as "²", which isdigit() accepts but int() rejects.
    return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

# Normalise coordinates around the midpoint between the eyes
def normalize_landmarks(points):
    """Centre the landmarks on the eye midpoint and scale by the eye distance.

    Rows 33 and 263 of ``points`` are used as the two eye reference points.
    The input is copied to float32 first, so the caller's array is not
    modified. If the two eye points coincide, the result is only centred,
    not scaled.

    Args:
        points: Array of 2-D landmark coordinates with at least 264 rows.

    Returns:
        A float32 array of the same shape as ``points``.
    """
    normalized = points.astype(np.float32)

    # Shift so that the midpoint between the two eye points is the origin
    eye_midpoint = (normalized[33] + normalized[263]) / 2
    normalized -= eye_midpoint

    # Scale by the eye-to-eye distance, unless it is zero
    inter_eye = np.linalg.norm(normalized[33] - normalized[263])
    if inter_eye > 0:
        normalized /= inter_eye
    return normalized

# Feature extraction for a single image
def get_landmarks(image_path):
    """Return the flattened, normalised coordinates of the selected landmarks.

    The file is read as raw bytes and decoded with ``cv2.imdecode``, converted
    from BGR to RGB and passed to the module-level ``face_mesh`` detector.
    The first detected face's landmark x/y values are multiplied by the image
    width/height, normalised with ``normalize_landmarks`` and reduced to the
    rows listed in ``all_indices``.

    Args:
        image_path: Path of the image file to process.

    Returns:
        A 1-D array ``[x, y, x, y, ...]`` in ``all_indices`` order, or ``None``
        when the detector reports no face.

    Raises:
        ValueError: If the file contents cannot be decoded as an image.
    """
    raw_bytes = np.fromfile(image_path, dtype=np.uint8)
    bgr = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
    if bgr is None:
        raise ValueError("Изображение не загружено!")

    detection = face_mesh.process(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    faces = detection.multi_face_landmarks
    if not faces:
        return None

    height, width, _ = bgr.shape
    pixel_points = np.array(
        [[lm.x * width, lm.y * height] for lm in faces[0].landmark]
    )

    # Normalise, then keep only the landmarks used as features
    return normalize_landmarks(pixel_points)[all_indices].flatten()

# Dataset assembly
def build_dataframe(dataset_dir, all_landmarks, all_labels, get_label_func=lambda k: k % 8):
    """Collect landmark features and labels for every image under ``dataset_dir``.

    Each sub-directory of ``dataset_dir`` is one class. Sub-directories are
    visited in natural (``numeric_key``) order and numbered 0, 1, 2, ...;
    entries that are not directories are skipped and do not consume a number.
    The label for a directory is ``get_label_func(number)``.

    Despite its name, this function does not build a DataFrame: results are
    appended to the two lists passed in by the caller.

    Args:
        dataset_dir: Root directory containing one sub-directory per class.
        all_landmarks: List that receives one feature vector per usable image.
        all_labels: List that receives the matching label for each vector.
        get_label_func: Maps the directory's running number to a label
            (default: the number modulo 8).

    Returns:
        None. ``all_landmarks`` and ``all_labels`` are extended in place.

    Raises:
        ValueError: Propagated from ``get_landmarks`` when an image file
            cannot be decoded.
    """
    k = 0
    for label_dir in sorted(os.listdir(dataset_dir), key=numeric_key):
        label_path = os.path.join(dataset_dir, label_dir)
        if not os.path.isdir(label_path):
            continue

        label = get_label_func(k)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # row order the same on every run, so a seeded shuffle/split
        # downstream is reproducible.
        for filename in sorted(os.listdir(label_path)):
            if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            landmarks = get_landmarks(os.path.join(label_path, filename))
            if landmarks is None:
                # No face detected in this image: skip it
                continue
            all_landmarks.append(landmarks)
            all_labels.append(label)
        k += 1


In [40]:
# Feature vectors and integer class codes for the multi-class dataset
all_landmarks, all_labels = [], []
build_dataframe(DATASET_ALL_DATA, all_landmarks, all_labels)

# Feature vectors and integer class codes for the binary dataset
all_landmarks_1, all_labels_1 = [], []
build_dataframe(DATASET_BINARY_DATA, all_landmarks_1, all_labels_1)

In [41]:
# Column names come from `columns`, built in the setup cell from `all_indices`
# (one x/y pair per selected landmark), so they are not recomputed here.

# Fixed seed for the row shuffle. Without it the frames are shuffled differently
# on every run, which makes the seeded train/test split further down
# non-reproducible. (Reproducibility also assumes the rows were collected in a
# stable order.)
SHUFFLE_SEED = 42

# DataFrame for the multi-class data
df_all = pd.DataFrame(all_landmarks, columns=columns)
df_all['label'] = all_labels
df_all = df_all.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# DataFrame for the binary data
df_binary = pd.DataFrame(all_landmarks_1, columns=columns)
df_binary['label'] = all_labels_1
df_binary = df_binary.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Show both frames
df_all, df_binary


(            x1        y1        x6        y6       x13       y13       x14  \
 0    -0.056275  0.372069 -0.035465  0.012923 -0.032392  0.682000 -0.030060   
 1    -0.238669  0.454142 -0.141827  0.058870 -0.150730  0.767197 -0.154339   
 2     0.016403  0.461038  0.011001  0.032303  0.008174  0.789515  0.011363   
 3     0.141761  0.460864  0.084964  0.035218  0.093990  0.756574  0.095902   
 4     0.200512  0.547385  0.140489  0.113695  0.067602  0.744201  0.071207   
 ...        ...       ...       ...       ...       ...       ...       ...   
 1861 -0.490961  0.621132 -0.294170  0.093957 -0.280231  1.006251 -0.286346   
 1862 -0.038136  0.505671 -0.022198  0.077738 -0.029449  0.763289 -0.032455   
 1863  0.070054  0.406436  0.043119  0.016765  0.031825  0.692666  0.032946   
 1864 -0.177963  0.382699 -0.115371  0.024289 -0.095928  0.691930 -0.096063   
 1865  0.204013  0.543701  0.129676  0.073789  0.128124  0.846799  0.132962   
 
            y14       x17       y17  ...       y97

In [42]:
# Display the shuffled multi-class frame
df_all

Unnamed: 0,x1,y1,x6,y6,x13,y13,x14,y14,x17,y17,...,y97,x168,y168,x191,y191,x291,y291,x326,y326,label
0,-0.056275,0.372069,-0.035465,0.012923,-0.032392,0.682000,-0.030060,0.688608,-0.031166,0.786063,...,0.447190,-0.030602,-0.049638,-0.254360,0.668893,0.297768,0.684216,0.046283,0.449374,0
1,-0.238669,0.454142,-0.141827,0.058870,-0.150730,0.767197,-0.154339,0.806276,-0.160055,0.929958,...,0.516141,-0.116331,-0.028348,-0.272347,0.743679,0.181437,0.808298,-0.087865,0.532349,0
2,0.016403,0.461038,0.011001,0.032303,0.008174,0.789515,0.011363,0.820005,0.011474,0.926612,...,0.539416,0.009672,-0.063509,-0.196937,0.782021,0.271838,0.796591,0.082711,0.540942,5
3,0.141761,0.460864,0.084964,0.035218,0.093990,0.756574,0.095902,0.763287,0.097568,0.857378,...,0.538906,0.069455,-0.059038,-0.127593,0.764854,0.303426,0.764973,0.173665,0.533908,5
4,0.200512,0.547385,0.140489,0.113695,0.067602,0.744201,0.071207,0.749891,0.063551,0.839906,...,0.578449,0.123378,-0.001713,-0.182617,0.713943,0.239714,0.685574,0.189466,0.570108,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1861,-0.490961,0.621132,-0.294170,0.093957,-0.280231,1.006251,-0.286346,1.031114,-0.291464,1.176857,...,0.699069,-0.244662,-0.042183,-0.346268,0.965687,0.152077,1.033642,-0.247789,0.724630,5
1862,-0.038136,0.505671,-0.022198,0.077738,-0.029449,0.763289,-0.032455,0.760988,-0.034484,0.846548,...,0.558909,-0.017975,-0.019580,-0.248018,0.735478,0.277650,0.750307,0.050024,0.562913,3
1863,0.070054,0.406436,0.043119,0.016765,0.031825,0.692666,0.032946,0.691850,0.034672,0.781316,...,0.471151,0.035515,-0.068068,-0.239840,0.657991,0.341032,0.677578,0.142759,0.474835,3
1864,-0.177963,0.382699,-0.115371,0.024289,-0.095928,0.691930,-0.096063,0.713766,-0.094313,0.822782,...,0.459937,-0.098081,-0.062837,-0.234070,0.678502,0.227120,0.704925,-0.045443,0.465418,4


In [43]:
# Distinct label values present in each frame (multi-class first, then binary)
print(df_all["label"].unique(), df_binary["label"].unique(), sep="\n")

[0 5 3 4 2 1 6 7]
[1 0]


In [44]:
# Number of rows per label in each frame (multi-class first, then binary)
print(df_all["label"].value_counts(), df_binary["label"].value_counts(), sep="\n")

label
0    648
3    487
5    428
4    288
2      5
6      5
1      3
7      2
Name: count, dtype: int64
label
1    1204
0     661
Name: count, dtype: int64


In [45]:
# --- Multi-class (subtype) model: 80/20 split, fit, evaluate ---
# This split is not stratified (unlike the binary one below).
# NOTE(review): the label counts printed above show several classes with only
# a handful of rows; consider stratify=df_all['label'] here.
X_train_subtype, X_test_subtype, y_train_subtype, y_test_subtype = train_test_split(
    df_all.drop(columns=['label']),   # every column except 'label'
    df_all['label'],                  # the labels themselves
    test_size=0.2,                    # 20% held out for testing
    random_state=42,
)
model_subtype = LogisticRegression(max_iter=1000).fit(X_train_subtype, y_train_subtype)
y_pred_subtype = model_subtype.predict(X_test_subtype)
print(f"Accuracy subtype: {accuracy_score(y_test_subtype, y_pred_subtype):.4f}")

# --- Binary model: stratified 80/20 split, fit, evaluate ---
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    df_binary.drop(columns=['label']),   # every column except 'label'
    df_binary['label'],                  # the labels themselves
    test_size=0.2,                       # 20% held out for testing
    stratify=df_binary['label'],         # keep class proportions in both parts
    random_state=42,
)
model_binary = LogisticRegression(max_iter=1000).fit(X_train_binary, y_train_binary)
y_pred_binary = model_binary.predict(X_test_binary)
print(f"Accuracy binary class: {accuracy_score(y_test_binary, y_pred_binary):.4f}")

Accuracy subtype: 0.4439
Accuracy binary class: 0.6542


In [46]:
def predict_dichotomy(image_path, model):
    """Predict the class of the face in ``image_path`` with ``model``.

    Landmark features are extracted with ``get_landmarks`` and wrapped in a
    one-row DataFrame using the training column names (``columns``), so the
    model receives the same feature layout it was fitted on.

    Args:
        image_path: Path of the image to classify.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.

    Returns:
        A ``(prediction, proba)`` tuple: the predicted label for the image and
        the per-class probabilities for it.

    Raises:
        ValueError: If the image cannot be decoded (raised by
            ``get_landmarks``) or no face is detected in it.
    """
    landmarks = get_landmarks(image_path)
    if landmarks is None:
        # get_landmarks returns None when no face is found. Fail here with a
        # clear message instead of an unrelated error from DataFrame construction.
        raise ValueError(f"Лицо на изображении не найдено: {image_path}")

    X_input = pd.DataFrame([landmarks], columns=columns)

    prediction = model.predict(X_input)[0]
    proba = model.predict_proba(X_input)[0]

    return prediction, proba

In [47]:
# Save the fitted multi-class (subtype) model to disk
joblib.dump(model_subtype, 'subtype_classifier.pkl')

# Save the fitted binary model to disk
# NOTE(review): the file name says "logic_ethics", while the prediction cell
# prints this model's output as 'Экстраверсия' / 'Интроверсия' — confirm which
# dichotomy the binary dataset actually encodes.
joblib.dump(model_binary, 'logic_ethics_classifier.pkl')

# Display names for the subtype class codes 0-7, in code order
subtypes = dict(enumerate([
    "Шизоидный",
    "Параноидальный",
    "Нарциссический",
    "Психопатический",
    "Компульсивный",
    "Истерический",
    "Депрессивный",
    "Мазохистический",
]))

In [48]:
# Reload both models from the files written by the previous cell.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
model_subtype_file = joblib.load("subtype_classifier.pkl")  # path to the saved model
model_binary_file = joblib.load("logic_ethics_classifier.pkl")  # path to the saved model

In [49]:
# Run both reloaded models on one sample image and print the results
sample_image = "../all_data/12/16 (2).jpg"

result_subtype, confidence_subtype = predict_dichotomy(sample_image, model_subtype_file)
print(f"Подтип личности: {subtypes[result_subtype]} (Уверенность: {max(confidence_subtype):.2f})")

result_binary, confidence_binary = predict_dichotomy(sample_image, model_binary_file)
# Binary label 0 is reported as 'Экстраверсия', any other value as 'Интроверсия'
if result_binary == 0:
    dichotomy_name = 'Экстраверсия'
else:
    dichotomy_name = 'Интроверсия'
print(f"Дихотомия: {dichotomy_name} (Уверенность: {max(confidence_binary):.2f})")


Подтип личности: Шизоидный (Уверенность: 0.28)
Дихотомия: Интроверсия (Уверенность: 0.70)
