In [1]:
import sys
import os
# Add the parent of the current working directory to the module search path,
# presumably so project-local modules one level up can be imported — TODO confirm.
# NOTE(review): depends on the kernel's working directory, and re-running the
# cell appends the same entry again.
sys.path.append(os.getcwd() + "/../")

In [2]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import itertools
from itertools import combinations


In [24]:
class WellSimMetric:
    """
    Measures how similar two wells are.

    Two wells are called similar when linear regressions fitted on each of
    them behave alike: for identical input conditions the predicted
    bottomhole pressures differ by no more than a relative threshold
    (10% by default).
    """

    def __init__(self, 
                 X1_train: np.ndarray, 
                 X2_train: np.ndarray, 
                 y1_train: np.ndarray, 
                 y2_train: np.ndarray):
        """Store the training data and fit one linear regression per well.

        Args:
            X1_train (np.ndarray): features for well 1
            X2_train (np.ndarray): features for well 2
            y1_train (np.ndarray): bottomhole pressure for well 1
            y2_train (np.ndarray): bottomhole pressure for well 2
        """
        self.X1_train = X1_train
        self.X2_train = X2_train
        self.y1_train = y1_train
        self.y2_train = y2_train

        # Fit an independent linear regression model for each well
        self.model1 = LinearRegression().fit(X1_train, y1_train)
        self.model2 = LinearRegression().fit(X2_train, y2_train)
    
    def get_error(self, y1_pred: np.ndarray, y2_pred: np.ndarray) -> np.ndarray:
        """Compute the symmetric relative difference between two predictions.

        The value is ``|y1 - y2| / ((|y1| + |y2|) / 2)``. For positive
        predictions this equals the difference divided by the plain mean.

        Args:
            y1_pred (np.ndarray): predicted bottomhole pressure for well 1
            y2_pred (np.ndarray): predicted bottomhole pressure for well 2
        Returns:
            (np.ndarray): non-negative relative error per sample (not a 0/1
                flag); 0.0 where both predictions are exactly zero.
        """
        y1 = np.asarray(y1_pred, dtype=float)
        y2 = np.asarray(y2_pred, dtype=float)

        diff = np.abs(y1 - y2)
        # Mean of magnitudes rather than the signed mean: a signed mean turns
        # negative for negative predictions, which would make the error
        # negative and let it pass any "<= threshold" test.
        scale = (np.abs(y1) + np.abs(y2)) / 2

        # scale == 0 only when both predictions are zero, i.e. they agree, so
        # the error stays 0 there. NaN scales are still divided so that NaN
        # propagates instead of being reported as a perfect match.
        error = np.zeros_like(diff)
        np.divide(diff, scale, out=error, where=scale != 0)
        return error

    def get_sim(self, threshold: float = 0.1):
        """Flag, per sample, whether the two wells' models agree.

        Both fitted models predict on the concatenation of the two training
        feature sets (well 1 rows first, then well 2 rows).

        Args:
            threshold (float): largest relative error still counted as
                similar. Defaults to 0.1 (10%).
        Returns:
            (np.ndarray): integer array with 1 where the relative error is
                within the threshold and 0 elsewhere.
        """
        X = np.concatenate([self.X1_train, self.X2_train])

        # Predict the target variable with both wells' models on the same inputs
        y1_pred = self.model1.predict(X)
        y2_pred = self.model2.predict(X)

        # 1 = similar, 0 = not similar
        similarity = (self.get_error(y1_pred, y2_pred) <= threshold).astype(int)

        return similarity

In [29]:
# Read the cleaned dataset, using the first CSV column as the row index,
# then print a per-column summary (dtype and non-null count).
df = pd.read_csv(
    "../data/cleaned/data.csv",
    index_col=0,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 180 entries, 0 to 179
Data columns (total 22 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Обводненность_для_расчета_PVT_параметров              180 non-null    float64
 1   Глубина_по_стволу_cs                                  180 non-null    float64
 2   Внутренний_диаметр_эксплуатационной_колонны_cs        180 non-null    float64
 3   Внутренняя_шероховатость_эксплуатационной_колонны_cs  180 non-null    float64
 4   Глубина_по_стволу                                     180 non-null    float64
 5   Внутренний_диаметр_НКТ                                180 non-null    float64
 6   Внутренняя_шероховатость_НКТ                          180 non-null    float64
 7   Внешний_диаметр_НКТ                                   180 non-null    float64
 8   Внутренний_диаметр_эксплуатационной_колонны           180 non-nul

In [26]:
# Absolute pairwise column correlations (sign discarded), rendered as a
# colour-graded table.
(
    df.corr()
    .abs()
    .style
    .background_gradient(cmap='coolwarm')
)

Unnamed: 0,Обводненность_для_расчета_PVT_параметров,Глубина_по_стволу_cs,Внутренний_диаметр_эксплуатационной_колонны_cs,Внутренняя_шероховатость_эксплуатационной_колонны_cs,Глубина_по_стволу,Внутренний_диаметр_НКТ,Внутренняя_шероховатость_НКТ,Внешний_диаметр_НКТ,Внутренний_диаметр_эксплуатационной_колонны,FILTERED_MD_LAST_VALUE,FILTERED_TVD_LAST_VALUE,Газовый_фактор,Коэффициент_продуктиности,Глубина_спуска_ЭЦН_по_стволу_MD,Частота_ЭЦН,Количество_ступеней,Коэффициент_износа,Содержание_растворенного_газа,Плотность_нефти,Удельный_вес_газа,Корреляция_вязкости_нефти,Пластовое_давление
Обводненность_для_расчета_PVT_параметров,1.0,0.069288,0.215585,0.221408,0.16245,0.171651,0.083619,0.228179,0.048906,0.043712,0.086348,0.207163,0.076008,0.253699,0.22874,0.215207,0.113312,0.080067,0.091638,0.151553,0.137453,0.173392
Глубина_по_стволу_cs,0.069288,1.0,0.147945,0.03799,0.66485,0.481459,0.010674,0.508891,0.309616,0.374321,0.109411,0.149736,0.021812,0.368636,0.311839,0.31964,0.031813,0.053956,0.061299,0.109803,0.264291,0.010498
Внутренний_диаметр_эксплуатационной_колонны_cs,0.215585,0.147945,1.0,0.297272,0.114948,0.030786,0.11819,0.026511,0.554616,0.127245,0.07561,0.106524,0.019079,0.281994,0.255076,0.272264,0.060398,0.096513,0.009134,0.022744,0.03543,0.063037
Внутренняя_шероховатость_эксплуатационной_колонны_cs,0.221408,0.03799,0.297272,1.0,0.078296,0.165349,0.38388,0.179391,0.148703,0.12032,0.288188,0.17517,0.059616,0.011567,0.01674,0.034118,0.044155,0.110607,0.066677,0.08601,0.110222,0.032932
Глубина_по_стволу,0.16245,0.66485,0.114948,0.078296,1.0,0.63445,0.014636,0.681883,0.292987,0.201127,0.070253,0.194575,0.000393,0.41289,0.314139,0.349195,0.054222,0.033641,0.115874,0.152043,0.21542,0.123064
Внутренний_диаметр_НКТ,0.171651,0.481459,0.030786,0.165349,0.63445,1.0,0.029192,0.94171,0.11039,0.128562,0.006147,0.120132,0.052294,0.409359,0.416286,0.411154,0.136703,0.075255,0.169144,0.308202,0.288326,0.129684
Внутренняя_шероховатость_НКТ,0.083619,0.010674,0.11819,0.38388,0.014636,0.029192,1.0,0.04736,0.056301,0.136455,0.121718,0.110562,0.015847,0.050843,0.055202,0.000121,0.004933,0.039591,0.035457,0.084248,0.105263,0.086537
Внешний_диаметр_НКТ,0.228179,0.508891,0.026511,0.179391,0.681883,0.94171,0.04736,1.0,0.122997,0.102295,0.098955,0.152094,0.057377,0.375049,0.382364,0.377257,0.126607,0.073914,0.160964,0.292732,0.272553,0.090944
Внутренний_диаметр_эксплуатационной_колонны,0.048906,0.309616,0.554616,0.148703,0.292987,0.11039,0.056301,0.122997,1.0,0.092307,0.015262,0.011025,0.009718,0.121552,0.089868,0.116762,0.078023,0.042122,0.036382,0.025972,0.066626,0.026398
FILTERED_MD_LAST_VALUE,0.043712,0.374321,0.127245,0.12032,0.201127,0.128562,0.136455,0.102295,0.092307,1.0,0.201412,0.058547,0.072268,0.134753,0.042354,0.060902,0.155578,0.073092,0.017825,0.275204,0.278534,0.20057


In [27]:
# Build every unordered pair of distinct objects for each query.
required_columns = ["Query", "Object1", "Object2"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    # Fail early with an actionable message instead of the bare
    # KeyError: 'Query' that groupby raises on a frame without these columns.
    raise KeyError(
        f"df lacks the columns needed to build query pairs: {missing_columns}; "
        f"available columns: {list(df.columns)}"
    )

pairs = []
for query, group in df.groupby("Query"):
    # De-duplicate while keeping first-seen order. itertools.combinations
    # treats elements as distinct by position, so repeated objects would
    # otherwise produce self-pairs such as (a, a) and repeated pairs.
    objects = list(
        dict.fromkeys(group["Object1"].tolist() + group["Object2"].tolist())
    )
    pairs.extend(
        (query, obj1, obj2) for obj1, obj2 in itertools.combinations(objects, 2)
    )

# Assemble the candidate pairs into the training frame.
train_data = pd.DataFrame(pairs, columns=required_columns)

# Left-join onto the original table to bring in its remaining columns
# (presumably the similarity labels — TODO confirm).
# NOTE(review): a pair is matched only in the same (Object1, Object2) order as
# in df; pairs that exist only in reversed order get NaN in the joined columns.
train_data = train_data.merge(df, on=required_columns, how="left")

# Show the first rows of the training set as the cell output.
train_data.head()


KeyError: 'Query'

In [34]:
# Collect every 2-combination of the feature values in the first row of df
# and print the resulting list. Only the first row is processed.
pairs = []
for _, first_row in df.head(1).iterrows():
    pairs.extend(combinations(first_row.values, 2))
    print(pairs)

[(1.200000048, 3026.0), (1.200000048, 0.113), (1.200000048, 0.0001524), (1.200000048, 2851.75), (1.200000048, 0.062), (1.200000048, 0.0001524), (1.200000048, 0.073), (1.200000048, 0.1598), (1.200000048, 3026.0), (1.200000048, 3025.8), (1.200000048, 187.626420312), (1.200000048, 5.967998276), (1.200000048, 2851.749876563), (1.200000048, 60.0), (1.200000048, 354.0), (1.200000048, 0.899999976), (1.200000048, 143.899994995), (1.200000048, 867.399978143), (1.200000048, 0.879400015), (1.200000048, 1.0), (1.200000048, 300.103132602), (3026.0, 0.113), (3026.0, 0.0001524), (3026.0, 2851.75), (3026.0, 0.062), (3026.0, 0.0001524), (3026.0, 0.073), (3026.0, 0.1598), (3026.0, 3026.0), (3026.0, 3025.8), (3026.0, 187.626420312), (3026.0, 5.967998276), (3026.0, 2851.749876563), (3026.0, 60.0), (3026.0, 354.0), (3026.0, 0.899999976), (3026.0, 143.899994995), (3026.0, 867.399978143), (3026.0, 0.879400015), (3026.0, 1.0), (3026.0, 300.103132602), (0.113, 0.0001524), (0.113, 2851.75), (0.113, 0.062), (0.1

In [32]:
# Display the current contents of `pairs` as the cell output.
# NOTE(review): this prints the entire list; consider showing a slice such as pairs[:5].
pairs

[(1.200000048, 3026.0),
 (1.200000048, 0.113),
 (1.200000048, 0.0001524),
 (1.200000048, 2851.75),
 (1.200000048, 0.062),
 (1.200000048, 0.0001524),
 (1.200000048, 0.073),
 (1.200000048, 0.1598),
 (1.200000048, 3026.0),
 (1.200000048, 3025.8),
 (1.200000048, 187.626420312),
 (1.200000048, 5.967998276),
 (1.200000048, 2851.749876563),
 (1.200000048, 60.0),
 (1.200000048, 354.0),
 (1.200000048, 0.899999976),
 (1.200000048, 143.899994995),
 (1.200000048, 867.399978143),
 (1.200000048, 0.879400015),
 (1.200000048, 1.0),
 (1.200000048, 300.103132602),
 (3026.0, 0.113),
 (3026.0, 0.0001524),
 (3026.0, 2851.75),
 (3026.0, 0.062),
 (3026.0, 0.0001524),
 (3026.0, 0.073),
 (3026.0, 0.1598),
 (3026.0, 3026.0),
 (3026.0, 3025.8),
 (3026.0, 187.626420312),
 (3026.0, 5.967998276),
 (3026.0, 2851.749876563),
 (3026.0, 60.0),
 (3026.0, 354.0),
 (3026.0, 0.899999976),
 (3026.0, 143.899994995),
 (3026.0, 867.399978143),
 (3026.0, 0.879400015),
 (3026.0, 1.0),
 (3026.0, 300.103132602),
 (0.113, 0.0001524