In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv('./train_df.csv')
test = pd.read_csv('./test_df.csv')

# Исследование данных

In [2]:
train.head()

Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
0,758,9,0,0,1,20,3,40,0,3,...,0.204682,0.271755,0.055623,0,0,0,0.38648,0.0,0.0,0
1,758,9,0,0,1,20,3,40,0,3,...,0.195531,0.188787,0.036914,0,0,0,0.10982,0.0,0.0,0
2,758,9,0,0,1,20,3,40,0,3,...,0.148609,0.186517,0.027718,0,0,0,0.03674,0.0,0.0,0
3,758,9,0,0,1,20,3,40,0,3,...,0.223748,0.229039,0.051247,0,0,0,0.0,0.0,0.0,0
4,758,9,0,0,1,20,3,40,0,3,...,0.170935,0.249031,0.042568,0,0,0,0.0,0.0,0.0,0


In [3]:
test.head()

Unnamed: 0,search_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,target
0,10655,9,0,0,1,20,4,40,0,0,...,0.14883,0.196644,0.029267,0,0,0,0.03674,0.0,0.0,0
1,10655,9,0,0,1,20,4,40,0,0,...,0.119724,0.174199,0.020856,0,0,0,0.0,0.0,0.0,0
2,10655,9,0,0,1,20,4,40,0,0,...,0.160606,0.19878,0.031925,0,0,0,0.0,0.0,0.0,0
3,10655,9,0,0,1,20,4,40,0,0,...,0.180191,0.187882,0.033855,0,0,0,0.0,0.0,0.0,0
4,10655,9,0,0,1,20,4,40,0,0,...,0.117308,0.153586,0.018017,0,0,0,0.0,0.0,0.0,0


In [4]:
train.shape, test.shape

((15081, 81), (1529, 81))

In [5]:
len(train['search_id'].unique())

1000

In [6]:
len(test['search_id'].unique())

100

In [7]:
len(set(train['search_id'].values).intersection(set(test['search_id'].values)))

0

In [8]:
train['search_id'].value_counts()

search_id
156182    20
8591      20
226704    20
227432    20
315998    20
          ..
155433     1
271166     1
178343     1
387764     1
303366     1
Name: count, Length: 1000, dtype: int64

In [9]:
# доля 1 
np.mean(train['target'])

0.021351369272594657

# Самый простой бейзлайн

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train.iloc[:, 1:-1].values, train['target'].values, test_size=0.3, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)

predictions = model.predict_proba(X_val)[:, 1]

In [11]:
from sklearn.metrics import ndcg_score
from sklearn.metrics import roc_auc_score

# Функция для вычисления NDCG на основе фактических и предсказанных рейтингов
def calculate_ndcg(y_true, y_score):
    true_relevance = np.asarray([y_true])
    scores = np.asarray([y_score])
    ndcg = ndcg_score(true_relevance, scores)
    return ndcg

# Вычисляем NDCG для валидационной выборки
calculate_ndcg(y_val, predictions)


0.54112926198737

In [12]:
calculate_ndcg(test['target'].values, model.predict(test.iloc[:, 1:-1].values))

0.3899032693596343

In [13]:
roc_auc_score(test['target'].values, model.predict(test.iloc[:, 1:-1].values))

0.5

# Обработка признаков

In [15]:
X_train = train.drop(columns=['search_id', 'target'])
y_train = train['target']

X_test = test.drop(columns=['search_id', 'target'])
y_test = test['target']

In [16]:
# Определяем константные признаки в обучающем наборе данных
constant_columns_train = [col for col in X_train.columns if X_train[col].nunique() == 1]

# Проверяем константные признаки в тестовом наборе данных
constant_columns_test = [col for col in X_test.columns if X_test[col].nunique() == 1]

# Пересечение константных признаков в обучающем и тестовом наборе данных
constant_features = list(set(constant_columns_train) & set(constant_columns_test))

# Изучаем категориальные признаки (будем считать категориальными те признаки, которые имеют меньше 10 уникальных значений)
categorical_features_train = [col for col in X_train.columns if 1 < X_train[col].nunique() <= 10]
categorical_features_test = [col for col in X_test.columns if 1 < X_test[col].nunique() <= 10]

# Пересечение категориальных признаков в обучающем и тестовом наборе данных
categorical_features = list(set(categorical_features_train) & set(categorical_features_test))


In [17]:
from sklearn.preprocessing import OneHotEncoder

# Создаем объект OneHotEncoder, указывая drop='first' для избежания ловушки фиктивных переменных
encoder = OneHotEncoder(drop='first', sparse=False)

# Обучаем энкодер на категориальных признаках обучающего набора данных и преобразуем их
encoded_cats_train = encoder.fit_transform(X_train[categorical_features])

# Преобразуем категориальные признаки тестового набора данных
encoded_cats_test = encoder.transform(X_test[categorical_features])

# Посмотрим на размерность полученных массивов после кодирования
encoded_cats_train.shape, encoded_cats_test.shape




((15081, 46), (1529, 46))

In [19]:
# Сначала удалим из исходных данных константные и категориальные признаки
non_categorical_features = list(set(X_train.columns) - set(categorical_features) - set(constant_features))

# Формируем исходные некатегориальные данные для обучающего и тестового наборов данных
non_cat_train_df = X_train[non_categorical_features]
non_cat_test_df = X_test[non_categorical_features]

# Создаем DataFrame из закодированных категориальных данных для обучающего и тестового наборов
encoded_cats_train_df = pd.DataFrame(encoded_cats_train, columns=encoder.get_feature_names_out(categorical_features))
encoded_cats_test_df = pd.DataFrame(encoded_cats_test, columns=encoder.get_feature_names_out(categorical_features))

# Сбрасываем индексы, чтобы избежать проблем при конкатенации
non_cat_train_df.reset_index(drop=True, inplace=True)
encoded_cats_train_df.reset_index(drop=True, inplace=True)
non_cat_test_df.reset_index(drop=True, inplace=True)
encoded_cats_test_df.reset_index(drop=True, inplace=True)

# Конкатенируем некатегориальные и закодированные категориальные данные
final_train_df = pd.concat([non_cat_train_df, encoded_cats_train_df], axis=1)
final_test_df = pd.concat([non_cat_test_df, encoded_cats_test_df], axis=1)

# Возвращаем размерности итоговых датасетов и их столбцы для проверки
final_train_df.shape, final_test_df.shape

((15081, 106), (1529, 106))

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_train_df = scaler.fit_transform(final_train_df)
scaled_test_df = scaler.transform(final_test_df)

In [21]:
import statsmodels.api as sm
X_train_const = sm.add_constant(scaled_train_df)
model = sm.Logit(y_train, X_train_const).fit(disp=0)

model.llr_pvalue # <0.05 => есть связь между признаками и значениями класса




2.4884132257387267e-27

In [22]:
p_values = model.pvalues[1:]  # исключаем константу

# Определяем статистически не значимые признаки
insignificant_feature_indices = np.where(p_values > 0.05)[0]

# Удаление не значимых признаков из X_train и X_test
X_train_reduced = np.delete(
    scaled_train_df, insignificant_feature_indices, axis=1)
X_test_reduced = np.delete(
    scaled_test_df, insignificant_feature_indices, axis=1)

X_train_reduced.shape, X_test_reduced.shape


((15081, 21), (1529, 21))

In [23]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_reduced, y_train)
predictions = clf.predict_proba(X_test_reduced)[:, 1]

In [24]:
calculate_ndcg(y_test.values, predictions)

0.5978731006806096

In [25]:
roc_auc_score(y_test.values, predictions)

0.775368876647649