### Домашняя работа к Уроку 6
### Студент: Абрамов А.В.

#### 1. Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

In [2]:
import pandas as pd
import numpy as np
from io import StringIO
import requests

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

import warnings
warnings.filterwarnings('ignore')

In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00562/Shill Bidding Dataset.csv"
data_text = requests.get(url).text
data_file = StringIO(data_text)
#data = pd.read_csv(data_file, index_col='Record_ID')
data = pd.read_csv(data_file)
data.head(3)

Unnamed: 0,Record_ID,Auction_ID,Bidder_ID,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,1,732,_***i,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0
1,2,732,g***r,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0
2,3,732,t***p,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0


In [4]:
df = data.copy()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6321 entries, 0 to 6320
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Record_ID               6321 non-null   int64  
 1   Auction_ID              6321 non-null   int64  
 2   Bidder_ID               6321 non-null   object 
 3   Bidder_Tendency         6321 non-null   float64
 4   Bidding_Ratio           6321 non-null   float64
 5   Successive_Outbidding   6321 non-null   float64
 6   Last_Bidding            6321 non-null   float64
 7   Auction_Bids            6321 non-null   float64
 8   Starting_Price_Average  6321 non-null   float64
 9   Early_Bidding           6321 non-null   float64
 10  Winning_Ratio           6321 non-null   float64
 11  Auction_Duration        6321 non-null   int64  
 12  Class                   6321 non-null   int64  
dtypes: float64(8), int64(4), object(1)
memory usage: 642.1+ KB


In [6]:
df.isnull().sum() # значений Null нет

Record_ID                 0
Auction_ID                0
Bidder_ID                 0
Bidder_Tendency           0
Bidding_Ratio             0
Successive_Outbidding     0
Last_Bidding              0
Auction_Bids              0
Starting_Price_Average    0
Early_Bidding             0
Winning_Ratio             0
Auction_Duration          0
Class                     0
dtype: int64

In [7]:
df.duplicated().sum() # дублей нет

0

In [8]:
df.drop(['Bidder_ID','Record_ID'], axis=1, inplace=True) # дропаем столбцы с ID - никакой информации, кроме идентификатора

In [9]:
df.head()

Unnamed: 0,Auction_ID,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,732,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0
1,732,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0
2,732,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0
3,732,0.1,0.2,0.0,0.097477,0.0,0.993593,0.097477,1.0,5,0
4,900,0.051282,0.222222,0.0,0.001318,0.0,0.0,0.001242,0.5,7,0


In [10]:
# Нормализуем 2 столбца: Auction_ID и Auction_Duration
df1 = df[['Auction_ID','Auction_Duration']]
df2 = df.drop(columns = ['Auction_ID','Auction_Duration'])

minmax = MinMaxScaler().fit(df1)
minmax = minmax.transform(df1)

df1 = pd.DataFrame(minmax, columns = ['Auction_ID','Auction_Duration'])
df = df1.join(df2, how='outer')
df.head()

Unnamed: 0,Auction_ID,Auction_Duration,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Class
0,0.287011,0.444444,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,0
1,0.287011,0.444444,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,0
2,0.287011,0.444444,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,0
3,0.287011,0.444444,0.1,0.2,0.0,0.097477,0.0,0.993593,0.097477,1.0,0
4,0.353336,0.666667,0.051282,0.222222,0.0,0.001318,0.0,0.0,0.001242,0.5,0


In [11]:
# разделяем данные на positives и unlabeled
positives = df[df['Class'] == 1] 
unlabeled = df[df['Class'] == 0]

In [12]:
# формируем выборку с негативными примерами, объемом, как с позитивными
n_samples = len(positives)
negative_samples = resample(unlabeled, replace=False, n_samples=n_samples, random_state=42)

In [13]:
# получаем сбалансированнную выборку, где поровну и позитивов, и негативов
balanced_df = pd.concat([positives, negative_samples])

#### 2. Обучить любой классификатор (какой вам нравится)

In [14]:
X = balanced_df.drop('Class', axis=1) # признаки
y = balanced_df['Class'] # целевая переменная

In [15]:
X_train, X_test, y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=17)

In [16]:
# будем применять модель логистич. регрессии
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [17]:
y_pred = model.predict(X_test) # предсказанные тестовые значения

In [18]:
# Оценка модели
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
roc = roc_auc_score(Y_test, y_pred)

# Вывод результатов
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Roc_auc_score:", roc)

Accuracy: 0.9777777777777777
Precision: 0.958041958041958
Recall: 1.0
F1-score: 0.9785714285714286
Roc_auc_score: 0.9774436090225564


In [20]:
metrix['accuracy'].append(round(accuracy_score(Y_test, y_pred), 4))
metrix['ROC AUC'].append(round(roc_auc_score(Y_test, y_pred), 4))
metrix['precision'].append(round(precision_score(Y_test, y_pred), 4))
metrix['recall'].append(round(recall_score(Y_test, y_pred), 4))
metrix['F-score'].append(round(f1_score(Y_test, y_pred), 4))

#### 3. Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные примеры (класс 1), а только лишь часть

In [22]:
# Разделение на положительные и неразмеченные примеры
positives = df[df['Class'] == 1].sample(frac=0.3, random_state=17)  # Выбираем только 30% положительных примеров
unlabeled = df[df['Class'] == 0]

# Вывод информации о размере полученных множеств
print("Размер множества P (positives):", len(positives))
print("Размер множества U (unlabeled):", len(unlabeled))

Размер множества P (positives): 202
Размер множества U (unlabeled): 5646


#### 4. Применить random negative sampling для построения классификатора в новых условиях

In [23]:
# Выделение признаков и целевой переменной
x_data = df.copy()
y_data = x_data.pop('Class')

# Разделение на обучающую и тестовую выборки
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=17)

# Создание и обучение модели Logistic Regression
model = LogisticRegression()
model.fit(x_train, y_train)

# Прогнозирование на тестовых данных
y_pred = model.predict(x_test)

# Оценка модели
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)

# Вывод результатов
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Roc_auc_score:", roc)

Accuracy: 0.9699604743083003
Precision: 0.8741258741258742
Recall: 0.8620689655172413
F1-score: 0.8680555555555555
Roc_auc_score: 0.9229987684729065


In [25]:
metrix['accuracy'].append(round(accuracy_score(y_test, y_pred), 4))
metrix['ROC AUC'].append(round(roc_auc_score(y_test, y_pred), 4))
metrix['precision'].append(round(precision_score(y_test, y_pred), 4))
metrix['recall'].append(round(recall_score(y_test, y_pred), 4))
metrix['F-score'].append(round(f1_score(y_test, y_pred), 4))

#### 5. Сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)

In [26]:
# Сформируем рыбу для наполнения сводной таблицы значений метрик

#metrix = {'model':['Logistic Regression', 'Random Negative Sampling'],'accuracy':[], 'ROC AUC':[], 'precision':[], 'recall':[], 'F-score':[]}
metrix

{'model': ['Logistic Regression', 'Random Negative Sampling'],
 'accuracy': [0.9778, 0.97],
 'ROC AUC': [0.9774, 0.923],
 'precision': [0.958, 0.8741],
 'recall': [1.0, 0.8621],
 'F-score': [0.9786, 0.8681]}

In [27]:
metrix_table = pd.DataFrame(metrix)
metrix_table

Unnamed: 0,model,accuracy,ROC AUC,precision,recall,F-score
0,Logistic Regression,0.9778,0.9774,0.958,1.0,0.9786
1,Random Negative Sampling,0.97,0.923,0.8741,0.8621,0.8681


In [None]:
# Логистическая регрессия оказалась точнее по всем показателям