In [72]:
import pandas as pd
import numpy as np
 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import pickle

df = pd.read_csv('bank-full.csv', sep=';')
columns= [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'
]
df = df[columns]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [73]:
df.isnull().sum()
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [75]:
mode = df['education'].mode()

print(mode)

0    secondary
Name: education, dtype: object


In [76]:
data = pd.read_csv('bank-full.csv', sep=';')

selected_features = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
data = data[selected_features]

# Заменяем 'yes' на 1 и 'no' на 0 в столбце 'y'
data['y'] = data['y'].map({'yes': 1, 'no': 0})
print(data.head())

   age           job  marital  education  balance housing  contact  day month  \
0   58    management  married   tertiary     2143     yes  unknown    5   may   
1   44    technician   single  secondary       29     yes  unknown    5   may   
2   33  entrepreneur  married  secondary        2     yes  unknown    5   may   
3   47   blue-collar  married    unknown     1506     yes  unknown    5   may   
4   33       unknown   single    unknown        1      no  unknown    5   may   

   duration  campaign  pdays  previous poutcome  y  
0       261         1     -1         0  unknown  0  
1       151         1     -1         0  unknown  0  
2        76         1     -1         0  unknown  0  
3        92         1     -1         0  unknown  0  
4       198         1     -1         0  unknown  0  


In [77]:
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])

selected_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
X = data[selected_features]

y = data['y']

# Разделяем данные на тренировочный, валидационный и тестовый наборы
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42) 
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Тренировочный набор: {X_train.shape}, {y_train.shape}")
print(f"Валидационный набор: {X_val.shape}, {y_val.shape}")
print(f"Тестовый набор: {X_test.shape}, {y_test.shape}")

mutual_info = mutual_info_classif(X_train, y_train, discrete_features=True)

mutual_info_df = pd.DataFrame(mutual_info, index=X_train.columns, columns=['Mutual Information'])
mutual_info_df['Mutual Information'] = mutual_info_df['Mutual Information'].round(2)
print(mutual_info_df)

# Находим переменную с наибольшей взаимной информацией
max_mutual_info_index = mutual_info_df['Mutual Information'].idxmax()
print(f"\nПеременная с наибольшей взаимной информацией: {max_mutual_info_index}")

Тренировочный набор: (27126, 7), (27126,)
Валидационный набор: (9042, 7), (9042,)
Тестовый набор: (9043, 7), (9043,)
           Mutual Information
job                      0.01
marital                  0.00
education                0.00
housing                  0.01
contact                  0.01
month                    0.02
poutcome                 0.03

Переменная с наибольшей взаимной информацией: poutcome


In [78]:
ohe = OneHotEncoder(handle_unknown='ignore') 
X_train = ohe.fit_transform(X_train)
X_val = ohe.transform(X_val)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

# Рассчитываем точность
accuracy = accuracy_score(y_val, y_pred)
print(f"Точность на валидационном наборе: {accuracy:.2f}")

Точность на валидационном наборе: 0.89


In [80]:
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])

selected_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome', 'age', 'balance', 'previous', 'duration', 'pdays', 'campaign']
X = data[selected_features]

y = data['y']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42) 
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
categorical_cols = X.select_dtypes(include=['object']).columns

encoder = OneHotEncoder(sparse_output=False, drop='first')
X_encoded = encoder.fit_transform(X[categorical_cols])

feature_names = encoder.get_feature_names_out(categorical_cols)

X_train_df = pd.DataFrame(X_encoded, columns=feature_names)

numeric_cols = X.select_dtypes(include=['number']).columns

X_train_df = pd.concat([X_train_df, X[numeric_cols].reset_index(drop=True)], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_train_df, y, test_size=0.2, random_state=42)

mutual_info = mutual_info_classif(X_train, y_train)

mutual_info_df = pd.DataFrame(mutual_info, index=X_train.columns, columns=['Mutual Info']).sort_values(by='Mutual Info', ascending=False)

print(mutual_info_df.head())

          Mutual Info
duration     0.070319
poutcome     0.033417
month        0.026608
pdays        0.024865
balance      0.019635


In [81]:
C_values = [0.01, 0.1, 1, 10]
best_accuracy = 0
best_C = None

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    print(f"Точность для C = {C}: {accuracy:.3f}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_C = C

print(f"\nНаилучшая точность: {best_accuracy:.3f} (C = {best_C})")

Точность для C = 0.01: 0.890
Точность для C = 0.1: 0.891
Точность для C = 1: 0.891
Точность для C = 10: 0.890

Наилучшая точность: 0.891 (C = 0.1)
