In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import pickle

In [37]:
#  Загрузка данных, показ первых 5 строк
data = pd.read_csv('bank-full.csv', sep=';') 
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [38]:
# Выбор признаков
features = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
data = data[features]
print("\nОтобранные признаки:\n", data.head())


Отобранные признаки:
    age           job  marital  education  balance housing  contact  day month  \
0   58    management  married   tertiary     2143     yes  unknown    5   may   
1   44    technician   single  secondary       29     yes  unknown    5   may   
2   33  entrepreneur  married  secondary        2     yes  unknown    5   may   
3   47   blue-collar  married    unknown     1506     yes  unknown    5   may   
4   33       unknown   single    unknown        1      no  unknown    5   may   

   duration  campaign  pdays  previous poutcome   y  
0       261         1     -1         0  unknown  no  
1       151         1     -1         0  unknown  no  
2        76         1     -1         0  unknown  no  
3        92         1     -1         0  unknown  no  
4       198         1     -1         0  unknown  no  


In [39]:
# Проверка на наличие пропусков
print(data.isnull().sum())

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [40]:
# Вопрос 1. Какое самое частое значение (mode) для столбца education?
education = data['education']
mode_value = education.mode()[0]
print(f"Самое частое значение в столбце 'education': {mode_value}")

Самое частое значение в столбце 'education': secondary


In [41]:
# Отбор численных признаков
num_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
num_data = data[num_features]

# Расчет корреляционной матрицы для численных признаков
correlation_matrix = num_data.corr()
print("\nКорреляционная матрица:\n", correlation_matrix)


Корреляционная матрица:
                age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [42]:
# Вопрос 2. Какие два признака имеют наибольшую корреляцию?
max_correlation = 0
correlated_features = ('', '')
for i in range(len(correlation_matrix.columns)):
    for j in range(i + 1, len(correlation_matrix.columns)):
        correlation = abs(correlation_matrix.iloc[i, j])
        if correlation > max_correlation:
            max_correlation = correlation
            correlated_features = (correlation_matrix.columns[i], correlation_matrix.columns[j])

print(f"Наибольшая корреляция: {max_correlation} между {correlated_features[0]} и {correlated_features[1]}")

Наибольшая корреляция: 0.4548196354805043 между pdays и previous


In [43]:
# Подготовка данных
data = pd.read_csv('bank-full.csv', sep=';')
data = data[features]
# Замена 'yes' на 1 и 'no' на 0 в столбце 'y'
data['y'] = data['y'].map({'yes': 1, 'no': 0})
print(data.head())

   age           job  marital  education  balance housing  contact  day month  \
0   58    management  married   tertiary     2143     yes  unknown    5   may   
1   44    technician   single  secondary       29     yes  unknown    5   may   
2   33  entrepreneur  married  secondary        2     yes  unknown    5   may   
3   47   blue-collar  married    unknown     1506     yes  unknown    5   may   
4   33       unknown   single    unknown        1      no  unknown    5   may   

   duration  campaign  pdays  previous poutcome  y  
0       261         1     -1         0  unknown  0  
1       151         1     -1         0  unknown  0  
2        76         1     -1         0  unknown  0  
3        92         1     -1         0  unknown  0  
4       198         1     -1         0  unknown  0  


In [44]:
# Кодирование категориальных переменных
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']):
    data[column] = label_encoder.fit_transform(data[column])
    
# Выбор категориальных призников
features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
X = data[features]
# y отдельно
Y = data['y']

# Разделение данных на тренировочный, валидационный и тестовый наборы
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.4, random_state=42) 
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)
print(f"Тренировочный набор: {X_train.shape}, {Y_train.shape}")
print(f"Валидационный набор: {X_val.shape}, {Y_val.shape}")
print(f"Тестовый набор: {X_test.shape}, {Y_test.shape}")

# Вычисление взаимной информации
mutual_info = mutual_info_classif(X_train, Y_train, discrete_features=True)
mutual_info_df = pd.DataFrame(mutual_info, index=X_train.columns, columns=['Mutual Information'])
mutual_info_df['Mutual Information'] = mutual_info_df['Mutual Information'].round(2)
print(mutual_info_df)

# Вопрос 3. Какая из переменных имеет наибольшую взаимную информацию?
max_mutual_info_index = mutual_info_df['Mutual Information'].idxmax()
print(f"\nПеременная с наибольшей взаимной информацией: {max_mutual_info_index}")

Тренировочный набор: (27126, 7), (27126,)
Валидационный набор: (9042, 7), (9042,)
Тестовый набор: (9043, 7), (9043,)
           Mutual Information
job                      0.01
marital                  0.00
education                0.00
housing                  0.01
contact                  0.01
month                    0.02
poutcome                 0.03

Переменная с наибольшей взаимной информацией: poutcome


In [45]:
# One-hot кодирование
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded = encoder.transform(X_val)

# Обучение модели логистической регрессии
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, Y_train)

# Вопрос 4. Определить точность на валидационном наборе данных
Y_predicted = model.predict(X_val_encoded)
accuracy = accuracy_score(Y_val, Y_predicted)
print(f"\nТочность на валидационном наборе: {accuracy:.2f}")


Точность на валидационном наборе: 0.89


In [54]:
#  Выбор всех признаков
all_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome', 'age', 'balance', 'previous', 'duration', 'pdays', 'campaign']
X_all = data[all_features]
y_all = data['y']

#  Разделение полных данных
X_train_full, X_temp_full, y_train_full, y_temp_full = train_test_split(X_all, y_all, test_size=0.4, random_state=42)
X_val_full, X_test_full, y_val_full, y_test_full = train_test_split(X_temp_full, y_temp_full, test_size=0.5, random_state=42)

#  One-hot кодирование
categorical_cols = X_all.select_dtypes(include=['object']).columns

encoder_full = OneHotEncoder(sparse_output=False, drop='first')
X_encoded_full = encoder_full.fit_transform(X_all[categorical_cols])
feature_names_full = encoder_full.get_feature_names_out(categorical_cols)
X_encoded_df = pd.DataFrame(X_encoded_full, columns=feature_names_full)
numerical_cols_full = X_all.select_dtypes(include=['number']).columns
X_final_data = pd.concat([X_encoded_df, X_all[numerical_cols_full].reset_index(drop=True)], axis=1)

# 23. Разделение final dataset
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X_final_data, y_all, test_size=0.2, random_state=42)
X_val_final, X_test_final, y_val_final, y_test_final = train_test_split(X_test_final, y_test_final, test_size=0.5, random_state=42)

# Инициализируем словарь для хранения разницы в точности
accuracy_diff = {}

# Исключаем каждый признак по одному и оцениваем изменение точности
for feature in all_features:
    # Исключаем текущий признак
    X_train_excluded = X_train_final.drop(columns=[feature])
    X_val_excluded = X_val_final.drop(columns=[feature])
    X_train_encoded = encoder.fit_transform(X_train_excluded)
    X_val_encoded = encoder.transform(X_val_excluded)
    # Обучаем модель на данных без текущего признака
    model_excluded = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_excluded.fit(X_train_encoded, y_train_final)
    
    # Рассчитываем точность модели
    accuracy_excluded = accuracy_score(y_val_final, model_excluded.predict(X_val_encoded))
    
    # Разница в точности
    accuracy_diff[feature] = accuracy - accuracy_excluded

# Сортируем разницу в точности по возрастанию
sorted_accuracy_diff = sorted(accuracy_diff.items(), key=lambda x: x[1])

# Выводим разницу всех признаков
sorted_accuracy_diff[:]

[('age', -0.010395930103959339),
 ('balance', -0.010174740101747348),
 ('previous', -0.00928998009289983),
 ('job', -0.00884760008847596),
 ('education', -0.0084052200840522),
 ('contact', -0.0084052200840522),
 ('campaign', -0.008184030081840321),
 ('marital', -0.007962840079628442),
 ('housing', -0.007299270072992692),
 ('pdays', -0.007299270072992692),
 ('month', -0.005972130059721303),
 ('poutcome', -0.005750940057509424),
 ('duration', -0.004202610042026045)]

In [55]:
# Вопрос 5.
print("Наименьшую разницу имеет признак marital")

Наименьшую разницу имеет признак marital


In [56]:
# Подбор гиперпараметра C
C_values_list = [0.01, 0.1, 1, 10]
best_accuracy_val = 0
optimal_C = None

for C_val in C_values_list:
    model_full = LogisticRegression(solver='liblinear', C=C_val, max_iter=1000, random_state=42)
    model_full.fit(X_train_final, y_train_final)
    y_predicted_val = model_full.predict(X_val_full)
    accuracy_score_val = accuracy_score(y_val_full, y_predicted_val)
    print(f"Точность для C = {C_val}: {accuracy_score_val:.3f}")

    if accuracy_score_val > best_accuracy_val:
        best_accuracy_val = accuracy_score_val
        optimal_C = C_val
# Вопрос 6.
print(f"\nНаилучшая точность: {best_accuracy_val:.3f} (C = {optimal_C})")

Точность для C = 0.01: 0.889
Точность для C = 0.1: 0.891
Точность для C = 1: 0.890
Точность для C = 10: 0.890

Наилучшая точность: 0.891 (C = 0.1)
