In [None]:
import pandas as pd

# Load the data
df = pd.read_csv('bank.csv', sep=';')

# 1. Show all column names
print("列名:")
print(df.columns)

# 2. Show the dtype of every column
print("\n各列のデータ型:")
print(df.dtypes)

# 3. Show the unique values of one column (using "job" as the example)
print("\n「job」列のユニークな値:")
print(df['job'].unique())

# 4. Show basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nデータセットの基本情報:")
df.info()

# 5. Show the first few rows
print("\n最初の5行のデータ:")
print(df.head())

# 6. Show descriptive statistics of the numeric columns
print("\n数値列の記述統計情報:")
print(df.describe())

# Show descriptive statistics of the categorical (object) columns
print("\nカテゴリ列の記述統計情報:")
print(df.describe(include='object'))





列名:
Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

各列のデータ型:
age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

「job」列のユニークな値:
['unemployed' 'services' 'management' 'blue-collar' 'self-employed'
 'technician' 'entrepreneur' 'admin.' 'student' 'housemaid' 'retired'
 'unknown']

データセットの基本情報:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, make_scorer
from scipy.stats import ttest_rel
# Load the data (semicolon-separated CSV)
df = pd.read_csv('bank.csv', sep=';')

# Feature-scaling helpers
def affine_transform(series):
    """Rescale ``series`` so its 5% quantile maps to -1 and its 95% quantile to +1.

    Values outside that quantile range end up outside [-1, 1]; nothing is clipped.
    """
    lower = series.quantile(0.05)
    upper = series.quantile(0.95)
    spread = upper - lower
    shifted = series - lower
    return 2 * shifted / spread - 1

def linear_transform(series):
    """Divide ``series`` by its own 95% quantile."""
    scale = series.quantile(0.95)
    scaled = series / scale
    return scaled

# Column-wise preprocessing, grouped by the kind of transformation applied.
# Original per-column notes, kept for traceability:
#   age u,cn | job c | marital c | education o | default o | balance u,cn |
#   housing o | loan o | contact u,c | day u,cn | duration cn | campaign u,n |
#   pdays del | previous u,n | poutcome u,c

# Quantile-based affine rescaling (5% quantile -> -1, 95% quantile -> +1).
for column in ('age', 'balance', 'day', 'duration'):
    df[column] = affine_transform(df[column])

# Division by the 95% quantile.
for column in ('campaign', 'previous'):
    df[column] = linear_transform(df[column])

# Label -> integer code mappings.
education_mapping = {'primary': 1, 'secondary': 2, 'tertiary': 3, 'unknown': 0}
default_mapping = {'no': 0, 'yes': 1}
housing_mapping = {'no': 0, 'yes': 1}
loan_mapping = {'no': 0, 'yes': 1}
for column, mapping in (
    ('education', education_mapping),
    ('default', default_mapping),
    ('housing', housing_mapping),
    ('loan', loan_mapping),
):
    df[column] = df[column].map(mapping)

# pdays is removed from the feature set.
df = df.drop(columns=['pdays'])

# One-hot encoding, one column per call and in this order, so the dummy
# columns are appended in the same sequence as before.
for column in ('job', 'marital', 'contact', 'month', 'poutcome'):
    df = pd.get_dummies(df, columns=[column], drop_first=True)

# Target labels: 'yes' -> 1, 'no' -> -1.
df['y'] = df['y'].map({'yes': 1, 'no': -1})

# 计算 PRBEP


# The target is the encoded 'y' column; the features are every other column.
y = df['y']
X = df.drop(columns=['y'])

# Hold out 30% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
def precision_recall_break_even_point(y_true, y_pred_proba):
    """Return the precision/recall break-even point (PRBEP) for the given scores.

    The precision-recall curve is computed from ``y_true`` and ``y_pred_proba``.
    The curve point where precision and recall are closest is selected, and the
    mean of precision and recall at that point is returned.
    """
    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
    closest = np.abs(precision - recall).argmin()
    return (precision[closest] + recall[closest]) / 2

def prbep_scorer(estimator, X, y):
    """Scorer with the ``(estimator, X, y)`` signature passed to GridSearchCV as ``scoring``.

    Scores the second column of ``estimator.predict_proba(X)`` against ``y``
    with the precision/recall break-even point.
    """
    positive_scores = estimator.predict_proba(X)[:, 1]
    return precision_recall_break_even_point(y, positive_scores)

# Hyper-parameter grid: 20 log-spaced C values crossed with both penalties.
param_grid = dict(
    C=np.logspace(-4, 4, 20),
    penalty=['l1', 'l2'],
    solver=['liblinear'],  # liblinear supports both L1 and L2
)

# Base estimator whose hyper-parameters are tuned.
logreg = LogisticRegression(random_state=0)

# 5-fold grid search scored by PRBEP, using all available cores.
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring=prbep_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(f"Best parameters: {best_params}")
print(f"Best PRBEP score: {best_score}")





Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best parameters: {'C': 0.615848211066026, 'penalty': 'l1', 'solver': 'liblinear'}
Best PRBEP score: 0.5589041095890411


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, make_scorer
from scipy.stats import ttest_ind

# Load the data (semicolon-separated CSV)
df = pd.read_csv('bank.csv', sep=';')

# Define the preprocessing helpers
def affine_transform(series):
    """Map the 5% quantile of ``series`` to -1 and its 95% quantile to +1 (no clipping)."""
    low = series.quantile(0.05)
    high = series.quantile(0.95)
    return series.sub(low).mul(2).div(high - low).sub(1)

def linear_transform(series):
    """Scale ``series`` so that its 95% quantile becomes 1."""
    return series.div(series.quantile(0.95))

# Preprocess the data
# Quantile-based rescaling of the numeric columns.
df['age'] = affine_transform(df['age'])
df['balance'] = affine_transform(df['balance'])
df['day'] = affine_transform(df['day'])
df['duration'] = affine_transform(df['duration'])
df['campaign'] = linear_transform(df['campaign'])
df['previous'] = linear_transform(df['previous'])

# Encode the remaining string columns. They were previously left as raw
# 'yes'/'no'/education labels, which LogisticRegression cannot be fitted on.
# The codes are the same ones used by the other preprocessing cells.
df['education'] = df['education'].map({'primary': 1, 'secondary': 2, 'tertiary': 3, 'unknown': 0})
yes_no_mapping = {'no': 0, 'yes': 1}
for column in ('default', 'housing', 'loan'):
    df[column] = df[column].map(yes_no_mapping)

# pdays is dropped, consistent with the other preprocessing cells in this notebook.
df = df.drop(columns=['pdays'])

# One-hot encode the nominal columns, dropping the first level of each.
df = pd.get_dummies(df, columns=['job', 'marital', 'contact', 'poutcome', 'month'], drop_first=True)

# Target labels: 'yes' -> 1, 'no' -> -1.
df['y'] = df['y'].map({'yes': 1, 'no': -1})

# Build the feature matrix and target here, so the experiment loop below uses
# this cell's data instead of whatever X / y an earlier cell left in the kernel.
X = df.drop(columns=['y'])
y = df['y']

# Precision/recall break-even point (PRBEP)
def calculate_prbep(y_true, y_pred):
    """Return the mean of precision and recall at the curve point where they are closest.

    ``y_true`` and ``y_pred`` are handed unchanged to ``precision_recall_curve``.
    """
    precision, recall, _thresholds = precision_recall_curve(y_true, y_pred)
    closest = np.abs(precision - recall).argmin()
    return (precision[closest] + recall[closest]) / 2

# Custom PRBEP score function
def prbep_scorer(y_true, y_pred):
    """Score function with the ``(y_true, y_pred)`` signature; delegates to ``calculate_prbep``."""
    score = calculate_prbep(y_true, y_pred)
    return score

# Cross-validated model selection followed by a PRBEP evaluation on the hold-out set
def evaluate_model(X_train, y_train, X_test, y_test, norm_type):
    """Tune ``C`` for a logistic regression with the given penalty and score it on the test split.

    Args:
        X_train, y_train: data used by the 5-fold grid search.
        X_test, y_test: hold-out data used for the returned score.
        norm_type: penalty passed to ``LogisticRegression`` (callers use 'l1' and 'l2').

    Returns:
        PRBEP of the refitted best estimator on the test split, or ``None`` when
        the grid search raised a ``ValueError`` (the error is printed).
    """
    def _proba_prbep(estimator, X_fold, y_fold):
        # Score each fold from the positive-class probability, the same quantity
        # used for the hold-out score below. make_scorer(prbep_scorer) would
        # instead feed hard predict() labels into the precision-recall curve.
        return prbep_scorer(y_fold, estimator.predict_proba(X_fold)[:, 1])

    model = LogisticRegression(penalty=norm_type, solver='saga', max_iter=1000)
    # NOTE(review): this grid spans C = 1 .. 1e10 only; the later cell searches
    # [0.01, 0.1, 1, 10, 100] instead — confirm which range is intended.
    param_grid = {'C': np.logspace(0, 10, 10)}
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=_proba_prbep, error_score='raise')

    try:
        grid_search.fit(X_train, y_train)
    except ValueError as e:
        print(f"Error during GridSearchCV: {e}")
        return None

    best_model = grid_search.best_estimator_
    y_probs = best_model.predict_proba(X_test)[:, 1]  # probability of the positive class
    prbep = calculate_prbep(y_test, y_probs)
    return prbep

# Storage for the per-split PRBEP values
prbep_l1_scores = []
prbep_l2_scores = []

# Repeat the experiment 10 times (X and y must already be defined at this point)
for i in range(10):
    # Seed every split with the repetition index so that a rerun reproduces
    # the same ten partitions; an unseeded split changes on every execution.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=i)

    # PRBEP with L1 regularisation
    prbep_l1 = evaluate_model(X_train, y_train, X_test, y_test, 'l1')
    if prbep_l1 is not None:
        prbep_l1_scores.append(prbep_l1)

    # PRBEP with L2 regularisation
    prbep_l2 = evaluate_model(X_train, y_train, X_test, y_test, 'l2')
    if prbep_l2 is not None:
        prbep_l2_scores.append(prbep_l2)

# Independent-samples t-test without assuming equal variances.
# NOTE(review): both penalties are scored on the same splits, so a paired test
# (scipy.stats.ttest_rel) may be the more appropriate comparison — confirm.
t_stat, p_value = ttest_ind(prbep_l1_scores, prbep_l2_scores, equal_var=False)

print(f'PRBEP L1 scores: {prbep_l1_scores}')
print(f'PRBEP L2 scores: {prbep_l2_scores}')
print(f'Independent T-statistic: {t_stat:.4f}')
print(f'Independent P-value: {p_value:.4f}')


KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, make_scorer

# Load the data (semicolon-separated CSV)
df = pd.read_csv('bank.csv', sep=';')

# Define the preprocessing helpers
def affine_transform(series):
    """Affine rescaling anchored on the 5% and 95% quantiles of ``series``.

    The 5% quantile is sent to -1 and the 95% quantile to +1; values are not clipped.
    """
    # Both quantiles in one call; unpacking yields them in the requested order.
    q_low, q_high = series.quantile([0.05, 0.95])
    width = q_high - q_low
    return 2 * (series - q_low) / width - 1

def linear_transform(series):
    """Return ``series`` divided by its 95% quantile."""
    return series / series.quantile(0.95)

# Preprocess the data.
# Original per-column notes, kept for traceability:
#   age u,cn | job c | marital c | education o | default o | balance u,cn |
#   housing o | loan o | contact u,c | day u,cn | duration cn | campaign u,n |
#   pdays del | previous u,n | poutcome u,c

# Numeric columns and the rescaling helper applied to each of them.
numeric_rescalers = {
    'age': affine_transform,
    'balance': affine_transform,
    'day': affine_transform,
    'duration': affine_transform,
    'campaign': linear_transform,
    'previous': linear_transform,
}
for column, rescale in numeric_rescalers.items():
    df[column] = rescale(df[column])

# String labels replaced by integer codes.
education_mapping = {'primary': 1, 'secondary': 2, 'tertiary': 3, 'unknown': 0}
default_mapping = {'no': 0, 'yes': 1}
housing_mapping = {'no': 0, 'yes': 1}
loan_mapping = {'no': 0, 'yes': 1}
label_codes = {
    'education': education_mapping,
    'default': default_mapping,
    'housing': housing_mapping,
    'loan': loan_mapping,
}
for column, codes in label_codes.items():
    df[column] = df[column].map(codes)

# pdays is removed from the feature set.
df = df.drop(columns=['pdays'])

# One-hot encode the nominal columns one at a time, in this order, so the
# dummy columns are appended in the same sequence as before.
for column in ['job', 'marital', 'contact', 'month', 'poutcome']:
    df = pd.get_dummies(df, columns=[column], drop_first=True)

# Target labels: 'yes' -> 1, 'no' -> -1.
df['y'] = df['y'].map({'yes': 1, 'no': -1})

# Compute the PRBEP (precision/recall break-even point)
def calculate_prbep(y_true, y_pred):
    """Locate where precision and recall are closest on the PR curve and return their mean there."""
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    gap = np.abs(precision - recall)
    idx = int(np.argmin(gap))
    p_at, r_at = precision[idx], recall[idx]
    return (p_at + r_at) / 2

# Custom PRBEP score function
def prbep_scorer(y_true, y_pred):
    """Thin ``(y_true, y_pred)`` wrapper that forwards both arguments to ``calculate_prbep``."""
    return calculate_prbep(y_true=y_true, y_pred=y_pred)

# Cross-validated model selection followed by a PRBEP evaluation on the hold-out set
def evaluate_model(X_train, y_train, X_test, y_test, norm_type):
    """Tune ``C`` for a logistic regression with the given penalty and score it on the test split.

    Args:
        X_train, y_train: data used by the 5-fold grid search.
        X_test, y_test: hold-out data used for the returned score.
        norm_type: penalty passed to ``LogisticRegression`` (callers use 'l1' and 'l2').

    Returns:
        PRBEP of the refitted best estimator on the test split, or ``None`` when
        the grid search raised a ``ValueError`` (the error is printed).
    """
    def _proba_prbep(estimator, X_fold, y_fold):
        # Score each fold from the positive-class probability, the same quantity
        # used for the hold-out score below. make_scorer(prbep_scorer) would
        # instead feed hard predict() labels into the precision-recall curve.
        return prbep_scorer(y_fold, estimator.predict_proba(X_fold)[:, 1])

    model = LogisticRegression(penalty=norm_type, solver='saga', max_iter=1000)
    param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=_proba_prbep, error_score='raise')

    try:
        grid_search.fit(X_train, y_train)
    except ValueError as e:
        print(f"Error during GridSearchCV: {e}")
        return None

    best_model = grid_search.best_estimator_
    y_probs = best_model.predict_proba(X_test)[:, 1]  # probability of the positive class
    prbep = calculate_prbep(y_test, y_probs)
    return prbep



# Storage for the per-split PRBEP values
prbep_l1_scores = []
prbep_l2_scores = []

# Feature matrix and target
X = df.drop(columns=['y'])
y = df['y']

# Evaluate L1 and L2 regularisation on ten seeded, stratified 70/30 splits
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=i)

    # PRBEP with L1 regularisation
    prbep_l1 = evaluate_model(X_train, y_train, X_test, y_test, 'l1')
    if prbep_l1 is not None:
        prbep_l1_scores.append(prbep_l1)

    # PRBEP with L2 regularisation
    prbep_l2 = evaluate_model(X_train, y_train, X_test, y_test, 'l2')
    if prbep_l2 is not None:
        prbep_l2_scores.append(prbep_l2)

# Report the collected scores
print(f'PRBEP L1 scores: {prbep_l1_scores}')
print(f'PRBEP L2 scores: {prbep_l2_scores}')

# Independent-samples t-test without assuming equal variances
from scipy.stats import ttest_ind

t_stat, p_value = ttest_ind(prbep_l1_scores, prbep_l2_scores, equal_var=False)

# The statistic was computed but never shown, although the recorded output of
# this cell contains the line; report it together with the p-value.
print(f'Independent T-statistic: {t_stat:.4f}')
print(f'Independent P-value: {p_value:.4f}')


PRBEP L1 scores: [0.5064102564102564, 0.5192307692307693, 0.5, 0.5448717948717948, 0.5512820512820513, 0.5512820512820513, 0.532051282051282, 0.5576923076923077, 0.532051282051282, 0.532051282051282]
PRBEP L2 scores: [0.5128205128205128, 0.5192307692307693, 0.4935897435897436, 0.5448717948717948, 0.5512820512820513, 0.5512820512820513, 0.532051282051282, 0.5448717948717948, 0.5192307692307693, 0.5192307692307693]
Independent T-statistic: 0.4453
Independent P-value: 0.6614


In [None]:
pip install ucimlrepo



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, make_scorer

from ucimlrepo import fetch_ucirepo

# Download the UCI dataset with id=222 through ucimlrepo
bank_marketing = fetch_ucirepo(id=222)

# Features and targets (as pandas dataframes)
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Work on a copy so the fetched feature frame itself is not modified
df = X.copy()

# Define the preprocessing helpers
def affine_transform(series):
    """Rescale ``series`` linearly: 5% quantile -> -1, 95% quantile -> +1, without clipping."""
    p05 = series.quantile(0.05)
    p95 = series.quantile(0.95)
    doubled_offset = 2 * (series - p05)
    return doubled_offset / (p95 - p05) - 1

def linear_transform(series):
    """Express every value of ``series`` as a fraction of the series' 95% quantile."""
    p95 = series.quantile(0.95)
    return series / p95

# Preprocess the data

# The recorded output of this cell shows NaN in 'education' after the mapping,
# i.e. this copy of the data has missing entries where the mapping expects a
# label. Presumably these correspond to the CSV's 'unknown' label — confirm.
# They are filled before encoding so that they
#   * map to the 'unknown' education code instead of NaN, and
#   * get their own dummy level instead of an all-False row, which is
#     indistinguishable from the level removed by drop_first=True.
categorical_columns = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
df[categorical_columns] = df[categorical_columns].fillna('unknown')

# 1. age: u, cn
df['age'] = affine_transform(df['age'])

# 2. One-hot encode the categorical variable 'job'
df = pd.get_dummies(df, columns=['job'], drop_first=True)

# 3. One-hot encode the categorical variable 'marital'
df = pd.get_dummies(df, columns=['marital'], drop_first=True)

# 4. education: o
education_mapping = {'primary': 1, 'secondary': 2, 'tertiary': 3, 'unknown': 0}
df['education'] = df['education'].map(education_mapping)

# 5. default: o
default_mapping = {'no': 0, 'yes': 1}
df['default'] = df['default'].map(default_mapping)

# 6. balance: u, cn
df['balance'] = affine_transform(df['balance'])

# 7. housing: o
housing_mapping = {'no': 0, 'yes': 1}
df['housing'] = df['housing'].map(housing_mapping)

# 8. loan: o
loan_mapping = {'no': 0, 'yes': 1}
df['loan'] = df['loan'].map(loan_mapping)

# 9. contact: u, c (month is one-hot encoded here as well)
df = pd.get_dummies(df, columns=['contact'], drop_first=True)
df = pd.get_dummies(df, columns=['month'], drop_first=True)

# 10. day: u, cn (the column is named 'day_of_week' in this copy of the data)
df['day_of_week'] = affine_transform(df['day_of_week'])

# 11. duration: cn
df['duration'] = affine_transform(df['duration'])

# 12. campaign: u, n
df['campaign'] = linear_transform(df['campaign'])

# 13. pdays: del
df = df.drop(columns=['pdays'])

# 14. previous: u, n
df['previous'] = linear_transform(df['previous'])

# 15. poutcome: u, c
df = pd.get_dummies(df, columns=['poutcome'], drop_first=True)

print(df.head())

# Split the preprocessed frame `df`, not the raw `X`: `X` still holds the
# unscaled numbers and unencoded strings, so splitting it discards every
# preprocessing step above.
# `y` is a one-column DataFrame of labels; take that column as a 1-D Series and
# encode it like the CSV-based cells do ('yes' -> 1, 'no' -> -1).
# NOTE(review): assumes the labels are the strings 'yes'/'no' — confirm.
y_encoded = y.iloc[:, 0].map({'yes': 1, 'no': -1})

X_train, X_test, y_train, y_test = train_test_split(
    df,
    y_encoded,
    test_size=0.3,
    stratify=y_encoded,
    random_state=0
)

# Compute the PRBEP (precision/recall break-even point)
def calculate_prbep(y_true, y_pred):
    """Mean of precision and recall at the PR-curve point where the two differ least."""
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    nearest = np.argmin(np.absolute(precision - recall))
    pair_sum = precision[nearest] + recall[nearest]
    return pair_sum / 2

# Custom PRBEP score function
def prbep_scorer(y_true, y_pred):
    """Return ``calculate_prbep(y_true, y_pred)``; exists to provide a score-function name."""
    result = calculate_prbep(y_true, y_pred)
    return result

# 交叉验证和 PRBEP 计算
from sklearn.metrics import make_scorer, precision_recall_curve, average_precision_score

def evaluate_model(X_train, y_train, X_test, y_test, norm_type):
    """Fit a regularised logistic regression and return its PRBEP on the test split.

    Args:
        X_train, y_train: data the model is fitted on.
        X_test, y_test: data the score is computed on.
        norm_type: regularisation penalty, either 'l1' or 'l2'.

    Returns:
        The precision/recall break-even point computed by ``calculate_prbep``
        from the positive-class probabilities on the test split.

    Raises:
        ValueError: if ``norm_type`` is neither 'l1' nor 'l2'.
    """
    if norm_type not in ('l1', 'l2'):
        raise ValueError("Invalid normalization type")

    # The undefined `SomeModel` placeholder is replaced by the estimator this
    # notebook uses elsewhere; the saga solver supports both penalties.
    model = LogisticRegression(penalty=norm_type, solver='saga', max_iter=1000)

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Return the actual break-even point. The previous code returned
    # average_precision_score, a different metric, under the name PRBEP.
    return calculate_prbep(y_test, y_pred_proba)

# Evaluate the L2-regularised model on the hold-out split and report its score
prbep_l2 = evaluate_model(X_train, y_train, X_test, y_test, 'l2')
print(f'PRBEP with L2 normalization: {prbep_l2:.4f}')
# Same evaluation with the L1 penalty
prbep_l1 = evaluate_model(X_train, y_train, X_test, y_test, 'l1')
print(f'PRBEP with L1 normalization: {prbep_l1:.4f}')

      age  education  default   balance  housing  loan  day_of_week  duration  \
0  0.9375        3.0        0 -0.220539        1     0    -0.846154 -0.368715   
1  0.0625        2.0        0 -0.932323        1     0    -0.846154 -0.675978   
2 -0.6250        2.0        0 -0.941414        1     1    -0.846154 -0.885475   
3  0.2500        NaN        0 -0.435017        1     0    -0.846154 -0.840782   
4 -0.6250        NaN        0 -0.941751        0     0    -0.846154 -0.544693   

   campaign  previous  ...  month_jan  month_jul  month_jun  month_mar  \
0     0.125       0.0  ...      False      False      False      False   
1     0.125       0.0  ...      False      False      False      False   
2     0.125       0.0  ...      False      False      False      False   
3     0.125       0.0  ...      False      False      False      False   
4     0.125       0.0  ...      False      False      False      False   

   month_may  month_nov  month_oct  month_sep  poutcome_other  \
0  

NameError: name 'SomeModel' is not defined

In [None]:
pip install ucimlrepo


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, make_scorer
# fetch dataset
statlog_australian_credit_approval = fetch_ucirepo(id=143)

# data (as pandas dataframes)
X = statlog_australian_credit_approval.data.features
# The targets arrive as a one-column DataFrame. Passing that to scikit-learn
# emits a DataConversionWarning on every fit (see the repeated
# `column_or_1d` lines in the recorded output), so take the column as a
# 1-D Series instead.
y = statlog_australian_credit_approval.data.targets.iloc[:, 0]

# Stratified 70/30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0
)

# Compute the PRBEP (precision/recall break-even point)
def calculate_prbep(y_true, y_pred):
    """Average precision and recall at the PR-curve index that minimises ``|precision - recall|``."""
    prec, rec, _ = precision_recall_curve(y_true, y_pred)
    best = (np.abs(prec - rec)).argmin()
    total = prec[best] + rec[best]
    return total / 2

# Custom PRBEP score function
def prbep_scorer(y_true, y_pred):
    """``(y_true, y_pred)`` score function that simply evaluates ``calculate_prbep`` on its inputs."""
    value = calculate_prbep(y_true, y_pred)
    return value

# Cross-validated model selection followed by a PRBEP evaluation on the hold-out set
def evaluate_model(X_train, y_train, X_test, y_test, norm_type):
    """Tune ``C`` for a logistic regression with the given penalty and score it on the test split.

    Args:
        X_train, y_train: data used by the 5-fold grid search.
        X_test, y_test: hold-out data used for the returned score.
        norm_type: penalty passed to ``LogisticRegression`` (callers use 'l1' and 'l2').

    Returns:
        PRBEP of the refitted best estimator on the test split, or ``None`` when
        the grid search raised a ``ValueError`` (the error is printed).
    """
    def _proba_prbep(estimator, X_fold, y_fold):
        # Score each fold from the positive-class probability, the same quantity
        # used for the hold-out score below. make_scorer(prbep_scorer) would
        # instead feed hard predict() labels into the precision-recall curve.
        return prbep_scorer(y_fold, estimator.predict_proba(X_fold)[:, 1])

    model = LogisticRegression(penalty=norm_type, solver='saga', max_iter=10000)
    param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=_proba_prbep, error_score='raise')

    try:
        grid_search.fit(X_train, y_train)
    except ValueError as e:
        print(f"Error during GridSearchCV: {e}")
        return None

    best_model = grid_search.best_estimator_
    y_probs = best_model.predict_proba(X_test)[:, 1]  # probability of the positive class
    print(f"Best model for {norm_type} regularization: {best_model}")
    prbep = calculate_prbep(y_test, y_probs)
    return prbep

# evaluate_model returns None when the grid search fails (it prints the error
# itself). Formatting None with ':.4f' would raise a TypeError, so each score
# is only reported when one was produced.
prbep_l2 = evaluate_model(X_train, y_train, X_test, y_test, 'l2')
if prbep_l2 is not None:
    print(f'PRBEP with L2 normalization: {prbep_l2:.4f}')
prbep_l1 = evaluate_model(X_train, y_train, X_test, y_test, 'l1')
if prbep_l1 is not None:
    print(f'PRBEP with L1 normalization: {prbep_l1:.4f}')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best model for l2 regularization: LogisticRegression(C=0.01, max_iter=10000, solver='saga')
PRBEP with L2 normalization: 0.6739


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Best model for l1 regularization: LogisticRegression(C=1, max_iter=10000, penalty='l1', solver='saga')
PRBEP with L1 normalization: 0.6739
