In [ ]:
# 查看当前kernel下已安装的包  list packages
!pip list --format=columns

In [ ]:
# 安装拓展包
pip install some_package --user

In [ ]:
# 举例：安装sklearn某依赖库
!pip install sklearn2pmml --user

In [ ]:
# 举例：绘图案例 an example of matplotlib
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import jn
from IPython.display import display, clear_output
import time
x = np.linspace(0,5)
f, ax = plt.subplots()
ax.set_title("Bessel functions")

for n in range(1,10):
    time.sleep(1)
    ax.plot(x, jn(x,n))
    clear_output(wait=True)
    display(f)

# close the figure at the end, so we don't get a duplicate
# of the last plot
plt.close()


In [ ]:
import pandas as pd
import numpy as np

# Read the training and test tables from CSV files in the working directory.
df, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Display how many rows carry each value of the `subscribe` column.
df['subscribe'].value_counts()

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Left-closed bins (right=False below): [0, 143), [143, 353), [353, 1873), [1873, 5149).
# NOTE(review): a duration equal to the last edge (5149) falls outside every bin
# and is not counted - confirm this is intended.
bins = [0, 143, 353, 1873, 5149]
# .copy() gives df1 its own data, so later column assignments on df1 operate on
# an independent frame instead of a filtered selection of df.
df1 = df[df['subscribe'] == 'yes'].copy()
binning = pd.cut(df1['duration'], bins, right=False)
# Series.value_counts replaces the deprecated top-level pd.value_counts.
# The result is deliberately not called `time`, which would shadow the
# standard-library `time` module.
duration_counts = binning.value_counts().sort_index()
total = duration_counts.sum()  # computed once instead of on every loop iteration

# Visualisation: one bar per duration bin, annotated with its share of the total.
fig = plt.figure(figsize=(6, 2), dpi=120)
sns.barplot(x=duration_counts.index, y=duration_counts, color='royalblue')

for x_loc, count in enumerate(duration_counts.values):
    plt.text(x_loc, count + 2, '{:.1f}%'.format(count / total * 100), ha='center', va='bottom', fontsize=8)

plt.xticks(fontsize=8)
plt.yticks([])
plt.ylabel('')
plt.title('duration_yes', size=8)
sns.despine(left=True)
plt.show()

In [ ]:
# Split the columns of df into numeric and categorical (object-dtype) features.
Nu_feature = list(df.select_dtypes(exclude=['object']).columns)
Ca_feature = list(df.select_dtypes(include=['object']).columns)

# Compare the train and test distributions of every numeric feature.
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Keep the 4x4 grid while it is big enough; add rows if there are more than 16 features.
numeric_ncols = 4
numeric_nrows = max(4, math.ceil(len(Nu_feature) / numeric_ncols))
plt.figure(figsize=(20, 15))
for i, col in enumerate(Nu_feature, start=1):
    ax = plt.subplot(numeric_nrows, numeric_ncols, i)
    sns.kdeplot(df[col], color='red', ax=ax)
    sns.kdeplot(test[col], color='cyan', ax=ax)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.legend(['train', 'test'])
plt.show()

# Inspect the categorical features: train scatter plots in the upper half of the
# grid, the matching test scatter plots in the lower half.
# Filtering (instead of list.remove) does not raise when 'subscribe' is no longer
# an object column, e.g. when this cell is re-run after the target was encoded.
Ca_feature = [col for col in Ca_feature if col != 'subscribe']
col1 = Ca_feature
cat_ncols = 5
# Two rows per half reproduces the 4x5 layout for up to 10 features; grows beyond that.
rows_per_half = max(2, math.ceil(len(col1) / cat_ncols))
test_offset = rows_per_half * cat_ncols  # index of the last slot reserved for train plots
plt.figure(figsize=(20, 10))
for j, col in enumerate(col1, start=1):
    plt.subplot(2 * rows_per_half, cat_ncols, j)
    plt.scatter(x=range(len(df)), y=df[col], color='red')
    plt.title(col)
for k, col in enumerate(col1, start=test_offset + 1):
    plt.subplot(2 * rows_per_half, cat_ncols, k)
    plt.scatter(x=range(len(test)), y=test[col], color='cyan')
    plt.title(col)
plt.subplots_adjust(wspace=0.4, hspace=0.3)
plt.show()

In [ ]:
from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
cols = Ca_feature
for m in cols:
    # Fit once on the train and test values together, so a given category gets
    # the same integer code in both frames. Fitting each frame separately can
    # assign different codes to the same category when their value sets differ.
    lb.fit(pd.concat([df[m], test[m]], ignore_index=True))
    df[m] = lb.transform(df[m])
    test[m] = lb.transform(test[m])

# Binary target: 'no' -> 0, 'yes' -> 1.
df['subscribe'] = df['subscribe'].replace(['no', 'yes'], [0, 1])

# Pairwise correlation of all (now numeric) columns, drawn as a heat map.
correlation_matrix = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, vmax=0.9, linewidths=0.05, cmap="RdGy")
plt.show()
# Author's note: several of the more strongly correlated features also rank
# prominently in the model's feature output.

In [ ]:
from sklearn import preprocessing

# Data preprocessing: ordinal-encode the listed columns of df1.
# NOTE(review): df1 was built from the rows with subscribe == 'yes' only -
# confirm that encoding this subset (rather than the full df) is intended.
ordinal_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                'contact', 'month', 'day_of_week', 'poutcome', 'subscribe']
encoder = preprocessing.OrdinalEncoder()
# Learn the category -> code mapping and apply it in one step.
x = encoder.fit_transform(df1[ordinal_cols].values)
df1[ordinal_cols] = x
df1.head()

In [ ]:
import numpy as np  
  
# Min-max normalisation of the X data
def normalize_data(data):
    """Rescale ``data`` column-wise to the [0, 1] interval (min-max scaling).

    Parameters
    ----------
    data : array-like
        Values reduced along axis 0 with ``np.min`` / ``np.max``
        (for example a 2-D NumPy array or a numeric DataFrame).

    Returns
    -------
    tuple
        ``(normalized_data, data_min, data_max)`` where ``data_min`` and
        ``data_max`` are the axis-0 minima and maxima used for the scaling.
    """
    # Minimum and maximum along axis 0
    data_min = np.min(data, axis=0)
    data_max = np.max(data, axis=0)
    data_range = data_max - data_min
    # A constant column has a zero range; dividing by it would yield NaN (0/0).
    # Dividing by 1 instead maps every value of such a column to 0.
    safe_range = np.where(data_range == 0, 1, data_range)
    # Scale the data into [0, 1]
    normalized_data = (data - data_min) / safe_range
    return normalized_data, data_min, data_max
  
# Min-max scale df1, keeping the per-column minima and maxima that were used.
df1, X_min, X_max = normalize_data(df1)
normalized_X = df1

# Persist the scaled table, then display it as the cell output.
df1.to_csv("train11.csv", index=False)
df1

In [ ]:
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd
import numpy as np

# Data preparation
X = df.drop(columns=['id', 'subscribe'])
Y = df['subscribe']
# errors='ignore' keeps the cell re-runnable: on a second run 'id' is already gone.
test = test.drop(columns='id', errors='ignore')

# Hold-out split of the training data.
# NOTE(review): these four variables are not used by the cross-validation below.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

# Model definition: one parameter set shared by the reference model and every fold model.
lgbm_params = dict(n_estimators=662, learning_rate=0.01, boosting_type='gbdt',
                   objective='binary', max_depth=-1,
                   random_state=2020, metric='auc')
gbm = LGBMClassifier(**lgbm_params)

# Cross-validation: train one model per fold, score it on the held-out part and
# collect its predicted probabilities for the test set.
result1 = []
mean_score1 = 0
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=2022)
for train_index, test_index in kf.split(X):
    x_train_fold, x_val_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_val_fold = Y.iloc[train_index], Y.iloc[test_index]

    gbm_fold = LGBMClassifier(**lgbm_params)
    gbm_fold.fit(x_train_fold, y_train_fold)

    y_pred1 = gbm_fold.predict_proba(x_val_fold)[:, 1]
    fold_auc = roc_auc_score(y_val_fold, y_pred1)  # computed once, then printed and averaged
    print('验证集AUC: {}'.format(fold_auc))
    mean_score1 += fold_auc / n_folds

    y_pred_final1 = gbm_fold.predict_proba(test)[:, 1]
    result1.append(y_pred_final1)

# Model evaluation: mean validation AUC, then average the fold predictions,
# threshold them at 0.5 and write the resulting labels to a CSV file.
print('mean 验证集auc: {}'.format(mean_score1))
cat_pre1 = np.mean(result1, axis=0)
ret1 = pd.DataFrame(cat_pre1, columns=['subscribe'])
ret1['subscribe'] = np.where(ret1['subscribe'] > 0.5, 'yes', 'no')
ret1.to_csv('GBM预测.csv', index=False)

In [ ]:
import shap
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer

# Load the example dataset.
# NOTE(review): this cell rebinds X and gbm, replacing the objects of the same
# name created by the training cell - confirm that is acceptable.
data = load_breast_cancer()
X = data.data
y = data.target

# Fit a LightGBM model with default settings
gbm = lgb.LGBMClassifier()
gbm.fit(X, y)

# Explain the model with SHAP
explainer = shap.TreeExplainer(gbm)
shap_values = explainer.shap_values(X)
# X is a bare NumPy array without column names, so pass the dataset's feature
# names explicitly; otherwise the bars only get generic positional labels.
shap.summary_plot(shap_values, X, feature_names=list(data.feature_names),
                  plot_type="bar", max_display=20)