# 环境配置

In [202]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text, export_graphviz

# 预测大类，铅钡玻璃，高钾玻璃

## 函数准备

In [203]:
def DecisionTreeClassify(X_train, y_train, X_test, names):
    # 创建实例
    clf = DecisionTreeClassifier(random_state=42)
    # 训练模型
    clf.fit(X_train, y_train)
    # 进行预测
    y_pred = clf.predict(X_test)
    # 绘制树结构
    rules = export_text(clf, feature_names=names)
    return clf, y_pred, rules


## 读取训练集数据

In [204]:
train_data = pd.read_excel('merged_excel.xlsx',sheet_name=1)
feature_lst =  ['二氧化硅(SiO2)', '氧化钠(Na2O)', '氧化钾(K2O)', '氧化钙(CaO)',  
              '氧化镁(MgO)', '氧化铝(Al2O3)', '氧化铁(Fe2O3)', '氧化铜(CuO)', '氧化铅(PbO)',  
              '氧化钡(BaO)', '五氧化二磷(P2O5)', '氧化锶(SrO)', '氧化锡(SnO2)', '二氧化硫(SO2)']

train_Notwind = train_data[train_data['表面风化'] == 0]
train_wind = train_data[train_data['表面风化'] == 1]

train_Notwind_X = train_Notwind[feature_lst]
train_Notwind_y = train_Notwind['类型']

train_wind_X = train_wind[feature_lst]
train_wind_y = train_wind['类型']


## 读取待预测数据

In [205]:
pre_data = pd.read_excel('E:\python\GitHub_teamProject\Problem_2022C\Question\附件.xlsx',sheet_name=2)

In [206]:
pre_data = pre_data.fillna(0)
pre_Notwind = pre_data[pre_data['表面风化']== '无风化']
pre_Notwind_X = pre_Notwind[feature_lst]


In [207]:
pre_wind = pre_data[pre_data['表面风化']== '风化']
pre_wind_X = pre_wind[feature_lst]

## 进行XGboost预测

In [208]:
import xgboost as xgb

# 未风化XGBoost预测
# 初始化模型
model_nw = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')   
# 训练模型  
model_nw.fit(train_Notwind_X, train_Notwind_y)  
# 进行预测  
y_pred_nw = model_nw.predict(pre_Notwind_X)
# 还原数据
pre_Notwind.loc[:,'XGboost预测_玻璃类型'] = list(y_pred_nw)


Parameters: { "use_label_encoder" } are not used.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_Notwind.loc[:,'XGboost预测_玻璃类型'] = list(y_pred_nw)


In [209]:
# 风化XGBoost预测
# 初始化模型
model_w = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')   
# 训练模型  
model_w.fit(train_wind_X, train_wind_y)  
# 进行预测  
y_pred_w = model_nw.predict(pre_wind_X)
# 还原数据
pre_wind.loc[:,'XGboost预测_玻璃类型'] = list(y_pred_w)


Parameters: { "use_label_encoder" } are not used.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_wind.loc[:,'XGboost预测_玻璃类型'] = list(y_pred_w)


## 进行决策树预测

In [210]:
clf_nw, y_pred_nw, rules_nw = DecisionTreeClassify(train_Notwind_X, train_Notwind_y, pre_Notwind_X, feature_lst)
clf_w, y_pred_w, rules_w = DecisionTreeClassify(train_wind_X, train_wind_y, pre_wind_X, feature_lst)

In [211]:
pre_Notwind.loc[:,'决策树预测_玻璃类型'] = list(y_pred_nw)
pre_wind.loc[:,'决策树预测_玻璃类型'] = list(y_pred_w)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_Notwind.loc[:,'决策树预测_玻璃类型'] = list(y_pred_nw)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_wind.loc[:,'决策树预测_玻璃类型'] = list(y_pred_w)


In [212]:
print(rules_nw)
print(rules_w)

|--- 氧化钡(BaO) <= 3.18
|   |--- class: 1
|--- 氧化钡(BaO) >  3.18
|   |--- class: 0

|--- 二氧化硅(SiO2) <= 80.67
|   |--- class: 0
|--- 二氧化硅(SiO2) >  80.67
|   |--- class: 1



In [213]:
pre_Notwind.head()

Unnamed: 0,文物编号,表面风化,二氧化硅(SiO2),氧化钠(Na2O),氧化钾(K2O),氧化钙(CaO),氧化镁(MgO),氧化铝(Al2O3),氧化铁(Fe2O3),氧化铜(CuO),氧化铅(PbO),氧化钡(BaO),五氧化二磷(P2O5),氧化锶(SrO),氧化锡(SnO2),二氧化硫(SO2),XGboost预测_玻璃类型,决策树预测_玻璃类型
0,A1,无风化,78.45,0.0,0.0,6.08,1.86,7.23,2.15,2.11,0.0,0.0,1.06,0.03,0.0,0.51,1,1
2,A3,无风化,31.95,0.0,1.36,7.19,0.81,2.93,7.06,0.21,39.58,4.69,2.68,0.52,0.0,0.0,0,0
3,A4,无风化,35.47,0.0,0.79,2.89,1.05,7.07,6.45,0.96,24.28,8.31,8.45,0.28,0.0,0.0,0,0
7,A8,无风化,51.12,0.0,0.23,0.89,0.0,2.12,0.0,9.01,21.24,11.34,1.46,0.31,0.0,2.26,0,0


In [214]:
pre_wind.head()

Unnamed: 0,文物编号,表面风化,二氧化硅(SiO2),氧化钠(Na2O),氧化钾(K2O),氧化钙(CaO),氧化镁(MgO),氧化铝(Al2O3),氧化铁(Fe2O3),氧化铜(CuO),氧化铅(PbO),氧化钡(BaO),五氧化二磷(P2O5),氧化锶(SrO),氧化锡(SnO2),二氧化硫(SO2),XGboost预测_玻璃类型,决策树预测_玻璃类型
1,A2,风化,37.75,0.0,0.0,7.63,0.0,2.33,0.0,0.0,34.3,0.0,14.27,0.0,0.0,0.0,0,0
4,A5,风化,64.29,1.2,0.37,1.64,2.34,12.75,0.81,0.94,12.23,2.16,0.19,0.21,0.49,0.0,0,0
5,A6,风化,93.17,0.0,1.35,0.64,0.21,1.52,0.27,1.73,0.0,0.0,0.21,0.0,0.0,0.0,1,1
6,A7,风化,90.83,0.0,0.98,1.12,0.0,5.06,0.24,1.17,0.0,0.0,0.13,0.0,0.0,0.11,1,1


In [215]:
merged_PreKind = pd.concat([pre_wind,pre_Notwind])

In [216]:
merged_PreKind = merged_PreKind.sort_index()
merged_PreKind

Unnamed: 0,文物编号,表面风化,二氧化硅(SiO2),氧化钠(Na2O),氧化钾(K2O),氧化钙(CaO),氧化镁(MgO),氧化铝(Al2O3),氧化铁(Fe2O3),氧化铜(CuO),氧化铅(PbO),氧化钡(BaO),五氧化二磷(P2O5),氧化锶(SrO),氧化锡(SnO2),二氧化硫(SO2),XGboost预测_玻璃类型,决策树预测_玻璃类型
0,A1,无风化,78.45,0.0,0.0,6.08,1.86,7.23,2.15,2.11,0.0,0.0,1.06,0.03,0.0,0.51,1,1
1,A2,风化,37.75,0.0,0.0,7.63,0.0,2.33,0.0,0.0,34.3,0.0,14.27,0.0,0.0,0.0,0,0
2,A3,无风化,31.95,0.0,1.36,7.19,0.81,2.93,7.06,0.21,39.58,4.69,2.68,0.52,0.0,0.0,0,0
3,A4,无风化,35.47,0.0,0.79,2.89,1.05,7.07,6.45,0.96,24.28,8.31,8.45,0.28,0.0,0.0,0,0
4,A5,风化,64.29,1.2,0.37,1.64,2.34,12.75,0.81,0.94,12.23,2.16,0.19,0.21,0.49,0.0,0,0
5,A6,风化,93.17,0.0,1.35,0.64,0.21,1.52,0.27,1.73,0.0,0.0,0.21,0.0,0.0,0.0,1,1
6,A7,风化,90.83,0.0,0.98,1.12,0.0,5.06,0.24,1.17,0.0,0.0,0.13,0.0,0.0,0.11,1,1
7,A8,无风化,51.12,0.0,0.23,0.89,0.0,2.12,0.0,9.01,21.24,11.34,1.46,0.31,0.0,2.26,0,0


In [217]:
lst = ['文物编号','表面风化','决策树预测_玻璃类型','XGboost预测_玻璃类型']
result_kind =  merged_PreKind[lst]
result_kind

Unnamed: 0,文物编号,表面风化,决策树预测_玻璃类型,XGboost预测_玻璃类型
0,A1,无风化,1,1
1,A2,风化,0,0
2,A3,无风化,0,0
3,A4,无风化,0,0
4,A5,风化,0,0
5,A6,风化,1,1
6,A7,风化,1,1
7,A8,无风化,0,0


In [218]:
result_kind.to_latex()

'\\begin{tabular}{lllrr}\n\\toprule\n & 文物编号 & 表面风化 & 决策树预测_玻璃类型 & XGboost预测_玻璃类型 \\\\\n\\midrule\n0 & A1 & 无风化 & 1 & 1 \\\\\n1 & A2 & 风化 & 0 & 0 \\\\\n2 & A3 & 无风化 & 0 & 0 \\\\\n3 & A4 & 无风化 & 0 & 0 \\\\\n4 & A5 & 风化 & 0 & 0 \\\\\n5 & A6 & 风化 & 1 & 1 \\\\\n6 & A7 & 风化 & 1 & 1 \\\\\n7 & A8 & 无风化 & 0 & 0 \\\\\n\\bottomrule\n\\end{tabular}\n'

# 预测亚类

## 读取训练数据集

In [219]:
bariumLead_non = pd.read_excel('铅钡无风化.xlsx')
bariumLead = pd.read_excel('铅钡有风化.xlsx')
highK_non = pd.read_excel('高钾无风化.xlsx')
highK = pd.read_excel('高钾有风化.xlsx')

In [220]:
def get_feauture_label(data, feature,label):
    X_train = data[feature]
    y_train = data[label]
    return X_train,y_train

def get_feauture(data,feature):
    X_pred = data[feature]
    return X_pred

def XGboost_model(X_train,y_train,X_pred,result):
    # 未风化XGBoost预测
    # 初始化模型
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')   
    # 训练模型  
    model.fit(X_train, y_train)  
    # 进行预测  
    y_pred = model_nw.predict(X_pred)
    # 还原数据
    result.loc[:,'XGboost模型_预测玻璃亚类类型'] = list(y_pred)

## 提取待预测数据

In [221]:
# [bariumLead_non, bariumLead, highK_non, highK]
# 铅钡无风化
pre_bariumLead_non_data = merged_PreKind[(merged_PreKind['XGboost预测_玻璃类型'] == 0) & (merged_PreKind['表面风化'] == '无风化')]
# 铅钡有风化
pre_bariumLead_data = merged_PreKind[(merged_PreKind['XGboost预测_玻璃类型'] == 0) & (merged_PreKind['表面风化'] == '风化')]
# 高钾无风化
pre_highK_non_data = merged_PreKind[(merged_PreKind['XGboost预测_玻璃类型'] == 1) & (merged_PreKind['表面风化'] == '无风化')]
# 高钾有风化
pre_highK_data = merged_PreKind[(merged_PreKind['XGboost预测_玻璃类型'] == 1) & (merged_PreKind['表面风化'] == '风化')]

## XGboost亚类预测

In [222]:
train_data_lst = [bariumLead_non, bariumLead, highK_non, highK]
label_lst = ['铅钡无风化亚类分类结果', '铅钡有风化亚类分类结果', '高钾无风化亚类分类结果', '高钾有风化亚类分类结果']
pre_data_lst = [pre_bariumLead_non_data, pre_bariumLead_data, pre_highK_non_data, pre_highK_data]

for index in range(len(train_data_lst)):
    X_train,y_train = get_feauture_label(train_data_lst[index], feature_lst, label_lst[index])
    X_pred = get_feauture(pre_data_lst[index], feature_lst)
    XGboost_model(X_train,y_train,X_pred, pre_data_lst[index])


Parameters: { "use_label_encoder" } are not used.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result.loc[:,'XGboost模型_预测玻璃亚类类型'] = list(y_pred)
Parameters: { "use_label_encoder" } are not used.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result.loc[:,'XGboost模型_预测玻璃亚类类型'] = list(y_pred)
Parameters: { "use_label_encoder" } are not used.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result.loc[:,'XGboost模型_预测玻璃亚类类型'] = list(y_pred)


## 决策树亚类预测

In [223]:
train_data_lst = [bariumLead_non, bariumLead, highK_non, highK]
label_lst = ['铅钡无风化亚类分类结果', '铅钡有风化亚类分类结果', '高钾无风化亚类分类结果', '高钾有风化亚类分类结果']
pre_data_lst = [pre_bariumLead_non_data, pre_bariumLead_data, pre_highK_non_data, pre_highK_data]

for index in range(len(train_data_lst)):
    X_train,y_train = get_feauture_label(train_data_lst[index], feature_lst, label_lst[index])
    X_pred = get_feauture(pre_data_lst[index], feature_lst)
    clf, y_pred, rules = DecisionTreeClassify(X_train, y_train, X_pred, feature_lst)
    pre_data_lst[index].loc[:,'决策树预测_玻璃亚类类型'] = list(y_pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_data_lst[index].loc[:,'决策树预测_玻璃亚类类型'] = list(y_pred)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_data_lst[index].loc[:,'决策树预测_玻璃亚类类型'] = list(y_pred)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_data_lst[index].loc[:,'决策树预测_玻璃亚类类型'] = list(y_pred)
A value is trying to be set on a c

In [224]:
merged_PreSubKind = pd.concat(pre_data_lst)
merged_PreSubKind = merged_PreSubKind.sort_index()

In [225]:
merged_PreSubKind.columns

Index(['文物编号', '表面风化', '二氧化硅(SiO2)', '氧化钠(Na2O)', '氧化钾(K2O)', '氧化钙(CaO)',
       '氧化镁(MgO)', '氧化铝(Al2O3)', '氧化铁(Fe2O3)', '氧化铜(CuO)', '氧化铅(PbO)',
       '氧化钡(BaO)', '五氧化二磷(P2O5)', '氧化锶(SrO)', '氧化锡(SnO2)', '二氧化硫(SO2)',
       'XGboost预测_玻璃类型', '决策树预测_玻璃类型', 'XGboost模型_预测玻璃亚类类型', '决策树预测_玻璃亚类类型'],
      dtype='object')

In [227]:
final_result = merged_PreSubKind[['文物编号','表面风化','XGboost预测_玻璃类型','决策树预测_玻璃类型','XGboost模型_预测玻璃亚类类型','决策树预测_玻璃亚类类型']]

In [228]:
final_result

Unnamed: 0,文物编号,表面风化,XGboost预测_玻璃类型,决策树预测_玻璃类型,XGboost模型_预测玻璃亚类类型,决策树预测_玻璃亚类类型
0,A1,无风化,1,1,1,1
1,A2,风化,0,0,0,1
2,A3,无风化,0,0,0,1
3,A4,无风化,0,0,0,1
4,A5,风化,0,0,0,4
5,A6,风化,1,1,1,0
6,A7,风化,1,1,1,1
7,A8,无风化,0,0,0,0


In [None]:
final_result.to_latex()

'\\begin{tabular}{lllrrr}\n\\toprule\n & 文物编号 & 表面风化 & XGboost预测_玻璃类型 & 决策树预测_玻璃类型 & 预测玻璃亚类类型 \\\\\n\\midrule\n0 & A1 & 无风化 & 1 & 1 & 1 \\\\\n1 & A2 & 风化 & 0 & 0 & 0 \\\\\n2 & A3 & 无风化 & 0 & 0 & 0 \\\\\n3 & A4 & 无风化 & 0 & 0 & 0 \\\\\n4 & A5 & 风化 & 0 & 0 & 0 \\\\\n5 & A6 & 风化 & 1 & 1 & 1 \\\\\n6 & A7 & 风化 & 1 & 1 & 1 \\\\\n7 & A8 & 无风化 & 0 & 0 & 0 \\\\\n\\bottomrule\n\\end{tabular}\n'