# Import

In [31]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import classification_report, confusion_matrix
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

# Load Data

In [23]:
# Read the training table and the four prediction tables.
df_train = pd.read_csv('train.csv')
df_1, df_2, df_3, df_4 = (
    pd.read_csv(csv_name)
    for csv_name in ('b4ns.csv', 'b5ns.csv', 'res50.csv', 'res50d.csv')
)

# The first column of the first prediction table defines the row order.
# NOTE(review): presumably this column holds image ids -- confirm.
order = df_1.iloc[:, 0].tolist()

# Key the training table by image id, then line its rows up with that order.
# Ids in `order` that are absent from train.csv come back as all-NaN rows.
df_train = df_train.set_index('image_id')
df_train_sorted = df_train.reindex(order).reset_index()
df_train_sorted.to_csv('train_sorted.csv')

In [24]:
# Drop the leading column of every prediction table, keeping the rest.
df_1_, df_2_, df_3_, df_4_ = (
    table.iloc[:, 1:] for table in (df_1, df_2, df_3, df_4)
)
df_train_sorted_ = df_train_sorted['label']

# Place the four tables side by side and append the label as the last column.
# concat(axis=1) aligns rows on the index, so every table is assumed to be in
# the same row order as df_1 -- TODO confirm.
df_1_ = pd.concat(
    [df_1_, df_2_, df_3_, df_4_, df_train_sorted_],
    axis=1,
)
df_1_.to_csv('all_pred.csv', index=False)

In [25]:
all_df = pd.read_csv('all_pred.csv')

# Every column but the last is a feature; the last column is the target.
x, y = all_df.iloc[:, :-1], all_df.iloc[:, -1]

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Define the k-fold validation

In [26]:
def fusion_estimators(clf):
    """
    Cross-validate a fusion model and evaluate its performance.

    Runs 5-fold shuffled cross-validation (accuracy) on the notebook-level
    ``x_train`` / ``y_train``, then fits ``clf`` on the full training split
    and scores it on ``x_test`` / ``y_test``. The three scores are printed
    and also returned so that several models can be compared in code.

    Parameters
    ----------
    clf : estimator
        Object passed to ``cross_validate`` and then fitted via
        ``clf.fit(x_train, y_train)``; it is left fitted on return.

    Returns
    -------
    dict
        ``{"train_score": ..., "cv_mean": ..., "test_score": ...}`` where the
        first two are means over the folds and the last is the test-split score.
    """
    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    results = cross_validate(
        clf,
        x_train,
        y_train,
        cv=cv,
        scoring="accuracy",
        n_jobs=-1,
        return_train_score=True,
        verbose=0,
    )
    train_score = results["train_score"].mean()
    cv_mean = results["test_score"].mean()
    # Refit on the whole training split before scoring the held-out test split.
    test_score = clf.fit(x_train, y_train).score(x_test, y_test)
    print("train_score:{}".format(train_score)
          ,"\n cv_mean:{}".format(cv_mean)
          ,"\n test_score:{}".format(test_score)
         )
    return {
        "train_score": train_score,
        "cv_mean": cv_mean,
        "test_score": test_score,
    }

# Cross-Validation

In [27]:
# clf1 = LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=200)
# clf2 = RandomForestRegressor(n_estimators=100, max_features="sqrt", max_samples=0.9, random_state=0, n_jobs=8)
# clf3 = LinearRegression()

In [28]:
# `multi_class` is left at its default: 'auto' was already the default value,
# and passing it explicitly triggers a deprecation warning on recent
# scikit-learn releases.
lr = LogisticRegression(solver='lbfgs', max_iter=200)

# Train the model
lr.fit(x_train, y_train)

# Predict the test set
y_pred = lr.predict(x_test)
print(pd.DataFrame(y_pred, columns=['label']))
# Print the classification report and the confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Print the model accuracy (reuses y_pred instead of predicting a second time)
print("Accuracy of the model: ", accuracy_score(y_test, y_pred))

      label
0         3
1         3
2         2
3         1
4         1
...     ...
4275      3
4276      2
4277      1
4278      3
4279      4

[4280 rows x 1 columns]
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       220
           1       0.89      0.84      0.87       470
           2       0.89      0.86      0.88       504
           3       0.97      0.98      0.98      2579
           4       0.81      0.82      0.81       507

    accuracy                           0.92      4280
   macro avg       0.87      0.86      0.86      4280
weighted avg       0.92      0.92      0.92      4280

[[ 174    8    4    3   31]
 [  23  397   11   10   29]
 [   4    7  433   33   27]
 [   2   10   21 2532   14]
 [  22   22   15   30  418]]
Accuracy of the model:  0.9238317757009346


In [29]:
# Build a 100-tree random forest with a fixed seed and train it in one step
# (`fit` returns the estimator itself).
rfc = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(x_train, y_train)

# Predict labels for the test split
y_pred = rfc.predict(x_test)

# Print the classification report, then the confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Print the accuracy on the test split
print("Accuracy of the model: ", rfc.score(x_test, y_test))

              precision    recall  f1-score   support

           0       0.79      0.80      0.79       220
           1       0.89      0.84      0.87       470
           2       0.90      0.85      0.87       504
           3       0.97      0.98      0.98      2579
           4       0.80      0.83      0.82       507

    accuracy                           0.92      4280
   macro avg       0.87      0.86      0.87      4280
weighted avg       0.92      0.92      0.92      4280

[[ 176    7    3    4   30]
 [  22  397   10    8   33]
 [   5    9  429   32   29]
 [   2   11   22 2531   13]
 [  19   22   14   29  423]]
Accuracy of the model:  0.9242990654205607


# LightGBM

In [34]:
# LightGBM classifier with library-default hyperparameters.
clf = LGBMClassifier()
clf.fit(x_train, y_train)

# Accuracy on the training split: printed first so it can be compared with
# the test accuracy below; a large gap between the two indicates overfitting.
train_acc = accuracy_score(y_train, clf.predict(x_train))
print(train_acc)

# Accuracy on the held-out test split.
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print(acc)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5100
[LightGBM] [Info] Number of data points in the train set: 17117, number of used features: 20
[LightGBM] [Info] Start training from score -2.982788
[LightGBM] [Info] Start training from score -2.298329
[LightGBM] [Info] Start training from score -2.207737
[LightGBM] [Info] Start training from score -0.481201
[LightGBM] [Info] Start training from score -2.112524
0.9999415785476427
0.9210280373831776
