In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import torch

In [2]:
# 载入数据
data = pd.read_csv("classification_with_GIS.csv")  # 替换为您的数据集文件路径
data = data[['details','classification', 'pixel_value', 'suitability']]

In [3]:
# 删除包含缺失值的行
data = data.dropna()
# 加载BERT模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
details = data['details']
labels = data['classification']

In [5]:
# 将文本数据转换成词向量
vectors = []
for detail in details:
    encoded = tokenizer.encode_plus(
        detail,
        add_special_tokens=True,
        max_length=128,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    with torch.no_grad():
        output = model(**encoded)[0][:, 0, :].squeeze().numpy()
        vectors.append(output)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
vectors = pd.DataFrame(vectors)

In [7]:
vectors['pixel_value'] = data['pixel_value'].values.tolist()

In [8]:
# 训练随机森林模型
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(vectors.values, labels)
rf_preds = rf_model.predict(vectors.values)
print('Random Forest Classification Report:')
print(classification_report(labels, rf_preds))

Random Forest Classification Report:
              precision    recall  f1-score   support

     Class 1       0.73      0.61      0.66       406
     Class 2       0.81      0.75      0.78      1308
     Class 3       0.70      0.53      0.61       580
     Class 4       0.78      0.89      0.83      2115

    accuracy                           0.78      4409
   macro avg       0.76      0.70      0.72      4409
weighted avg       0.77      0.78      0.77      4409



In [9]:
vectors['suitability'] = data['suitability'].values.tolist()
X = vectors.drop('pixel_value',axis=1)
X.replace({"--":"0"},inplace=True)
# 训练随机森林模型
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X.values, labels)
rf_preds = rf_model.predict(X.values)
print('Random Forest Classification Report:')
print(classification_report(labels, rf_preds))

Random Forest Classification Report:
              precision    recall  f1-score   support

     Class 1       0.75      0.60      0.66       406
     Class 2       0.82      0.75      0.78      1308
     Class 3       0.69      0.54      0.60       580
     Class 4       0.78      0.90      0.83      2115

    accuracy                           0.78      4409
   macro avg       0.76      0.69      0.72      4409
weighted avg       0.78      0.78      0.77      4409



In [10]:
vectors.replace({"--":"0"},inplace=True)
# 训练随机森林模型
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(vectors.values, labels)
rf_preds = rf_model.predict(vectors.values)
print('Random Forest Classification Report:')
print(classification_report(labels, rf_preds))

Random Forest Classification Report:
              precision    recall  f1-score   support

     Class 1       0.78      0.57      0.66       406
     Class 2       0.81      0.75      0.78      1308
     Class 3       0.71      0.52      0.60       580
     Class 4       0.77      0.90      0.83      2115

    accuracy                           0.78      4409
   macro avg       0.77      0.69      0.72      4409
weighted avg       0.78      0.78      0.77      4409

