# 真假新闻特征识别和分类预测（代码整理）

## （一）数据清洗

In [None]:
import pandas as pd
import numpy as np

# Load the raw dataset; the first CSV column is used as the index.
df = pd.read_csv('Twitter_Analysis.csv', encoding="utf-8", index_col=0)

# Basic inspection. A bare expression in the middle of a cell/script is
# evaluated and then discarded, so the results are printed explicitly.
print(df.head())
print(df.shape)
df.info()

# Data-quality diagnostics: missing values per column and duplicated rows.
print(df.isnull().sum())
print(df.duplicated().sum())

# Columns that are not needed for the analysis.
columns_to_drop = [
    'ORG percent', 'NORP percent', 'GPE percent', 'PERSON percent',
    'MONEY percent', 'DATA percent', 'CARDINAL percent', 'PERCENT percent',
    'ORDINAL percent', 'LAW percent', 'PRODUCT percent', 'EVENT percent',
    'TIME percent', 'LOC percent', 'WORK OF ART percent', 'QUANTITY percent', 'LANGUAGE percent'
]

# Only existing columns are dropped so that a missing name does not raise;
# report the skipped names so a misspelt entry does not go unnoticed.
missing_columns = [name for name in columns_to_drop if name not in df.columns]
if missing_columns:
    print(f"Columns not found (skipped): {missing_columns}")

df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Shape after cleaning.
print(df.shape)

## （二）标题维度特征分析

In [None]:
# 标题关键词词云分析

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Initialise the SparkSession
spark = SparkSession.builder.appName("TitleDataCleaning").getOrCreate()

# Read the data with Spark; the CSV reader keyword is `header`, not `head`.
df = spark.read.csv("data/Twitter_Analysis.csv", header=True, inferSchema=True)

# Stop-word set, broadcast so that the UDF can read it through
# `stop_words.value`. The set is built first and then broadcast; wrapping the
# Broadcast object itself in set() would not yield an object with `.value`.
stop_words = spark.sparkContext.broadcast(set(ENGLISH_STOP_WORDS))

# 定义UDF去除停用词
def remove_stopwords(statement):
    """Return *statement* with every stop word removed.

    The text is split on whitespace and a word is dropped when its lowercase
    form is contained in ``stop_words.value``; the remaining words are joined
    with single spaces.

    Args:
        statement: The text to filter, or ``None`` for a null cell.

    Returns:
        str: The filtered text; an empty string for ``None`` or empty input.
    """
    # Null cells arrive as None when this runs as a Spark UDF; calling
    # .split() on them would raise and fail the whole job.
    if not statement:
        return ''
    # Resolve the broadcast value once instead of once per word.
    excluded = stop_words.value
    return ' '.join(word for word in statement.split() if word.lower() not in excluded)

remove_stopwords_udf = udf(remove_stopwords, StringType())

# Add a column holding the statement with the stop words removed
df = df.withColumn('statement_without_stopwords', remove_stopwords_udf(df['statement']))

# (label value, plot title), in plotting order: fake news first, then true news
cloud_specs = [
    (0, 'Word Cloud for Label 0(FALSE)'),
    (1, 'Word Cloud for Label 1(TRUE)'),
]

# Bring the cleaned statements of each label to the driver
statements_by_label = {
    label_value: [
        row[0]
        for row in df.filter(df['BinaryNumTarget'] == label_value)
                     .select('statement_without_stopwords')
                     .collect()
    ]
    for label_value, _ in cloud_specs
}

# Build and display one keyword cloud per label
for label_value, plot_title in cloud_specs:
    cloud = WordCloud(
        background_color="black",
        max_words=100,
        max_font_size=256,
        random_state=42,
        width=2000,
        height=2000,
    )
    cloud.generate(' '.join(statements_by_label[label_value]))

    plt.imshow(cloud, interpolation="bilinear")
    plt.title(plot_title)
    plt.axis('off')
    plt.show()

# Stop the SparkSession
spark.stop()

## （三）内容维度特征分析

In [None]:
# 文章长短比较

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count
import matplotlib.pyplot as plt

# Initialise the SparkSession
spark = SparkSession.builder.appName("HistogramExample").getOrCreate()

# Load the data inside this session: a DataFrame created by a session that
# has since been stopped can no longer be evaluated.
df = spark.read.csv("data/Twitter_Analysis.csv", header=True, inferSchema=True)

# Number of rows for every ("Word count", "majority_target") combination
grouped_df = df.groupBy("Word count", "majority_target").count()

# Bring the aggregated rows to the driver; rows without a word count cannot
# be placed on the x axis and are skipped.
data = [row for row in grouped_df.collect() if row['Word count'] is not None]

# Create the figure and axes
fig, ax = plt.subplots()

# One histogram per majority_target value. The labels are taken from the
# collected rows (no extra Spark job) and sorted for a stable legend order.
targets = sorted({row['majority_target'] for row in data}, key=str)
for target in targets:
    target_data = [row for row in data if row['majority_target'] == target]
    ax.hist(
        [row['Word count'] for row in target_data],
        bins=100,
        # Each collected row stands for `count` articles; without the weights
        # every distinct length would be counted exactly once.
        weights=[row['count'] for row in target_data],
        label=str(target),
        alpha=0.5,
    )

# Title and axis labels
ax.set_title('Distribution of Article Length')
ax.set_xlabel('Word count')
ax.set_ylabel('Count')

# Legend
ax.legend()

# Show the chart
plt.show()

# Stop the SparkSession
spark.stop()

In [None]:
# 词性、标点分布比较

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum
import matplotlib.pyplot as plt
import pandas as pd

# Initialise the SparkSession
spark = SparkSession.builder.appName("BarChartExample").getOrCreate()

# Load the data inside this session: a DataFrame created by a session that
# has since been stopped can no longer be evaluated.
df = spark.read.csv("data/Twitter_Analysis.csv", header=True, inferSchema=True)

# Part-of-speech columns
pos_columns = ['present_verbs', 'past_verbs', 'adjectives', 'adverbs', 'adpositions', 'pronouns', 'TOs', 'determiners', 'conjunctions']

# Punctuation columns
punct_columns = ['dots', 'exclamation', 'questions', 'ampersand']

# Rows without a label cannot be assigned to either bar group.
labelled_df = df.filter(col('BinaryNumTarget').isNotNull())


def _totals_by_target(columns):
    """Sum *columns* per BinaryNumTarget; return features as rows, labels as columns."""
    totals = labelled_df.groupBy('BinaryNumTarget').agg(
        *[_sum(col(name)).alias(name) for name in columns]
    )
    # The aggregate has one row per label, so it is safe to move to pandas.
    # Sorting the index fixes the column order (0 then 1) and therefore the
    # colour assignment used below.
    return totals.toPandas().set_index('BinaryNumTarget').sort_index().T.astype(float)


# The data has one column per feature, so the totals are computed per column
# and transposed; there are no 'Category' / 'Count' columns to pivot on.
pos_data_pd = _totals_by_target(pos_columns)
punct_data_pd = _totals_by_target(punct_columns)

# Part-of-speech bar chart
fig_pos, ax_pos = plt.subplots()
pos_data_pd.plot(kind='bar', ax=ax_pos, color=['red', 'blue'])
ax_pos.set_title('真假新闻的词性分布')
ax_pos.set_xlabel('词性')
ax_pos.set_ylabel('次数')

# Punctuation bar chart
fig_punct, ax_punct = plt.subplots()
punct_data_pd.plot(kind='bar', ax=ax_punct, color=['red', 'blue'])
ax_punct.set_title('真假新闻的标点分布')
ax_punct.set_xlabel('标点')
ax_punct.set_ylabel('次数')

# Show the charts
plt.show()

# Stop the SparkSession
spark.stop()

## （四）社会影响力维度特征分析

In [None]:
# 社会影响力比较

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean
import numpy as np
import matplotlib.pyplot as plt
from math import pi

# Initialise the SparkSession
spark = SparkSession.builder.appName("RadarChart").getOrCreate()

# Load the data inside this session: a DataFrame created by a session that
# has since been stopped can no longer be evaluated.
df = spark.read.csv("data/Twitter_Analysis.csv", header=True, inferSchema=True)

# Feature labels, defined once and reused for the aggregation and the axes
labels = ['followers_count', 'friends_count', 'favourites_count', 'normalize_influence',
          'mentions', 'quotes', 'replies', 'retweets', 'favourites']

# Split into true and fake news
true_news = df.filter(col('BinaryNumTarget') == 1)
false_news = df.filter(col('BinaryNumTarget') == 0)

# Mean of every feature for each group
mean_exprs = [mean(col(c)).alias(c) for c in labels]
true_news_row = true_news.agg(*mean_exprs).collect()[0]
false_news_row = false_news.agg(*mean_exprs).collect()[0]

# Convert to NumPy arrays. A Row is indexed by field name (it has no
# .keys()), and dtype=float turns a missing mean (None) into NaN.
true_news_avg = np.array([true_news_row[c] for c in labels], dtype=float)
false_news_avg = np.array([false_news_row[c] for c in labels], dtype=float)

# Scale each feature by the larger of the two means
max_values = np.max(np.vstack([true_news_avg, false_news_avg]), axis=0)
# A feature whose maximum is 0 would otherwise cause a division by zero.
max_values[max_values == 0] = 1.0
true_news_avg = true_news_avg / max_values
false_news_avg = false_news_avg / max_values

# One angle per feature
angles = np.linspace(0, 2 * pi, len(labels), endpoint=False).tolist()

# Close the polygons by repeating the first point
true_news_avg = np.concatenate((true_news_avg, [true_news_avg[0]]))
false_news_avg = np.concatenate((false_news_avg, [false_news_avg[0]]))
angles += angles[:1]

# Create the polar axes
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

# Draw both groups
ax.fill(angles, true_news_avg, color='blue', alpha=0.25)
ax.fill(angles, false_news_avg, color='red', alpha=0.25)
ax.plot(angles, true_news_avg, color='blue', linewidth=2, label='真新闻')
ax.plot(angles, false_news_avg, color='red', linewidth=2, label='假新闻')

# Feature names around the circle; the radial tick labels are hidden
ax.set_yticklabels([])
ax.set_xticks(angles[:-1])
ax.set_xticklabels(labels)

# Legend
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

# Show the radar chart
plt.show()

# Stop the SparkSession
spark.stop()

## （五）情感维度特征分析

In [None]:
# 情感极性特征分布比较

import mysql.connector
import pandas as pd
import plotly.figure_factory as ff
import plotly.express as p
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# 连接到 MySQL 数据库
def get_texts_from_db():
    """Fetch every ``(tweet, majority_target)`` pair from the ``tweets_data`` table.

    Returns:
        list[tuple]: One ``(tweet, majority_target)`` tuple per table row.
    """
    # NOTE(review): the credentials are hard-coded; consider loading them
    # from the environment or a configuration file.
    conn = mysql.connector.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='fakenews'
    )

    # try/finally guarantees that the cursor and the connection are released
    # even when the query raises.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("SELECT tweet, majority_target FROM tweets_data")
            data = cursor.fetchall()  # all texts and labels
        finally:
            cursor.close()
    finally:
        conn.close()

    return [(row[0], row[1]) for row in data]

# 进行情感分析并返回每个文本的情感得分
def analyze_sentiment(texts):
    """Score every text with VADER and return one result dict per input pair.

    Args:
        texts: Iterable of ``(text, label)`` pairs.

    Returns:
        list[dict]: Dicts with the keys ``'text'``, ``'label'`` and
        ``'polarity'``; the polarity is the ``'compound'`` entry of
        ``polarity_scores``.
    """
    scorer = SentimentIntensityAnalyzer()
    return [
        {
            'text': body,
            'label': tag,
            'polarity': scorer.polarity_scores(body)['compound'],
        }
        for body, tag in texts
    ]


# 生成情感分布图
def plot_sentiment_distribution(df):
    """Show a distribution plot of the polarity scores of both news classes.

    Args:
        df: pandas DataFrame with a ``'label'`` and a ``'polarity'`` column.
            Rows are selected where the label equals the string ``'TRUE'`` or
            ``'FALSE'``; the latter group is displayed as ``'FAKE'``.
    """
    # Polarity series in display order: true news first, then fake news
    polarity_groups = [
        df.loc[df['label'] == wanted]['polarity']
        for wanted in ('TRUE', 'FALSE')
    ]

    # Build the distribution plot with its group names and colours
    fig = ff.create_distplot(
        polarity_groups,
        ['TRUE', 'FAKE'],
        colors=['rgb(0, 0, 100)', 'rgb(0, 200, 200)'],
    )
    fig.update_layout(title_text='Polarity Distribution for TRUE vs FAKE', template="plotly_white")
    fig.show()

# 生成情感分布的小提琴图
def plot_sentiment_distribution2(df):
    """Show overlaid violin plots of the polarity score, coloured by label.

    Args:
        df: pandas DataFrame with a ``'polarity'`` and a ``'label'`` column.
    """
    violin_options = {
        'y': 'polarity',
        'color': "label",
        'violinmode': 'overlay',        # draw the violins on top of each other
        'template': 'plotly_white',     # white background
        'title': "Sentiment Distribution for TRUE vs FAKE",
        'labels': {"polarity": "Sentiment Score", "label": "News Type"},
    }
    fig = p.violin(df, **violin_options)

    # Draw the mean line inside every violin
    fig.update_traces(meanline_visible=True)
    fig.show()

# Fetch the (text, label) pairs from the database
texts_from_db = get_texts_from_db()

# Score every fetched text
analysis_results = analyze_sentiment(texts_from_db)

# Keep the scores in their own pandas frame. The global name `df` is used as
# a Spark DataFrame by other cells of this file, so it is not rebound here.
sentiment_df = pd.DataFrame(analysis_results)

# Draw both sentiment plots
plot_sentiment_distribution(sentiment_df)
plot_sentiment_distribution2(sentiment_df)

## （六）可信度维度特征分析

In [None]:
# 可信度评分分布比较

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import matplotlib.pyplot as plt
import numpy as np

# seaborn is needed for the density plots below
import seaborn as sns

# Initialise the SparkSession
spark = SparkSession.builder.appName("CredibilityScoreAnalysis").getOrCreate()

# Load the data inside this session so that `df` is a live Spark DataFrame
# regardless of what earlier cells bound to that name.
df = spark.read.csv("data/Twitter_Analysis.csv", header=True, inferSchema=True)

# Credibility scores of true and fake news; missing scores cannot be plotted
scored_df = df.filter(col("cred").isNotNull())
true_news = scored_df.filter(col("BinaryNumTarget") == 1).select("cred").rdd.flatMap(lambda x: x).collect()
fake_news = scored_df.filter(col("BinaryNumTarget") == 0).select("cred").rdd.flatMap(lambda x: x).collect()

# Figure size
plt.figure(figsize=(12, 6))

# Left panel: overlaid histograms of both groups
plt.subplot(1, 2, 1)
plt.hist(true_news, bins=30, density=True, alpha=0.5, color='green', label='True News')
plt.hist(fake_news, bins=30, density=True, alpha=0.5, color='red', label='Fake News')
plt.title('Distribution of Credibility Scores by News Type (Histogram)')
plt.xlabel('Credibility Score')
plt.ylabel('Density')
plt.legend()

# Right panel: kernel density estimates (`fill` replaces the deprecated `shade`)
plt.subplot(1, 2, 2)
sns.kdeplot(true_news, fill=True, color='green', label='True News')
sns.kdeplot(fake_news, fill=True, color='red', label='Fake News')
plt.title('Density Plot of Credibility Scores by News Type')
plt.xlabel('Credibility Score')
plt.ylabel('Density')
plt.legend()

# Adjust the spacing between the panels
plt.tight_layout()

# Show the figure
plt.show()

# Stop the SparkSession
spark.stop()

In [None]:
# 可信度评分与其他特征的关系比较

from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
import matplotlib.pyplot as plt
import numpy as np

# Additional names needed by this cell
import seaborn as sns
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

# Initialise the SparkSession
spark = SparkSession.builder.appName("FeatureCorrelationAnalysis").getOrCreate()

# Load the data inside this session so that `df` is a live Spark DataFrame
# regardless of what earlier cells bound to that name.
df = spark.read.csv("data/Twitter_Analysis.csv", header=True, inferSchema=True)

# Numeric feature columns used for the correlation analysis
numeric_features = [
    'cred', 'followers_count', 'friends_count', 'favourites_count',
    'statuses_count', 'listed_count', 'BotScore', 'normalize_influence',
    'quotes', 'replies', 'retweets', 'favourites', 'hashtags'
]

# Correlation.corr works on ONE vector column, so the features are cast to
# double and assembled into a vector first. handleInvalid="skip" drops rows
# that contain nulls, which would otherwise make the assembler fail.
numeric_df = df.select(*[col(name).cast("double").alias(name) for name in numeric_features])
assembler = VectorAssembler(inputCols=numeric_features, outputCol="features", handleInvalid="skip")
feature_vectors = assembler.transform(numeric_df).select("features")

# Pearson correlation matrix
r1 = Correlation.corr(feature_vectors, "features", "pearson").head()

# The first field of the result row is the matrix; convert it to a 2-D array
pearson_corr = r1[0].toArray()

# Heat map, labelled with the feature names
plt.figure(figsize=(20, 15))
sns.heatmap(pearson_corr, annot=True, cmap='coolwarm', linewidths=0.5,
            linecolor='white', square=True,
            xticklabels=numeric_features, yticklabels=numeric_features)
plt.title('Correlation Heatmap between Features and Credibility Score')
plt.show()

# Stop the SparkSession
spark.stop()

## （七）情感实时处理维度

In [None]:
# kafka生产者配置

import mysql.connector
import pandas as pd
import json
from kafka import KafkaProducer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from datetime import datetime

# MySQL connection settings
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASSWORD = '123456'
DB_NAME = 'fakenews'

# Kafka settings: broker address and the topic the messages are sent to
KAFKA_BROKER = 'localhost:9092'
KAFKA_TOPIC = 'news_topic'

# Sentiment analyzer instance shared by the functions of this cell
sid = SentimentIntensityAnalyzer()

# 情感分析函数
def analyze_sentiment(text):
    """Return the ``'compound'`` entry of the polarity scores of *text*.

    Uses the module-level analyzer ``sid``.
    """
    return sid.polarity_scores(text)['compound']

# 连接数据库：从 tweet_data 表读取数据
def fetch_data_from_db():
    """Read the ``tweet`` and ``timestamp`` columns of ``tweet_data`` from MySQL.

    Returns:
        pandas.DataFrame: The query result with the columns ``tweet`` and
        ``timestamp``.
    """
    conn = mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASSWORD,
        database=DB_NAME
    )
    # NOTE(review): another query in this file reads from `tweets_data`;
    # confirm which table name is the correct one.
    query = "SELECT tweet, timestamp FROM tweet_data"
    # try/finally releases the connection even when the query fails.
    try:
        return pd.read_sql(query, conn)
    finally:
        conn.close()

# Kafka 生产者：将数据发送到 Kafka
def send_to_kafka():
    """Publish every tweet from the database, with its sentiment score, to Kafka.

    Each message is a JSON object with the keys ``tweet``, ``timestamp``
    (the value's ``isoformat()`` string) and ``sentiment_score``.
    """
    producer = KafkaProducer(
        bootstrap_servers=[KAFKA_BROKER],
        value_serializer=lambda v: json.dumps(v).encode('utf-8')
    )
    try:
        df = fetch_data_from_db()
        for _, row in df.iterrows():
            sentiment_score = analyze_sentiment(row['tweet'])  # sentiment score of the tweet
            news_data = {
                'tweet': row['tweet'],
                'timestamp': row['timestamp'].isoformat(),
                'sentiment_score': sentiment_score  # include the score in the message
            }
            producer.send(KAFKA_TOPIC, value=news_data)
            print(f"Sent tweet to Kafka: {news_data['tweet']} | Sentiment: {sentiment_score}")
        # send() only queues the record; block until everything queued so far
        # has actually been delivered.
        producer.flush()
    finally:
        # Release the producer's network resources in every case.
        producer.close()

# Run the producer: read the rows from MySQL and publish them to Kafka
send_to_kafka()

In [None]:
# kafka消费者配置，进行实时情感计算和可视化

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json
import plotly.graph_objects as go
from datetime import datetime
import time

# Kafka settings
KAFKA_BROKER = 'localhost:9092'
KAFKA_TOPIC = 'news_topic'

# Spark Streaming setup
# NOTE(review): the `pyspark.streaming.kafka` module is only shipped with
# Spark releases before 3.0 — confirm the installed Spark version.
conf = SparkConf().setAppName("KafkaSparkSentimentAnalysis")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 5)  # 5-second batch interval

# Read the stream from Kafka. KAFKA_BROKER is a broker address, so the direct
# (broker-based) API is used; createStream would expect a ZooKeeper quorum as
# its second argument. The stream yields (key, value) pairs.
kafka_stream = KafkaUtils.createDirectStream(
    ssc,
    [KAFKA_TOPIC],
    {"metadata.broker.list": KAFKA_BROKER, "group.id": "consumer-group"},
)

# State shared with the batch handler
analyzer = SentimentIntensityAnalyzer()
timestamps = []
sentiment_scores = []
fig = go.Figure(
    data=[go.Scatter(x=timestamps, y=sentiment_scores, mode='lines+markers')],
    layout=go.Layout(
        title="实时情感分析",
        xaxis_title="时间",
        yaxis_title="情感极性"
    )
)

# 情感分析函数
def analyze_sentiment(rdd):
    """Score the messages of one micro-batch and redraw the live chart.

    Every element of *rdd* is either the JSON message string itself or a
    ``(key, value)`` pair whose value is that string. The text is read from
    the ``tweet`` key (falling back to ``text``) and the time from the
    ``timestamp`` key. Results are appended to the module-level lists
    ``timestamps`` and ``sentiment_scores``.

    Args:
        rdd: The RDD of one streaming batch.
    """
    # A single collect() both detects an empty batch and fetches the data.
    messages = rdd.collect()
    if not messages:
        return

    for message in messages:
        # Accept both element shapes; indexing [1] on a plain string would
        # pick a single character instead of the payload.
        payload = message[1] if isinstance(message, (tuple, list)) else message
        record = json.loads(payload)

        text = record.get('tweet', record.get('text'))
        if text is None:
            continue  # nothing to score in this message

        # fromisoformat accepts both 'YYYY-MM-DD HH:MM:SS' and the
        # 'YYYY-MM-DDTHH:MM:SS' form produced by datetime.isoformat().
        timestamps.append(datetime.fromisoformat(record['timestamp']))
        sentiment_scores.append(analyzer.polarity_scores(text)['compound'])

    # Redraw once per batch rather than once per record.
    fig.data[0].x = timestamps
    fig.data[0].y = sentiment_scores
    fig.update_layout(
        title="实时情感分析",
        xaxis_title="时间",
        yaxis_title="情感极性"
    )
    fig.show()

# Take the message value of every Kafka record and analyse each batch
kafka_stream.map(lambda x: x[1]).foreachRDD(analyze_sentiment)

# Start the streaming computation
ssc.start()

try:
    # Run for at most 300 seconds (5 minutes). Unlike a plain sleep, this
    # returns early and re-raises the error if the streaming job fails.
    ssc.awaitTerminationOrTimeout(300)
finally:
    # Always stop the streaming context, even on an error or an interrupt.
    ssc.stop(stopSparkContext=True)

## （八）机器学习模型构建

In [None]:
# 模型训练

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input, Concatenate

# Load the dataset and choose the feature / target columns
features = ['cred', 'BotScore', 'tweet']
target = 'BinaryNumTarget'
# NOTE(review): '.../database.csv' looks like a placeholder path — point it
# at the real dataset location before running.
data = pd.read_csv('.../database.csv')
X = data[features]
y = data[target]

# Split into training, validation and test sets (70:15:15), stratified by label
train_data, temp_data, train_labels, temp_labels = train_test_split(X, y, test_size=0.3, stratify=y, random_state=2018)
val_data, test_data, val_labels, test_labels = train_test_split(temp_data, temp_labels, test_size=0.5, stratify=temp_labels, random_state=2018)

# Tokenise the tweet text; the vocabulary is fitted on the training split only
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_data['tweet'])
train_sequences = tokenizer.texts_to_sequences(train_data['tweet'])
val_sequences = tokenizer.texts_to_sequences(val_data['tweet'])
test_sequences = tokenizer.texts_to_sequences(test_data['tweet'])

# Pad / truncate every sequence to the same length
max_sequence_length = 100
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length)
val_padded = pad_sequences(val_sequences, maxlen=max_sequence_length)
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Text features and numeric features are fed to the model separately
train_text_features = train_padded
val_text_features = val_padded
test_text_features = test_padded

train_numeric_features = train_data[['cred', 'BotScore']].values
val_numeric_features = val_data[['cred', 'BotScore']].values
test_numeric_features = test_data[['cred', 'BotScore']].values

# Input layers: token ids and the two numeric features
text_input = Input(shape=(max_sequence_length,), name='text_input')
numeric_input = Input(shape=(2,), name='numeric_input')

# Text branch: embedding followed by two stacked (unidirectional) LSTM layers.
# The sequence length is already fixed by `text_input`, so the deprecated
# `input_length` argument is not passed to Embedding.
embedding_layer = Embedding(input_dim=10000, output_dim=128)(text_input)
lstm_layer = LSTM(128, return_sequences=True)(embedding_layer)
lstm_layer = LSTM(64)(lstm_layer)
dropout_layer = Dropout(0.5)(lstm_layer)

# Merge the text representation with the numeric features
combined = Concatenate()([dropout_layer, numeric_input])

# Fully connected head with a sigmoid output for binary classification
dense_layer = Dense(64, activation='relu')(combined)
output_layer = Dense(1, activation='sigmoid')(dense_layer)

# Build the model
model = Model(inputs=[text_input, numeric_input], outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([train_text_features, train_numeric_features], train_labels, epochs=5, batch_size=32, validation_data=([val_text_features, val_numeric_features], val_labels))

# Persist the trained model; the evaluation cell loads it from this file.
model.save('twitter_analysis_model.h5')

In [None]:
# 模型测试和结果可视化

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input, Concatenate
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Feature and target columns
features = ['cred', 'BotScore', 'tweet']
target = 'BinaryNumTarget'

# Read the data
# NOTE(review): this must be the same file the model was trained on, because
# the split and the tokenizer are rebuilt from it; the absolute path is
# machine-specific.
data = pd.read_csv('C:/Users/LRQ/Desktop/database.csv')
X = data[features]
y = data[target]

# Recreate the split with the same seed and ratios (70:15:15, stratified)
train_data, temp_data, train_labels, temp_labels = train_test_split(X, y, test_size=0.3, stratify=y, random_state=2018)
val_data, test_data, val_labels, test_labels = train_test_split(temp_data, temp_labels, test_size=0.5, stratify=temp_labels, random_state=2018)

# Tokenise the tweet text; the vocabulary is fitted on the training split only
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_data['tweet'])
train_sequences = tokenizer.texts_to_sequences(train_data['tweet'])
val_sequences = tokenizer.texts_to_sequences(val_data['tweet'])
test_sequences = tokenizer.texts_to_sequences(test_data['tweet'])

# Pad / truncate every sequence to the same length
max_sequence_length = 100
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length)
val_padded = pad_sequences(val_sequences, maxlen=max_sequence_length)
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length)

# Text features and numeric features are fed to the model separately
train_text_features = train_padded
val_text_features = val_padded
test_text_features = test_padded

train_numeric_features = train_data[['cred', 'BotScore']].values
val_numeric_features = val_data[['cred', 'BotScore']].values
test_numeric_features = test_data[['cred', 'BotScore']].values

# Load the saved model
model = tf.keras.models.load_model('twitter_analysis_model.h5')

# Predict on the test split
predictions = model.predict([test_text_features, test_numeric_features])
# The model returns one probability per sample in an (n, 1) array; flatten it
# and threshold at 0.5 to obtain 1-D integer labels for the metrics.
predicted_labels = (np.asarray(predictions).ravel() > 0.5).astype(int)

# Performance metrics
accuracy = accuracy_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels)

# Print the metrics
print(f'准确率: {accuracy:.4f}')
print(f'F1 分数: {f1:.4f}')
print(f'召回率: {recall:.4f}')
print(f'精确度: {precision:.4f}')

# Confusion matrix, plus a row-normalised copy (share of each true class)
cm = confusion_matrix(test_labels, predicted_labels)
row_totals = cm.sum(axis=1, keepdims=True)
# A true class without samples yields a row of zeros instead of a division by zero.
cm_normalized = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)

# Cell annotations "count\n(percentage)"; dtype=object avoids any dependence
# on a fixed string width.
annot = np.empty(cm.shape, dtype=object)
nrows, ncols = cm.shape
for i in range(nrows):
    for j in range(ncols):
        annot[i, j] = f'{cm[i, j]}\n({cm_normalized[i, j]:.2%})'

# Draw the confusion matrix
plt.figure(figsize=(10, 7))
ax = sns.heatmap(cm_normalized, annot=annot, fmt='', cmap='Blues', cbar=False)
ax.set_xlabel('预测标签')  # x-axis label
ax.set_ylabel('真实标签')  # y-axis label
ax.set_title('混淆矩阵')  # title
class_names = ['类别 {}'.format(i) for i in range(cm.shape[0])]  # class names
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

plt.show()