In [1]:
import pandas as pd
import os
from sklearn.model_selection import (train_test_split)

# 定义一个函数来读取txt文件并转换为DataFrame
def read_text_file(file_path, sep='\t'):
    data = pd.read_csv(file_path, sep=sep, header=None, names=['column1', 'column2', '...']) # 根据你的数据文件结构调整列名
    return data

# 读取原始txt文件
# 假设txt文件中的数据是以制表符分隔的，且没有表头
data = read_text_file('./cnews.train.txt')

# 随机分割数据集为训练集、验证集和测试集（按比例 8:1:1）
train, temp = train_test_split(data, test_size=0.2, random_state=42)
valid, test = train_test_split(temp, test_size=0.5, random_state=42)

# 将DataFrame保存为txt文件的函数
def save_dataframe_to_txt(dataframe, file_path, sep='\t'):
    dataframe.to_csv(file_path, sep=sep, index=False, header=False)

# 将分割后的数据保存为新的txt文件
save_dataframe_to_txt(train, 'train.txt')
save_dataframe_to_txt(valid, 'valid.txt')
save_dataframe_to_txt(test, 'test.txt')

In [14]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from pprint import pprint
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report



%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [15]:
# 设置数据读取、模型、结果保存路径
base_dir = './'
train_dir = os.path.join(base_dir, 'train.txt')
test_dir = os.path.join(base_dir, 'test.txt')
val_dir = os.path.join(base_dir, 'val.txt')
#vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')
save_dir = 'checkpoints/textcnn'
save_path = os.path.join(save_dir, 'best_validation')

In [18]:
def read_file(filename):
    """读取文件数据"""
    contents, labels = [], []
    with open(filename,encoding='utf-8') as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append((content))
                    labels.append(label)
            except:
                pass
    return contents, labels


In [19]:
train_contents, train_labels = read_file(train_dir)
test_contents, test_labels = read_file(test_dir)
val_counts = Counter(train_labels)
val_counts

Counter({'科技': 4045,
         '游戏': 4023,
         '家居': 4015,
         '教育': 4014,
         '财经': 4005,
         '房产': 3992,
         '体育': 3991,
         '时政': 3979,
         '时尚': 3970,
         '娱乐': 3966})

In [22]:
import re
#去除文本中的表情字符（只保留中英文和数字）
def clear_character(sentence):
    pattern1= '\[.*?\]'     
    pattern2 = re.compile('[^\u4e00-\u9fa5^a-z^A-Z^0-9]')   
    line1=re.sub(pattern1,'',sentence)
    line2=re.sub(pattern2,'',line1)   
    new_sentence=''.join(line2.split()) #去除空白
    return new_sentence

In [23]:
train_text=list(map(lambda s: clear_character(s), train_contents))
test_text=list(map(lambda s: clear_character(s), test_contents))

In [24]:
import jieba
train_seg_text=list(map(lambda s: jieba.lcut(s), train_text))
test_seg_text=list(map(lambda s: jieba.lcut(s), test_text))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.470 seconds.
Prefix dict has been built successfully.


In [28]:

stop_words_path = "./stop.txt"
def get_stop_words():
    file = open(stop_words_path, 'rb').read().decode('gbk').split('\r\n')
    return set(file)
stopwords = get_stop_words()


In [29]:
# 去掉文本中的停用词
def drop_stopwords(line, stopwords):
    line_clean = []
    for word in line:
        if word in stopwords:
            continue
        line_clean.append(word)
    return line_clean

In [30]:
train_st_text=list(map(lambda s: drop_stopwords(s,stopwords), train_seg_text))
test_st_text=list(map(lambda s: drop_stopwords(s,stopwords), test_seg_text))

In [31]:
le = LabelEncoder()
le.fit(train_labels)
LabelEncoder()

In [32]:
label_train_id=le.transform(train_labels)
label_test_id=le.transform(test_labels)

In [33]:
train_c_text=list(map(lambda s: ' '.join(s), train_st_text))
test_c_text=list(map(lambda s: ' '.join(s), test_st_text))

In [34]:
tfidf_model = TfidfVectorizer(binary=False,token_pattern=r"(?u)\b\w+\b")
train_Data = tfidf_model.fit_transform(train_c_text)
test_Data = tfidf_model.transform(test_c_text)

In [36]:
from sklearn.linear_model import LogisticRegression
'''LR模型分类训练'''
classifier=LogisticRegression()
classifier.fit(train_Data, label_train_id)
pred = classifier.predict(test_Data)
from sklearn.metrics import classification_report
print(classification_report(label_test_id, pred,digits=4))

              precision    recall  f1-score   support

           0     0.9960    0.9960    0.9960       496
           1     0.9824    0.9767    0.9796       515
           2     0.9554    0.9724    0.9638       507
           3     0.9617    0.9638    0.9627       469
           4     0.9587    0.9347    0.9466       521
           5     0.9722    0.9795    0.9758       536
           6     0.9475    0.9611    0.9542       488
           7     0.9938    0.9777    0.9857       493
           8     0.9418    0.9577    0.9497       473
           9     0.9577    0.9482    0.9530       502

    accuracy                         0.9668      5000
   macro avg     0.9667    0.9668    0.9667      5000
weighted avg     0.9669    0.9668    0.9668      5000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
