In [None]:
# Mount Google Drive into the Colab filesystem so files stored there are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Switch the working directory to the project folder on the mounted Drive so
# that the relative file names used in the rest of the notebook resolve there.
%cd "/content/gdrive/MyDrive/Text Classification/"

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import jieba
import re

In [None]:
# Load the raw news dataset; the file is parsed as CSV even though its name
# carries no extension.
DATASET_PATH = '20200913-Top10'
dataset = pd.read_csv(DATASET_PATH)

In [None]:
def clean_data(sentence):
    """Reduce a news title to its non-ASCII word characters (e.g. Chinese text).

    Every character that is punctuation or another non-word symbol, any kind
    of whitespace (spaces, tabs, newlines, full-width spaces - wherever they
    occur in the text), an ASCII digit (0-9) or an ASCII letter (a-z, A-Z) is
    deleted. Underscores and non-ASCII letters/digits are kept.

    Args:
        sentence: The raw title. A non-string value (for example the float
            NaN pandas uses for a missing cell) is treated as an empty title.

    Returns:
        The cleaned string; empty when nothing survives the filtering.
    """
    if not isinstance(sentence, str):
        # re.sub raises TypeError on non-strings; an empty result lets such
        # rows be filtered out together with other empty titles.
        return ""
    # One pass over the text: \W covers punctuation/symbols and every
    # whitespace character, so no tab or newline can be left in the middle of
    # a title (which would otherwise leak into space- or tab-separated output).
    return re.sub(r'[\W0-9a-zA-Z]+', '', sentence)

# Clean every title, store the result back in the same column, and preview it.
cleaned_titles = dataset["新闻标题"].apply(clean_data)
dataset["新闻标题"] = cleaned_titles
dataset.head()

Unnamed: 0,新闻标题,一级类目
0,南宁市交通运输局到广州市开展枢纽经济发展调研,时政
1,加州宣布独立纽约州宣布取消总统初选现在的美国到底有多乱,时政
2,曾是凯迪拉克的专属标签远光狗的克星为什么却火不起来呢,汽车
3,赵丽颖冯绍峰终于秀恩爱穿镂空白裤秀小蛮腰冯绍峰清爽帅气,娱乐
4,贵州一男子厕所杀鱼不料鱼钻进便池孔他伸手去捞被咬住了,社会


In [None]:
def jieba_tokenize(sentence):
    """Segment ``sentence`` with jieba and return its tokens joined by single spaces."""
    tokens = jieba.cut(sentence)
    return " ".join(tokens)

# Replace each cleaned title with its space-separated jieba tokens and preview.
tokenized_titles = dataset["新闻标题"].apply(jieba_tokenize)
dataset["新闻标题"] = tokenized_titles
dataset.head()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.785 seconds.
Prefix dict has been built successfully.


Unnamed: 0,新闻标题,一级类目
0,南宁市 交通运输 局到 广州市 开展 枢纽 经济 发展 调研,时政
1,加州 宣布独立 纽约州 宣布 取消 总统 初选 现在 的 美国 到底 有多乱,时政
2,曾 是 凯迪拉克 的 专属 标签 远光 狗 的 克星 为什么 却 火 不 起来 呢,汽车
3,赵丽颖 冯绍峰 终于 秀 恩爱 穿 镂空 白裤 秀小蛮 腰 冯绍峰 清爽 帅气,娱乐
4,贵州 一 男子 厕所 杀鱼 不料 鱼 钻进 便池 孔 他 伸手 去 捞 被 咬住 了,社会


In [None]:
# Titles that are empty strings at this point carry no tokens: mark them as
# missing and drop those rows from the dataset.
title_column = '新闻标题'
dataset[title_column] = dataset[title_column].replace('', np.nan)
dataset.dropna(subset=[title_column], inplace=True)

In [None]:
# Collapse each top-level category name to a single-character label.
mapping = {"社会":"社","娱乐":"娱","时政":"政","汽车":"车","健康":"健","教育":"教","财经":"财","科技":"科","体育":"体","文化":"文"}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# the selected column is a chained in-place update: it emits a warning in
# recent pandas and leaves `dataset` unchanged once Copy-on-Write is active.
dataset["一级类目"] = dataset["一级类目"].replace(mapping)
# Show the class distribution after relabelling.
dataset["一级类目"].value_counts()

社    128675
娱     98214
政     95547
车     52556
健     50047
教     43090
财     42450
科     38205
体     33684
文     28922
Name: 一级类目, dtype: int64

In [None]:
# Pull the label column and the tokenized titles out of the frame as plain
# Python lists; every title is split back into its list of tokens.
labels = dataset["一级类目"].tolist()
sequences = dataset["新闻标题"].tolist()
word_sequences = [title.split(" ") for title in sequences]

In [None]:
from sklearn.model_selection import train_test_split

# Stratified split with a fixed seed: 80% for training, 20% held out.
train_X, valid_test_X, train_y, valid_test_y = train_test_split(
    word_sequences,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
# Report the number of samples in each part.
for caption, split in (
    ('shape of train_X:', train_X),
    ('shape of valid_test_X:', valid_test_X),
    ('shape of train_y:', train_y),
    ('shape of valid_test_y:', valid_test_y),
):
    print(caption, len(split))

shape of train_X: 489112
shape of valid_test_X: 122278
shape of train_y: 489112
shape of valid_test_y: 122278


In [None]:
# Halve the held-out part into validation and test sets, again stratified by
# label and with a fixed seed.
valid_X, test_X, valid_y, test_y = train_test_split(
    valid_test_X,
    valid_test_y,
    test_size=0.5,
    stratify=valid_test_y,
    random_state=42,
)
# Report the number of samples in each part.
for caption, split in (
    ('shape of valid_X:', valid_X),
    ('shape of test_X:', test_X),
    ('shape of valid_y:', valid_y),
    ('shape of test_y:', test_y),
):
    print(caption, len(split))

shape of valid_X: 61139
shape of test_X: 61139
shape of valid_y: 61139
shape of test_y: 61139


### 生成 NeuralClassifier 输入的训练集、验证集和测试集

In [None]:
# One record per training sample: the label, the token list, and empty
# keyword/topic lists.
train_json_dictionary = [
    {"doc_label": sample_label, "doc_token": sample_tokens, "doc_keyword": [], "doc_topic": []}
    for sample_tokens, sample_label in zip(train_X, train_y)
]

In [None]:
# One record per validation sample: the label, the token list, and empty
# keyword/topic lists.
valid_json_dictionary = [
    {"doc_label": sample_label, "doc_token": sample_tokens, "doc_keyword": [], "doc_topic": []}
    for sample_tokens, sample_label in zip(valid_X, valid_y)
]

In [None]:
# One record per test sample: the label, the token list, and empty
# keyword/topic lists.
test_json_dictionary = [
    {"doc_label": sample_label, "doc_token": sample_tokens, "doc_keyword": [], "doc_topic": []}
    for sample_tokens, sample_label in zip(test_X, test_y)
]

In [None]:
# Write the training records as JSON Lines: one JSON object per line.
# The encoding is explicit because ensure_ascii=False emits the Chinese text
# as-is, and open()'s default encoding depends on the platform locale.
with open("news_train.json", "w", encoding="utf-8") as outfile:
    for unit_data in train_json_dictionary:
        json.dump(unit_data, outfile, ensure_ascii=False)
        outfile.write('\n')

In [None]:
# Write the validation records as JSON Lines: one JSON object per line.
# The encoding is explicit because ensure_ascii=False emits the Chinese text
# as-is, and open()'s default encoding depends on the platform locale.
with open("news_dev.json", "w", encoding="utf-8") as outfile:
    for unit_data in valid_json_dictionary:
        json.dump(unit_data, outfile, ensure_ascii=False)
        outfile.write('\n')

In [None]:
# Write the test records as JSON Lines: one JSON object per line.
# The encoding is explicit because ensure_ascii=False emits the Chinese text
# as-is, and open()'s default encoding depends on the platform locale.
with open("news_test.json", "w", encoding="utf-8") as outfile:
    for unit_data in test_json_dictionary:
        json.dump(unit_data, outfile, ensure_ascii=False)
        outfile.write('\n')

### 生成 BERT 输入的训练集和测试集

In [None]:
# Build the text lines for the BERT files: "<label>\t<tokens separated by spaces>".
bert_train_labels = train_y
bert_test_labels = test_y
# Tokens are joined with single spaces; a comma inside a token also becomes a space.
bert_train_datas = [' '.join(tokens).replace(',', ' ') for tokens in train_X]
bert_test_datas = [' '.join(tokens).replace(',', ' ') for tokens in test_X]
# Pair every label with its text, separated by a tab.
bert_train = ["\t".join(pair) for pair in zip(bert_train_labels, bert_train_datas)]
bert_test = ["\t".join(pair) for pair in zip(bert_test_labels, bert_test_datas)]

In [None]:
# Write one "<label>\t<text>" line per training sample to news.label.
# The encoding is explicit because the lines contain Chinese text and open()'s
# default encoding depends on the platform locale.
# NOTE(review): this writes exactly the same lines as news.train; if the
# consumer expects only the label names in this file, the data written here
# needs to change - confirm the expected format.
with open("news.label", "w", encoding="utf-8") as outfile:
    for unit_data in bert_train:
        outfile.write(unit_data)
        outfile.write('\n')

In [None]:
# Write one "<label>\t<text>" line per training sample to news.train.
# The encoding is explicit because the lines contain Chinese text and open()'s
# default encoding depends on the platform locale.
with open("news.train", "w", encoding="utf-8") as outfile:
    for unit_data in bert_train:
        outfile.write(unit_data)
        outfile.write('\n')

In [None]:
# Write one "<label>\t<text>" line per test sample to news.test.
# The encoding is explicit because the lines contain Chinese text and open()'s
# default encoding depends on the platform locale.
with open("news.test", "w", encoding="utf-8") as outfile:
    for unit_data in bert_test:
        outfile.write(unit_data)
        outfile.write('\n')