<a href="https://colab.research.google.com/github/450586509/Knowledge/blob/master/event_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as tfs
import warnings

warnings.filterwarnings('ignore')



In [4]:
dev_url = "http://129.204.205.246/event_entity_dev_data.csv"
train_url = "http://129.204.205.246/event_entity_train_data_label.csv"
train_df = pd.read_csv(train_url, delimiter='\t', header=None)
train_df.columns = ["id","text","event_name", "company_name"]
train_df = train_df.head(1000)

### 过滤长度大于500的行。

In [5]:
print(train_df.shape)
train_df["len"] = train_df.text.apply((lambda x: len(x)))
print(train_df.shape)
print(train_df.head)
for index, row in train_df.iterrows():
  print(f"""index={index} text={row.get("text")} len={len(row.get("text"))}""")
  if index > 5:
    break

(1000, 4)
(1000, 5)
<bound method NDFrame.head of           id  ...  len
0    2444634  ...   48
1    2836026  ...   60
2    2809128  ...   86
3    2221860  ...   62
4    2091205  ...   57
..       ...  ...  ...
995  2864864  ...   60
996  2581395  ...   33
997  2727028  ...   17
998  2636106  ...   88
999  2622581  ...  183

[1000 rows x 5 columns]>
index=0 text=世联君汇预计2017年净利下滑近8成至853万元中超电缆(002471)再遭中超集团减持5%股份 len=48
index=1 text=LG空调亏损严重或效仿新科 两大缺陷遭退市尴尬华兰生物(002007)三季度净利下降45% 汇添富或为“失血门”跑路主力 len=60
index=2 text=四方达(300179)股东减持60万股 套现414.6万元收到欧盟打款后希腊总理宣布辞职再选 欧盟紧急声明要求希腊恪守承诺巨力索具(002342)下调预测 因产品毛利率下降 len=86
index=3 text=单元式空调抽查不合格名单春兰、日立产品入列北大荒(600598)计提坏账净利骤降83.5%好利来(002729)股东高位减持 len=62
index=4 text=深航副总裁闪电辞职 “股权之争”仍迷离(IPO路演)龙大肉食(002726)一季度净利“反常”下滑 路演现场遭质疑 len=57
index=5 text=ST皇台(000995)总经理辞职销售公司总经理接任电连技术的“一年之痒”:高管辞职业绩变脸股价破发逾三成 len=53
index=6 text=国润新材因2017年报扔未披露被提示摘牌风险安德利头顶电商压力业绩下坡 自营模式存隐患中小板三公司发布高管辞职公告 len=57


In [6]:
train_df = train_df[train_df.len < 500]
print(f"""max_len={train_df["len"].max()}""")
print(f"""mean_len={train_df["len"].mean()}""")
print(f"shape={train_df.shape}")
train_df.head()

max_len=183
mean_len=62.41
shape=(1000, 5)


Unnamed: 0,id,text,event_name,company_name,len
0,2444634,世联君汇预计2017年净利下滑近8成至853万元中超电缆(002471)再遭中超集团减持5%股份,业绩下滑,世联君汇,48
1,2836026,LG空调亏损严重或效仿新科 两大缺陷遭退市尴尬华兰生物(002007)三季度净利下降45% ...,业绩下滑,华兰生物,60
2,2809128,四方达(300179)股东减持60万股 套现414.6万元收到欧盟打款后希腊总理宣布辞职再选...,业绩下滑,巨力索具,86
3,2221860,单元式空调抽查不合格名单春兰、日立产品入列北大荒(600598)计提坏账净利骤降83.5%好...,业绩下滑,北大荒,62
4,2091205,深航副总裁闪电辞职 “股权之争”仍迷离(IPO路演)龙大肉食(002726)一季度净利“反常...,业绩下滑,龙大肉食,57


In [7]:
event_counts = train_df.event_name.value_counts()
print(f"type of event_counts = {type(event_counts)}")
event_counts

type of event_counts = <class 'pandas.core.series.Series'>


业绩下滑    834
提现困难    166
Name: event_name, dtype: int64

In [8]:
print(f"样本最少为{event_counts.min()}")
print(f"样本最多为{event_counts.max()}")
print(f"样本平均为{round(event_counts.mean(),4)}")

样本最少为166
样本最多为834
样本平均为500.0


In [9]:
#model_class, tokenizer_class, pretrained_weights = (tfs.BertModel, tfs.BertTokenizer, 'bert-base-chinese')
model_class, tokenizer_class, pretrained_weights = (tfs.BertModel, tfs.BertTokenizer, 'bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [10]:
print(tokenizer.encode("hello world 谢 稳 文", add_special_tokens=True))
print(tokenizer.encode("hello world 谢 稳 文", add_special_tokens=False))
#tokenizer.encode("hello world你好啊", add_special_tokens=True)

[101, 7592, 2088, 100, 100, 1861, 102]
[7592, 2088, 100, 100, 1861]


In [11]:
train_tokenized = train_df.text.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
print(f"train_tokenized type = {type(train_tokenized)}")
train_max_len = 0
for i in train_tokenized.values:
    if len(i) > train_max_len:
        train_max_len = len(i)

train_padded = np.array([i + [0] * (train_max_len-len(i)) for i in train_tokenized.values])
print("train set shape:",train_padded.shape)


train_tokenized type = <class 'pandas.core.series.Series'>
train set shape: (1000, 182)


In [12]:
train_attention_mask = np.where(train_padded != 0, 1, 0)


In [13]:
train_input_ids = torch.tensor(train_padded).long()
train_attention_mask = torch.tensor(train_attention_mask).long()
with torch.no_grad():
    train_last_hidden_states = model(train_input_ids, attention_mask=train_attention_mask)

In [16]:
train_features = train_last_hidden_states[0][:,0,:].numpy()
train_labels = train_df.event_name

### 直接利用LogisticRegression模型训练。

In [19]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)
lr_clf.score(train_features, train_labels)

0.998