# 使用ALBERT进行tweets的情感分类


# 一、数据获取

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
path="/content/drive/My Drive/Colab Notebooks/NLP/05_ALBERT/"
data=pd.read_csv(path+"tweets.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,target,id,date,time,username,tweet
0,0,4,1238978214090792960,2020-03-14,23:59:59,ok32650586,I hope everything turns ok! Sending love and p...
1,48,4,1237965669649412097,2020-03-12,04:56:30,ok32650586,Wow !! They are BIG dogs like you sweetie pie!...
2,65,4,1237625848347234304,2020-03-11,06:26:10,ok32650586,That’s so wonderful!! They are angels from hea...
3,66,4,1237614943794413569,2020-03-11,05:42:51,ok32650586,Awwww these pictures are sooo adorable!! Kisse...
4,68,4,1237612806888800256,2020-03-11,05:34:21,ok32650586,She looks loved and happy to me! Kisses and hu...


## 1.1 处理数据格式

In [3]:
data.columns

Index(['Unnamed: 0', 'target', 'id', 'date', 'time', 'username', 'tweet'], dtype='object')

In [4]:
data.drop(columns=data.columns[0],inplace=True)
data.head()

Unnamed: 0,target,id,date,time,username,tweet
0,4,1238978214090792960,2020-03-14,23:59:59,ok32650586,I hope everything turns ok! Sending love and p...
1,4,1237965669649412097,2020-03-12,04:56:30,ok32650586,Wow !! They are BIG dogs like you sweetie pie!...
2,4,1237625848347234304,2020-03-11,06:26:10,ok32650586,That’s so wonderful!! They are angels from hea...
3,4,1237614943794413569,2020-03-11,05:42:51,ok32650586,Awwww these pictures are sooo adorable!! Kisse...
4,4,1237612806888800256,2020-03-11,05:34:21,ok32650586,She looks loved and happy to me! Kisses and hu...


In [5]:
data.target.value_counts()

4    24968
0    13480
Name: target, dtype: int64

In [6]:
data.loc[data['target'] ==4, 'target'] = 1
data.target.value_counts()

1    24968
0    13480
Name: target, dtype: int64

In [7]:
data.drop(columns=data.columns[1:-1],inplace=True)
print(data['tweet'][0])
data.head()

I hope everything turns ok! Sending love and prayers for a good outcome. 🙏❤️🐶❤️🌹


Unnamed: 0,target,tweet
0,1,I hope everything turns ok! Sending love and p...
1,1,Wow !! They are BIG dogs like you sweetie pie!...
2,1,That’s so wonderful!! They are angels from hea...
3,1,Awwww these pictures are sooo adorable!! Kisse...
4,1,She looks loved and happy to me! Kisses and hu...


In [8]:
data=data.sample(frac=1)
data.head()

Unnamed: 0,target,tweet
27660,1,"Yes, will be delightful to shower while cookin..."
29909,1,"No idea who this young man playing violin is, ..."
31999,0,"No, the Washington Post got it 100% correct. ..."
29026,0,she was My co editor of victims of gay bullyin...
37187,1,How about free med school for people who are w...


## 1.2 划分训练集和测试集

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( data["tweet"], data["target"], test_size=0.2, random_state=42)

In [27]:
len(X_test)

7690

# 二、ALBERT 模型

In [10]:
!pip install --upgrade tensorflow
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import numpy as np

Collecting tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/85/d4/c0cd1057b331bc38b65478302114194bd8e1b9c2bbc06e300935c0e93d90/tensorflow-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl (421.8MB)
[K     |████████████████████████████████| 421.8MB 41kB/s 
Collecting tensorflow-estimator<2.2.0,>=2.1.0rc0
[?25l  Downloading https://files.pythonhosted.org/packages/18/90/b77c328a1304437ab1310b463e533fa7689f4bfc41549593056d812fab8e/tensorflow_estimator-2.1.0-py2.py3-none-any.whl (448kB)
[K     |████████████████████████████████| 450kB 66.3MB/s 
Collecting tensorboard<2.2.0,>=2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/d9/41/bbf49b61370e4f4d245d4c6051dfb6db80cec672605c91b1652ac8cc3d38/tensorboard-2.1.1-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.9MB 61.9MB/s 
[31mERROR: tensorflow-federated 0.12.0 has requirement tensorflow-addons~=0.7.0, but you'll have tensorflow-addons 0.8.3 which is incompatible.[0m
Installing collected pa

## 2.1 编码和搭建模型的函数

In [0]:
def albert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    print(texts)
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [0]:
def build_model(albert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = albert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(2, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=2e-6), loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
    
    return model

## 2.2 加载预训练模型

In [0]:
module_url="https://tfhub.dev/tensorflow/albert_en_xlarge/1"
albert_layer = hub.KerasLayer(module_url, trainable=True)

In [14]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
!pip install sentencepiece
import tokenization

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |▎                               | 10kB 22.7MB/s eta 0:00:01[K     |▋                               | 20kB 30.6MB/s eta 0:00:01[K     |█                               | 30kB 34.8MB/s eta 0:00:01[K     |█▎                              | 40kB 38.2MB/s eta 0:00:01[K     |█▋                              | 51kB 36.7MB/s eta 0:00:01[K     |██                              | 61kB 38.7MB/s eta 0:00:01[K     |██▏                             | 71kB 31.0MB/s eta 0:00:01[K     |██▌                             | 81kB 32.2MB/s eta 0:00:01[K     |██▉                             | 92kB 33.9MB/s eta 0:00:01[K     |███▏                            | 102kB 32.1MB/s eta 0:00:01[K     |███▌                            | 112kB 32.1MB/s eta 0:00:01[K     |███▉           

In [0]:
#en albert of tf-hub
sp_model_file = albert_layer.resolved_object.sp_model_file.asset_path.numpy()
tokenizer = tokenization.FullSentencePieceTokenizer(sp_model_file)

In [16]:
tokenizer.tokenize("I hope everything turns ok! Sending love and prayers for a good outcome. 🙏❤️🐶❤️🌹")

['▁',
 'I',
 '▁hope',
 '▁everything',
 '▁turns',
 '▁ok',
 '!',
 '▁',
 'S',
 'ending',
 '▁love',
 '▁and',
 '▁prayers',
 '▁for',
 '▁a',
 '▁good',
 '▁outcome',
 '.',
 '▁',
 '🙏❤️🐶❤️🌹']

## 2.3 训练模型


对文本进行编码

In [21]:
train_input = albert_encode(X_train.astype(str), tokenizer, max_len=64)
test_input = albert_encode(X_test.astype(str), tokenizer, max_len=64)

8784     Impressive 2yo filly debut from Love Beach at ...
11320    Vote #moscowmitch OUT!!!! Kentucky stand up an...
35319    Don’t disrespect us by saying we’re selling ou...
22582    Janelle, Evil Dick or Keesha. They wouldn’t to...
10986    Maybe we should all make each other laugh. We'...
                               ...                        
15288    Tessa is Queen of the Knockouts? I like the so...
15426    Wishing you all the very best in everything yo...
18820    I can understand the shoot of anger he felt in...
28801    I am sick and tired of all this horoscope bull...
24822    Awww, you’re literally such a great moot! I lo...
Name: tweet, Length: 30758, dtype: object
11587    I try not to stir up the em fans too hard. Nee...
32588    I disagree.  If there is just cause to remove ...
10088    Dennis Rodman is NOT the greatest rebounder ev...
23903    “The Legal Aid Society slammed the measure, vo...
35395    Anyone else watch JJ/Zhang and be so hyped up ...
              

In [22]:
model = build_model(albert_layer, max_len=64)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 64)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 64)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 2048), (None 58724864    input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [23]:
history = model.fit(
    train_input, y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=16
)

Train on 27682 samples, validate on 3076 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [24]:
test_pred = model.predict(test_input)
predictions = np.argmax(test_pred, axis=-1)
print(predictions)

[1 0 1 ... 1 0 0]


In [28]:
from sklearn.metrics import f1_score,accuracy_score
#print("accuracy_score:",accuracy_score(y_test, predictions))
print("f1_score:",f1_score(y_test, predictions))

f1_score: 0.9446327683615819
