<a href="https://colab.research.google.com/github/Anoif01/Sentiment-Analysis-FrozenLayer/blob/test-colab/FrozenLayer_Movie_Review_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR NOTEBOOK.

# 创建/kaggle/input和/kaggle/working目录，然后在根目录下创建指向这些目录的符号链接（../input和../working），
# 实质上是在Colab中重现了Kaggle的工作环境。无论Notebook中的代码是使用Kaggle的原始路径（/kaggle/input等）还是相对路径（../input等），都可以正确地找到数据和文件，从而使得从Kaggle到Colab的迁移更加顺畅。

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'sentiment-analysis-on-movie-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F10025%2F32092%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240209%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240209T171129Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da28f75b012890b46305cf2b4ade452d0ac93bf7fe73640ec4b5dabe418ec843532bb870dcbc4b1518a268541966e9f93a3602b1e8d58192ca05f0e3c337e8d13cb69f5a4126c4ea93237b5430e16ebba0270bd4c238fe436b130f24aacb60d137c405fbddc7c5a4e24d6ca18e311bac86d89ce95c0140872693d2936eece93b6eb7b1abb291fefc9e89c16b7ec0723b5e0ea70d91de9d8e15ca62f4f9b6a08d72de28f287762871e0e470c5fe677e2e5d36333ecc501766ae625a714a49dd5c3e83daf43a8c7f189366f2472ca9c65ddc1de3b5cc9dfd6ce1b2631ca5d368fa78254fb0bb4c65d5fe62d1a49d1c8d250f009741becc22f3351d878126c9d8e3d'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading sentiment-analysis-on-movie-reviews, 1991138 bytes compressed
Downloaded and uncompressed: sentiment-analysis-on-movie-reviews
Data source import complete.


## 这是一篇学习向的Notebook，课题是NLP领域的情感分析任务。
### 主要使用的工具是Pytorch和tensorflow。我希望在这里学习到：
    1. 如何调用Bert模型：使用什么库，什么函数？
    2. 如何使用pytorch搭建数据集，导入模型和微调模型？
    3. 如何定义模型的参数，损失函数和metric？
    4. 如何创建submission的csv？
    5. 有什么未来能做的优化？如何提分？能不能举一反三？

In [2]:
import os
import random
import gc

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split

from transformers import AutoModel, AutoTokenizer, AutoConfig, AdamW

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

gc.collect()

/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv
/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip
/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip


60

### 0. 设定seed值，用于复现实验。

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)

### 1. Prepare data 准备数据集

The sentiment labels are:

0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive

In [6]:
!apt-get install unzip
!unzip ../input/sentiment-analysis-on-movie-reviews/test.tsv.zip test.tsv
!unzip ../input/sentiment-analysis-on-movie-reviews/train.tsv.zip train.tsv

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unzip is already the newest version (6.0-26ubuntu3.1).
0 upgraded, 0 newly installed, 0 to remove and 32 not upgraded.
Archive:  ../input/sentiment-analysis-on-movie-reviews/test.tsv.zip
  inflating: test.tsv                
Archive:  ../input/sentiment-analysis-on-movie-reviews/train.tsv.zip
  inflating: train.tsv               


In [7]:
# check if contain nan
def check_nan(df):
    nan_columns = df.columns[df.isna().any()]
    nan_rows = df[df.isna().any(axis=1)]
    print(f'Contain Nan in columns: {nan_columns}. \n Rows:\n{nan_rows}\n')

In [8]:
train_df = pd.read_csv('train.tsv', sep='\t')
check_nan(train_df)

print(train_df.shape)
print(train_df.info())

train_df.head()

Contain Nan in columns: Index([], dtype='object'). 
 Rows:
Empty DataFrame
Columns: [PhraseId, SentenceId, Phrase, Sentiment]
Index: []

(156060, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB
None


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [9]:
# check if contain nan
test_df = pd.read_csv('test.tsv', sep='\t')
check_nan(test_df)

Contain Nan in columns: Index([], dtype='object'). 
 Rows:
Empty DataFrame
Columns: [PhraseId, SentenceId, Phrase]
Index: []



In [10]:
# replace nan with string
test_df = test_df.fillna(' ')
check_nan(test_df)

print(test_df.shape)
print(test_df.info())
test_df.head()

Contain Nan in columns: Index([], dtype='object'). 
 Rows:
Empty DataFrame
Columns: [PhraseId, SentenceId, Phrase]
Index: []

(66292, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66292 entries, 0 to 66291
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PhraseId    66292 non-null  int64 
 1   SentenceId  66292 non-null  int64 
 2   Phrase      66292 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.5+ MB
None


Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [11]:
# sample_submission = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv')
# print(sample_submission.shape)
# print(sample_submission.info())
# sample_submission.head()

### 2. Text Processing 文本处理

In [12]:
# 通过 transformer 导入 bert-base基础模型 的 分词器tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', lower=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
# 创建数据集对象
class MovieReviewDataset(Dataset):
    def __init__(self, df, max_len, if_train=True):
        self.max_len = max_len
        self.if_train = if_train
        self.text = df['Phrase'].tolist()
        if self.if_train:
            self.sentiments = df['Sentiment'].values
        self.encode = tokenizer.batch_encode_plus(self.text,
                                                    padding='max_length',
                                                    max_length=self.max_len,
                                                    truncation=True, #如果某个序列超过max_length，则该参数指定是否截断它。
                                                    return_attention_mask=True) #是否返回注意力掩码，以指示哪些位置是填充的，哪些是实际数据。
    def __getitem__(self, i):
        input_ids = torch.tensor(self.encode['input_ids'][i])  #编码后的输入序列的ID列表。这些ID对应于模型词汇表中的单词或标记。
        attention_mask = torch.tensor(self.encode['attention_mask'][i])
        if not self.if_train:
            return (input_ids, attention_mask)
        else:
            sentiment = self.sentiments[i]
            return (input_ids, attention_mask, sentiment)

    def __len__(self):
        return len(self.text)

In [14]:
# 创建pytorch可读的数据集和dataloader
max_len = 64
train_dataset = MovieReviewDataset(train_df, max_len)
test_dataset = MovieReviewDataset(test_df, max_len, if_train=False)

# 创建validation数据集
lengths = [int(len(train_dataset) * 0.8), int(len(train_dataset) * 0.2)]
# 将数据集随机分割成给定长度的非重叠新数据集。这个函数特别适合于将数据集划分为训练集、验证集和测试集。
train_dataset, valid_dataset = random_split(train_dataset, lengths=lengths, generator=torch.Generator().manual_seed(42))

# 数据集按batch批次分割，以便模型读取。
bs = 128
train_dataloader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
val_dataloader = DataLoader(valid_dataset, batch_size=bs)
test_dataloader = DataLoader(test_dataset, batch_size=bs)

In [15]:
# show Bert Layer name
bert_base = AutoModel.from_pretrained('bert-base-uncased')
for name, param in bert_base.named_parameters():
    print(f'Name: {name}')
#     print(f'Name: {name}, Param: {param}')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Name: embeddings.word_embeddings.weight
Name: embeddings.position_embeddings.weight
Name: embeddings.token_type_embeddings.weight
Name: embeddings.LayerNorm.weight
Name: embeddings.LayerNorm.bias
Name: encoder.layer.0.attention.self.query.weight
Name: encoder.layer.0.attention.self.query.bias
Name: encoder.layer.0.attention.self.key.weight
Name: encoder.layer.0.attention.self.key.bias
Name: encoder.layer.0.attention.self.value.weight
Name: encoder.layer.0.attention.self.value.bias
Name: encoder.layer.0.attention.output.dense.weight
Name: encoder.layer.0.attention.output.dense.bias
Name: encoder.layer.0.attention.output.LayerNorm.weight
Name: encoder.layer.0.attention.output.LayerNorm.bias
Name: encoder.layer.0.intermediate.dense.weight
Name: encoder.layer.0.intermediate.dense.bias
Name: encoder.layer.0.output.dense.weight
Name: encoder.layer.0.output.dense.bias
Name: encoder.layer.0.output.LayerNorm.weight
Name: encoder.layer.0.output.LayerNorm.bias
Name: encoder.layer.1.attention.self

### 3. Modeling 创建模型

In [16]:
class Model(nn.Module):
    def __init__(self, N=5, forze_embed=True):
        super(Model, self).__init__()

        self.N = N
        self.forze_embed = forze_embed

        # 加载BERT模型的 配置。这个配置包含了模型的各种参数设置，比如隐藏层大小、层数、词汇大小等。
        bert_base_config = AutoConfig.from_pretrained('bert-base-uncased')

        # 加载了预训练的BERT模型 实例，加载正确的模型类和预训练权重
        self.bert_base = AutoModel.from_pretrained('bert-base-uncased')

        for name, param in self.bert_base.named_parameters():
            # 冻结前N个Transformer层
            if name.startswith('encoder.layer') and int(name.split('.')[2]) < self.N:
                param.requires_grad = False

            # 冻结embedding层
            if self.forze_embed and name.startswith('embeddings'):
                param.requires_grad = False

        # 定义了线性分类器，其输入维度为即BERT模型隐藏层的大小，输出维度为5。输出维度通常对应于分类任务的类别数。
        self.classifier = nn.Linear(bert_base_config.hidden_size, 5)

    def forward(self, input_ids, attention_mask):
        # 通过BERT模型对输入进行编码，获取模型的输出。BERT模型的输出是一个包含多个元素的元组，包括最后一层的隐藏状态和pooler输出等。
        bert_base_output = self.bert_base(input_ids=input_ids, attention_mask=attention_mask)
        # get last hidden state
        # bert_base_last_hidden_state = bert_base_output[0]

        # pooler_output – 最后一层的隐藏状态的第一个token（通常是[CLS]标记）经过一个线性层和Tanh激活函数处理后的输出。这个输出适用于分类任务。
        # (classification token) further processed by a Linear layer and a Tanh activation function
        pooler_output = bert_base_output[1] # [batch_size,hidden]
        out = self.classifier(pooler_output)
        return out


In [17]:
# 模型的初始化、优化器的设置以及损失函数的定义。
model = Model(N=8, forze_embed=True)
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
gc.collect()



36

In [18]:
# 初始化损失和准确率列表
total_loss = []
total_val_acc = []

# 训练循环
for epoch in range(1):
    model.train()
    epoch_loss = []
    for input_ids, attention_mask, target in tqdm(train_dataloader):
        # 数据迁移到设备, 加速运行
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        target = target.to(device)

        # 优化器梯度清零,在计算新的梯度之前清零已累积的梯度，以避免在多次反向传播中梯度累积
        optimizer.zero_grad()

        # 前向传播和损失计算
        y_pred = model(input_ids, attention_mask)
        loss = criterion(y_pred, target)

        # 反向传播和优化器步骤
        loss.backward()
        optimizer.step()

        epoch_loss.append(loss.item())
    # 计算训练的平均loss，并保存
    mean_epoch_loss = np.mean(epoch_loss)
    total_loss.append(mean_epoch_loss)

    # 释放GPU内存
    input_ids = input_ids.to(torch.device('cpu'))
    attention_mask = attention_mask.to(torch.device('cpu'))
    target = target.to(torch.device('cpu'))
    gc.collect()

    # 计算validation集的正确率
    val_accs= []
    model.eval()
    for input_ids, attention_mask, target in tqdm(val_dataloader):
        # 数据迁移到设备, 加速运行
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        target = target.to(device)

        y_pred = model(input_ids, attention_mask)
        _, y_pred = torch.max(y_pred, -1)

        acc = torch.mean((torch.tensor(y_pred.cpu() == target.cpu(), dtype=torch.float)))
        val_accs.append(acc.cpu())
    mean_epoch_acc = np.array(val_accs).mean()
    total_val_acc.append(mean_epoch_acc)

    print("Epoch:", epoch+1, "-- loss:", mean_epoch_loss, "-- acc:", mean_epoch_acc)
    gc.collect()


100%|██████████| 976/976 [11:25<00:00,  1.42it/s]
  acc = torch.mean((torch.tensor(y_pred.cpu() == target.cpu(), dtype=torch.float)))
100%|██████████| 244/244 [01:50<00:00,  2.20it/s]


Epoch: 1 -- loss: 0.9058672724688639 -- acc: 0.6629609


In [19]:
model.eval()
predictions = []
for text, attention_mask in tqdm(test_dataloader):
    text = text.to(device)
    attention_mask = attention_mask.to(device)
    preds = model(text, attention_mask)
    _, preds = torch.max(preds, -1)
    for pred in preds: predictions.append(pred.item())
print(len(predictions))

100%|██████████| 518/518 [03:55<00:00,  2.20it/s]

66292





In [20]:
submission = pd.DataFrame()
submission['PhraseId'] = test_df['PhraseId']
submission['Sentiment'] = predictions
submission.to_csv("submission.csv", index=False)
print("Sumbssion is ready!")

Sumbssion is ready!


In [10]:
from getpass import getpass
import os

# os.environ['USER_EMAIL'] = getpass('Enter your user email: ')
# os.environ['USER_NAME'] = getpass('Enter your user name: ')
os.environ['GIT_REPO'] = getpass('Enter your git repo: ')

Enter your git repo: ··········


In [11]:
!git init
!git remote add origin os.environ['GIT_REPO']
!git config --global user.email os.environ['USER_EMAIL']
!git config --global user.name os.environ['USER_NAME']

!git branch
# create a new branch
# !git checkout -b ver0
# switch to an existing branch
!git checkout ver0

!git add .
!git commit -m "Version 0"
!git push -u origin ver0

Reinitialized existing Git repository in /content/.git/
error: remote origin already exists.
* [32mver0[m
Already on 'ver0'
On branch ver0
nothing to commit, working tree clean
Host key verification failed.
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.
