In [4]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# 1. 数据预处理

In [5]:
# Load the raw YouTube comments export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The saved output shows a warning raised by this
# line, presumably the mixed-dtype DtypeWarning -- TODO confirm.
testdf = pd.read_csv("YT_Videos_Comments.csv", low_memory=False)
# Only the last expression of a cell is rendered, so a preview placed before
# this line would be computed and silently discarded. Show the column names,
# which the following steps select from.
testdf.columns

  testdf = pd.read_csv("YT_Videos_Comments.csv")


Index(['User', 'Video Title', 'Video Description', 'Video ID',
       'Comment (Displayed)', 'Comment (Actual)', 'Comment Author',
       'Comment Author Channel ID', 'Comment Time'],
      dtype='object')

## 1.1 选择Comment和Comment Author作为特征构建的主要内容
思路：对评论文本（`Comment (Actual)`）进行分词；同时统计每位评论作者（`Comment Author`）出现的次数，作为一个计数特征。

### 去缺失值

In [6]:
# Report how many values are missing in each column before filtering.
print(testdf.isnull().sum())

# A row is usable only if it has both the comment text and its author; after
# filtering, narrow the frame to the columns used for feature building.
required_columns = ['Comment (Actual)', 'Comment Author']
kept_columns = ['User', 'Video Title', 'Comment (Actual)', 'Comment Author']
testdf = testdf.dropna(subset=required_columns).loc[:, kept_columns]
testdf.head()

User                              0
Video Title                     149
Video Description            196767
Video ID                     429330
Comment (Displayed)          467375
Comment (Actual)             482881
Comment Author               482909
Comment Author Channel ID    482868
Comment Time                 482862
dtype: int64


Unnamed: 0,User,Video Title,Comment (Actual),Comment Author
0,Cleo Abram,"Robots made of spiders (yes, really)",zombie spider!! bomb the damn lab before it's ...,Bagus Hutomo
1,Cleo Abram,"Robots made of spiders (yes, really)","This is way less cool than it seems, spiders a...",CMZ neu
2,Cleo Abram,"Robots made of spiders (yes, really)",Spiders see this and this is why they made the...,Kiana Marrie
3,Cleo Abram,"Robots made of spiders (yes, really)",you looks pretty 😍,Noob
4,Cleo Abram,"Robots made of spiders (yes, really)",I can hear the hairs standing up on my wife’s ...,chancellor9000


### 构建计数变量

In [7]:
# Build the per-author comment count feature: look up, for every row, how many
# rows in the frame share that row's author.
author_frequency = testdf['Comment Author'].value_counts()
testdf['Comment Author Counts'] = testdf['Comment Author'].map(author_frequency)
print(testdf.shape)
print(testdf.isnull().sum())
testdf.head()


(379032, 5)
User                     0
Video Title              0
Comment (Actual)         0
Comment Author           0
Comment Author Counts    0
dtype: int64


Unnamed: 0,User,Video Title,Comment (Actual),Comment Author,Comment Author Counts
0,Cleo Abram,"Robots made of spiders (yes, really)",zombie spider!! bomb the damn lab before it's ...,Bagus Hutomo,1
1,Cleo Abram,"Robots made of spiders (yes, really)","This is way less cool than it seems, spiders a...",CMZ neu,2
2,Cleo Abram,"Robots made of spiders (yes, really)",Spiders see this and this is why they made the...,Kiana Marrie,1
3,Cleo Abram,"Robots made of spiders (yes, really)",you looks pretty 😍,Noob,6
4,Cleo Abram,"Robots made of spiders (yes, really)",I can hear the hairs standing up on my wife’s ...,chancellor9000,1


### 去除特殊符号

In [8]:
# NOTE(review): disabled regex-based punctuation stripper, kept for reference only.
# It removes every character that is neither a word character nor whitespace.
# `re` is not among the imports visible at the top of the notebook, so add
# `import re` before re-enabling this cell.
# def remove_punctuation(text):
#     pattern = re.compile(r'[^\w\s]')
#     return re.sub(pattern, '', text)

# testdf['Comment (Actual)'] = testdf['Comment (Actual)'].apply(remove_punctuation)

# testdf.head()

## 1.2 文本预处理（参考 GPT 给出的做法）

In [9]:
# Stop-word list and stemmer shared by the text preprocessing below.
# NLTK distributes its corpora separately from the package, so fetch the
# stop-word list on first use; this keeps the notebook runnable on a machine
# where the data has not been downloaded yet.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# word_tokenize additionally needs the Punkt tokenizer data.
try:
    word_tokenize("probe")
except LookupError:
    # The resource name differs between NLTK releases ('punkt' vs 'punkt_tab').
    # Request both: nltk.download reports an unknown name without raising.
    nltk.download('punkt')
    nltk.download('punkt_tab')

stemmer = PorterStemmer()

# Tokenization + stop-word removal. Requires the NLTK stop-word and tokenizer
# data to be downloaded beforehand.

# Translation table that deletes every ASCII punctuation character. Built once
# here rather than on every call, because the function runs once per comment.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def text_segmentation(text):
    """Normalize one comment into a space-joined string of stemmed tokens.

    Steps, in order: delete ASCII punctuation, drop every non-ASCII character,
    tokenize with NLTK, lowercase, remove English stop words, Porter-stem.

    Args:
        text: Raw comment text. A non-string value (for example the float NaN
            pandas uses for a missing cell) is treated as an empty comment.

    Returns:
        The surviving stemmed tokens joined by single spaces, or "" when
        nothing is left after cleaning.
    """
    # Missing cells arrive as float NaN, which has no .translate(); treat them
    # as empty instead of crashing a long-running .apply().
    if not isinstance(text, str):
        return ""

    # Remove punctuation marks.
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove emojis. Note this drops *all* non-ASCII characters, so accented
    # letters and non-Latin scripts are removed as well.
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Nothing left to tokenize: skip the tokenizer and stemmer entirely.
    if not text.strip():
        return ""

    # Lowercase each token once, filter stop words, then stem what remains.
    lowered = (token.lower() for token in word_tokenize(text))
    return " ".join(stemmer.stem(token) for token in lowered if token not in stop_words)

# Run the normalization over every comment and store the result as a new column.
segmented_comments = testdf["Comment (Actual)"].apply(text_segmentation)
testdf["Comment Seg"] = segmented_comments
testdf.head()

KeyboardInterrupt: 

## 1.3 保存

In [20]:
# Reload the cached preprocessing result instead of re-running the slow
# segmentation step. To refresh the cache after re-running that step:
# testdf.to_csv("preProcessdata.csv", index=False)
df = pd.read_csv("preProcessdata.csv")

# Some rows come back with a missing "Comment Seg" (presumably comments whose
# segmentation was an empty string, which read_csv parses as NaN by default
# -- TODO confirm). They carry no text to embed, so report how many there are
# and drop them.
empty_seg_count = int(df["Comment Seg"].isnull().sum())
print(f"Dropping {empty_seg_count} rows with missing 'Comment Seg'")
df = df.dropna(subset=['Comment Seg'])
# End on the preview so the cell actually renders it.
df.head()


# 2. Bert生成语句向量

这里采用的方法（下文称“第二种方法”）是对BERT模型的输出做池化（pooling）：对BERT模型最后一层的输出执行池化操作，得到整段文本的向量表示。

具体地说，在BERT模型中，每个输入序列（例如单个句子或多个句子）都被编码为一个向量序列：序列中的每个位置对应一个维度固定的向量。对于每个位置 $i$，BERT模型会输出一个大小为 hidden_size 的向量 $h_i$，其中 hidden_size 是预训练模型的隐藏状态大小（bert-base-uncased 为 768）。

因此，我们可以将BERT模型的输出视为一个形状为(batch_size, seq_len, hidden_size)的张量，其中batch_size是批次大小，seq_len是最大序列长度，hidden_size是模型的隐藏状态大小。

在第二种方法中，为了将整个输入文本表示为单个向量，可以通过在最后一层隐藏状态上执行平均池化（mean pooling）或最大池化（max pooling）等操作来组合所有单词向量。常用的是平均池化，即将所有单词向量的值相加并除以总数，得到一个平均向量作为文本的向量表示。这样得到的向量就可以代表输入文本，并输入到下游任务中进行分类、聚类等操作。

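需要注意的是，BERT模型能处理的序列长度有上限：bert-base-uncased 最多支持 512 个 token（包含 [CLS]、[SEP] 等特殊标记），超出的部分会被截断（下面的代码通过 max_length=512、truncation=True 实现）。因此在使用第二种方法时，可以在这个上限之内保留较长的序列，以充分利用BERT模型的能力。同时，还需要根据具体情况选择合适的池化方式和向量维度大小。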

In [21]:
from transformers import BertTokenizer, BertModel
import torch
# Load the tokenizer and the encoder weights from the same pretrained
# checkpoint so that vocabulary and embeddings match.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertModel.from_pretrained(checkpoint_name, output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# Embed every preprocessed comment as the mean of BERT's last-layer token vectors.
text_column = df['Comment Seg']
# Cast defensively: a value that read_csv parsed as a number is not a string
# and would be rejected by the tokenizer.
texts = text_column.astype(str).tolist()

# Encode comments in mini-batches instead of one forward pass per row; with
# hundreds of thousands of rows the per-call overhead dominates otherwise.
EMBED_BATCH_SIZE = 32

pooled_batches = []
with torch.no_grad():  # inference only, no gradients needed
    for start in range(0, len(texts), EMBED_BATCH_SIZE):
        encoded = tokenizer(
            texts[start:start + EMBED_BATCH_SIZE],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding=True,          # pad to the longest comment in this batch
            return_tensors='pt',
        )
        # Only the last layer is pooled, so skip returning every layer's
        # hidden states for this call; index 0 is the last-layer output.
        embeddings = model(**encoded, output_hidden_states=False)[0]
        # Average over real tokens only: padded positions are zeroed out and
        # excluded from the divisor, so each row matches what an unpadded,
        # single-comment forward pass would yield (up to float rounding).
        mask = encoded['attention_mask'].unsqueeze(-1).to(embeddings.dtype)
        pooled = (embeddings * mask).sum(dim=1) / mask.sum(dim=1)
        pooled_batches.append(pooled.numpy())

# One row per comment; an empty input yields an empty (0, hidden_size) matrix
# instead of making np.vstack raise.
if pooled_batches:
    vectors = np.vstack(pooled_batches)
else:
    vectors = np.empty((0, model.config.hidden_size), dtype=np.float32)
