In [28]:
# 数据分析/处理
import numpy as np
import pandas as pd
import re

# 搭建神经网络
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch import optim
from torch.utils.data import Dataset,DataLoader

# 数据可视化
import matplotlib.pyplot as plt
import warnings

# word2vec
from gensim.models import Word2Vec


warnings.filterwarnings('ignore')
%matplotlib inline

In [29]:
# 验证cuda是否可用
cuda_available=torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
if cuda_available:
    print("CUDA Device Name:", torch.cuda.get_device_name(0))
    print("CUDA Compute Capability:", torch.cuda.get_device_capability(0))

CUDA Device Name: NVIDIA GeForce RTX 2080 Ti
CUDA Compute Capability: (7, 5)


In [30]:
# 宇宙的答案
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f964fad1790>

### 关于词嵌入（Word Embedding）

词嵌入（Word Embedding）指的是根据词汇在文本中的上下文将词汇转化为稠密化的向量（也称为分布式表示）表示的一类算法。之所以称这种技术为嵌入（Embedding）是由于它可以将词汇表示在一个向量空间中，我们甚至可以使用这些表示（一般简称为词向量）进行一些带有语义的运算（例如：king-man+woman=queen，或计算余弦相似度）。最早的基于DL的词嵌入技术Word2Vec由传奇捷克NLP研究员Tomas Mikolov于2013年前后与Ilya、Jeff Dean等人于Google共同开发。

Word2Vec的基本假设就是：每个词汇的含义，取决于它**可能**出现的上下文（事后证明这个假设基本上是正确的）。Word2Vec突破了之前的各种词汇表示方法无法联系上下文的缺点,可以显示词之间的相似关系，且稠密的向量化的表示更适于计算和存储。但是Word2Vec仍然有一些缺点，例如由于反义词由于出现的语境比较接近，所以两个反义词在向量空间中比较接近，正常来讲一对反义词的词向量应该成一个平角才对。

GloVe则是斯坦福大学提出的针对Word2Vec的改进，相较于Word2Vec改进了面对生僻词时的等情况下的性能（尽管Mikolov仍然认为GloVe的效果逊于Word2Vec）。

根据原项目的要求，应当使用以下三种方式对词汇进行表示：
 + Word2Vec（对应原要求中的word embedding，这样处理是因为Word2Vec影响力太大，所以有时有人会用word embedding指代它）
 + 随机初始化
 + GloVe

然后搭建RNN和CNN进行分类。

欲了解更多可以参考以下资料：

[Word2Vec的介绍1](https://zhuanlan.zhihu.com/p/61635013)

[Word2Vec的介绍2](https://zhuanlan.zhihu.com/p/26306795)

[GloVe的介绍1](https://zhuanlan.zhihu.com/p/50946044)

[GloVe的介绍2](https://zhuanlan.zhihu.com/p/42073620)

[GloVe的介绍3](https://zh.gluon.ai/chapter_natural-language-processing/glove.html)

另外，Word2Vec和GloVe并不是唯一的一种分布式词向量表示方法。另一类常用的分布式词表示方法还有ELMo和Bert，这类方法可以通过考察词语所在的上下文语境动态的形成词向量。

### 载入数据\处理数据

对数据的分析同task1 

Word2Vec使用Gensim库内置的实现，在我们自己的语料库中进行训练

随机初始化使用torch原生的随机化Embedding

GloVe使用斯坦福原生的预训练权重


In [31]:
# 数据在task1文件架中
test=pd.read_csv("../task1/test.tsv", delimiter="\t").drop(columns=["PhraseId","SentenceId"])
train=pd.read_csv("../task1/train.tsv", delimiter="\t").drop(columns=["PhraseId","SentenceId"])
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Phrase     156060 non-null  object
 1   Sentiment  156060 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [32]:
maxSenLen=0
for i in train["Phrase"].to_list():
    maxSenLen=max(maxSenLen,len(i))
print(maxSenLen)

283


In [33]:
# from task1.ipynb
from nltk.corpus import stopwords

def tokenization_to_ngram(sentence,n=1):
    '''
    将句子转化为token,去除停止词,并返回用于n-gram语言建模的特征
    '''
    # stop_words = set(stopwords.words('english'))
    # print(type(sentence))
    words = re.sub("[^\w]", " ",  sentence).split() 
    filtered_sentence=[w.lower() for w in words ] #if not w in stop_words
    output=[]
    if(n!=1):
        if(len(filtered_sentence)<n):
            # 对于n>len(filtered_sentence)的情况，直接将句子中所有的词拼接
            tmp=''
            for i in range(len(filtered_sentence)):
                tmp=tmp+filtered_sentence[i]
            output.append(tmp)
            return output
        else:
            # 对于其他情况，将句子分解为n个词一份
            for i in range(len(filtered_sentence)-n+1):
                # 这一步是将n个单词拼在一起作为一个单词，这样的话可以视作一个单词，方便一会儿进行哈希
                tmp=filtered_sentence[i]
                for t in range(1,n):
                    tmp+=filtered_sentence[i+t]
                output.append(tmp)
            return output
    else:
        return filtered_sentence


In [34]:
# 将句子转化为列表
# sentence_list=pd.concat([train["Phrase"],test["Phrase"]],axis=0,ignore_index=True)\
#                 .apply(tokenization_to_ngram,n=1).to_list()
sentence_list=train["Phrase"].apply(tokenization_to_ngram,n=1).to_list()
sentence_list[0]

['a',
 'series',
 'of',
 'escapades',
 'demonstrating',
 'the',
 'adage',
 'that',
 'what',
 'is',
 'good',
 'for',
 'the',
 'goose',
 'is',
 'also',
 'good',
 'for',
 'the',
 'gander',
 'some',
 'of',
 'which',
 'occasionally',
 'amuses',
 'but',
 'none',
 'of',
 'which',
 'amounts',
 'to',
 'much',
 'of',
 'a',
 'story']

In [35]:
# test 数据集没有标签，我们使用将train分割成我们需要的大小
data,label=\
    train["Phrase"].apply(tokenization_to_ngram,n=1).to_list(),\
    train["Sentiment"].to_list()
test_size=int(0.1*len(data))
train_size=len(data)-test_size
train_data,train_label=data[:train_size],label[:train_size]
test_data,test_label=data[train_size:],label[train_size:]

### 建模  
使用CNN和RNN进行分类，使用Dropout防止过拟合

### RNN

参考：[pytorch官方文档](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)

In [36]:

class RNN(nn.Module):
    def __init__(self,hidden_size,out_size,d_model=50,num_layers=1,bid=False,dropout=0.1) -> None:
        super(RNN,self).__init__()
        # h0的维度
        self.hidden_size=hidden_size
        # 有几层，一般来说，有一层或者两层就足够了
        self.num_layers=num_layers
        # 是否是双向RNN
        self.bid=bid
        self.rnn=nn.RNN(d_model,hidden_size,
                        num_layers,batch_first=True,dropout=dropout,bidirectional=bid)
        self.dropout=nn.Dropout(dropout)
        self.project=nn.Sequential(
            nn.Linear(hidden_size,out_size),
            nn.Softmax()
            )
    def forward(self,X):
        # deft: self.bid*self.num_layers=1, batch_size=X.size(0) 
        if self.bid==True:
            h0=torch.zeros(2*self.num_layers,X.size(0),self.hidden_size).to(device)
        else:
            h0=torch.zeros(self.num_layers,X.size(0),self.hidden_size).to(device)
        X,_=self.rnn(X,h0)
        X=self.dropout(X)
        output=self.project(X[:,-1,:])
        return output 

In [37]:
# test
a=torch.ones(3,10,5).to(device)
rnn=RNN(5,6,5).to(device)
out=rnn(a)
print(out.size())

torch.Size([3, 6])


### CNN

参考：[TextCNN](https://arxiv.org/abs/1408.5882)

In [38]:
class TextCNN(nn.Module):
    # TextCNN only has 3 layers
    def __init__(self,hidden_size,out_size,d_model=50,kernel_size=[3,4,5],dropout=0.1) -> None:
        super(TextCNN,self).__init__()
        self.hidden_size=hidden_size
        self.d_model=d_model
        self.ConvBlocks=nn.ModuleList([
            nn.Conv2d(1,hidden_size,(i,d_model)) for i in kernel_size])
        self.project=nn.Sequential(
            nn.Linear(hidden_size*len(kernel_size),out_size),
            nn.Softmax()
            )
        
        self.dropout=nn.Dropout(dropout)
    
    def forward(self,X):
        # [batch,sentencelen,d_model]-->[batch,1,sentencelen,d_model]
        X=X.unsqueeze(1)
        pool_out=[]
        for conv in self.ConvBlocks:
            # [batch,1,sentencelen,d_model]-->[batch,hidden_size,sentencelen,1]    
            conv_out=F.relu(conv(X))
            # [batch,hidden_size,sentencelen,1]-->[batch,hidden_size,1,1]
            pool=F.max_pool2d(conv_out,(conv_out.size(2),1))
            # [batch,hidden_size,1,1]-->[batch,hidden_size,1]
            pool=pool.squeeze(3)
            pool_out.append(pool)
        # [batch,hidden_size,1]-->[batch,hidden_size*3,1]
        X=torch.cat(pool_out,dim=1)
        # just to make sure the size of tensor will match
        X=X.squeeze(2)
        X=self.dropout(X)
        output=self.project(X)
        return output

In [39]:
# test
a=torch.ones(3,10,5).to(device)
cnn=TextCNN(3,4,5).to(device)
out=cnn(a)
print(out.size())

torch.Size([3, 4])


### Word2Vec

In [40]:
# 词向量维度为50维,sg=1表示使用skip-gram
model1=Word2Vec(sentence_list,vector_size=50,sg=1)
model1.vector_size
# model1.save("Word2Vec.task2.model")

50

In [41]:
def sen2word2vec(sentence_list,model=model1,maxlen=maxSenLen):
    vecList=np.zeros((maxlen,model.vector_size)) 
    if(sentence_list==[]):
        return vecList 
    # 将列表句子转化为稠密词向量句子
    # vecList=np.array([])
    for i,e in enumerate(sentence_list):
        if(e in model.wv):
            vecList[i]=model.wv[e]
    return vecList

s=['a','good','kjell']
# print(model1.wv[s])
p=sen2word2vec(s,model1)
print(p[0]==model1.wv['a'])

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]


### 随机向量

In [42]:
# 需要确定字典中词的个数
# 较为简单的处理方法，直接设定一个较大的数字作为估计
v_size=20000
# 随后需要对每个单词都进行编码

In [43]:
class RandomEmbedding(nn.Module):
    def __init__(self,vocab_size=v_size,d_model=50,sentence_len=maxSenLen) -> None:
        super(RandomEmbedding,self).__init__()
        self.senLen=sentence_len
        self.d_model=d_model
        self.embedding=nn.Embedding(vocab_size,d_model)
        # self.embedding.weight.data.normal_(mean=0,std=1) 

    def forward(self,X):
        # context=self.embedding(X)
        vecList=torch.zeros((self.senLen,self.d_model))
        if(X==[]):
            return vecList 
        for i,w in enumerate(X):
            vecList[i]=self.embedding[w]
        return vecList 
        # if(context.size(0)<self.senLen):
        # l,_=context.size(0)
        # if(l==self.senLen):
        #     return context
        # else:
        #     for i in range(self.senLen-l):
        #         z=torch.zeros((1,self.d_model))
        #         context=torch.cat((context,z),dim=0)
        #     return context
  

### GloVe

In [44]:
import torchtext

def sen2GloVe(sentence_list,model='6B',d_model=50,maxlen=maxSenLen):
    g=torchtext.vocab.GloVe(model,d_model)
    vecList=torch.zeros((maxlen,d_model))
    if(sentence_list==[]):
        return vecList
    for i,w in enumerate(sentence_list):
        vecList[i]=g[w]
    return vecList
    
s=['a','good','flim']
p=sen2GloVe(s)
print(p.shape)


.vector_cache/glove.6B.zip: 0.00B [00:00, ?B/s]

.vector_cache/glove.6B.zip:   0%|          | 3.45M/862M [02:07<8:48:24, 27.1kB/s]  


KeyboardInterrupt: 

### 数据集类

In [None]:
class MyDateset(Dataset):
    def __init__(self,data,label,model='w') -> None:
        '''
        model='w'/'r'/'g',means word2vec/random/GloVe
        '''
        super().__init__()
        self.data=data
        self.label=label
        self.model=model
        if(self.model=='r'):
            self.randvec=RandomEmbedding()
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        sentence=self.data[index]
        label=self.label[index]
        # 这不是一种高效率的写法，但是在这里使用的话还是可以的
        if(self.model=='w'):
            return sen2word2vec(sentence),label
        elif(self.model=='r'):
            return self.randvec(sentence),label
        elif(self.model=='g'):
            return sen2GloVe(sentence),label

In [None]:

def eval_model(model,test):
    model.eval()
    t=0
    for x,y in test:
        x=x.float()
        output=model(x)
        if(y==output.argmax()):
            t+=1
    print('Eval, acc: {:.4f}'.format(t/test.__len__()))        

In [None]:
def TrainModel(model,train,epoch=3):
    model.train()
    loss_fn=nn.CrossEntropyLoss()
    optimizer=optim.SGD(model.parameters(),lr=0.00001)
    for i in range(epoch):
        for x,y in train:
            x,y=x.to(device),y.to(device)
            x=x.float()
            output=model(x)
            loss=loss_fn(output,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Epoch [{}/{}], Loss: {:.4f}'.format(i+1, epoch, loss.item()))
    return model

In [None]:
# config
# r,g
embedding_models='g'
batch_size=64
epoch=3

In [None]:
train_dataset=MyDateset(train_data,train_label,model=embedding_models)
train_dataloader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
test_dataset=MyDateset(test_data,test_label,model=embedding_models)
test_dataloader=DataLoader(test_dataset,batch_size=1)

In [None]:
# RNN
# rnn=RNN(256,5)
# rnn=TrainModel(rnn,train_dataloader,epoch=epoch)
# eval_model(rnn,test_dataloader,epoch=epoch)


In [51]:
# CNN
cnn=TextCNN(256,5).to(device)
cnn=TrainModel(cnn,train_dataloader,epoch=epoch)
eval_model(cnn,test_dataloader)

Epoch [1/3], Loss: 1.6089
Epoch [2/3], Loss: 1.6002
Epoch [3/3], Loss: 1.6058
Eval, acc: 0.2655
