In [4]:
# 数据分析/处理
import numpy as np
import pandas as pd
import re

# 搭建神经网络
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch import optim
from torch.utils.data import Dataset,DataLoader

# 数据可视化
import matplotlib.pyplot as plt
import warnings

# word2vec
from gensim.models import Word2Vec


# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/syntax warnings raised by later cells.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [5]:
# Detect CUDA once, then derive the compute device from the result
cuda_available = torch.cuda.is_available()
if cuda_available:
    device = torch.device("cuda")
    print("CUDA Device Name:", torch.cuda.get_device_name(0))
    print("CUDA Compute Capability:", torch.cuda.get_device_capability(0))
else:
    device = torch.device("cpu")
# "The answer to the universe": seed NumPy and PyTorch with the same value
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x2638c2bb130>

### 数据集  
这里使用的数据集是斯坦福大学提供的SNLI数据集，该数据集同时提供两种文件格式，任选其一即可：  
+ json
+ txt

在本数据集中，两类文件部分内容有差异，但是我们所需要的两个句子和标签在两类文件中是完全一样的。也就是说，我们仅需要载入json或者txt两种格式中的一种来进行训练。

In [6]:
# Alternative: load the tab-separated text release instead of the JSON-lines one
# train=pd.read_csv("snli_1.0/snli_1.0_train.txt",delimiter='\t')
# Load the JSON-lines release (one JSON object per line).
# Forward slashes are used because "\s" is an invalid string escape and a
# backslash-separated path only resolves on Windows.
train=pd.read_json("snli_1.0/snli_1.0_train.jsonl",lines=True)
# Keep only the annotator labels and the two sentences of each pair
train=pd.concat([train["annotator_labels"],train["sentence1"],train["sentence2"]],axis=1)
train.head()

Unnamed: 0,annotator_labels,sentence1,sentence2
0,[neutral],A person on a horse jumps over a broken down a...,A person is training his horse for a competition.
1,[contradiction],A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
2,[entailment],A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."
3,[neutral],Children smiling and waving at camera,They are smiling at their parents
4,[entailment],Children smiling and waving at camera,There are children present


In [7]:
# Load the dev split from the JSON-lines release (one JSON object per line).
# Forward slashes are used because "\s" is an invalid string escape and a
# backslash-separated path only resolves on Windows.
dev=pd.read_json("snli_1.0/snli_1.0_dev.jsonl",lines=True)
# Keep only the annotator labels and the two sentences of each pair
dev=pd.concat([dev["annotator_labels"],dev["sentence1"],dev["sentence2"]],axis=1)
dev.head()

Unnamed: 0,annotator_labels,sentence1,sentence2
0,"[neutral, entailment, neutral, neutral, neutral]",Two women are embracing while holding to go pa...,The sisters are hugging goodbye while holding ...
1,"[entailment, entailment, entailment, entailmen...",Two women are embracing while holding to go pa...,Two woman are holding packages.
2,"[contradiction, contradiction, contradiction, ...",Two women are embracing while holding to go pa...,The men are fighting outside a deli.
3,"[entailment, entailment, entailment, entailmen...","Two young children in blue jerseys, one with t...",Two kids in numbered jerseys wash their hands.
4,"[neutral, neutral, neutral, entailment, entail...","Two young children in blue jerseys, one with t...",Two kids at a ballgame wash their hands.


In [8]:
# Load the test split from the JSON-lines release (one JSON object per line).
# Forward slashes are used because "\s" is an invalid string escape and a
# backslash-separated path only resolves on Windows.
test=pd.read_json("snli_1.0/snli_1.0_test.jsonl",lines=True)
# Keep only the annotator labels and the two sentences of each pair
test=pd.concat([test["annotator_labels"],test["sentence1"],test["sentence2"]],axis=1)
test.head()

Unnamed: 0,annotator_labels,sentence1,sentence2
0,"[neutral, contradiction, contradiction, neutra...",This church choir sings to the masses as they ...,The church has cracks in the ceiling.
1,"[entailment, entailment, entailment, neutral, ...",This church choir sings to the masses as they ...,The church is filled with song.
2,"[contradiction, contradiction, contradiction, ...",This church choir sings to the masses as they ...,A choir singing at a baseball game.
3,"[neutral, neutral, neutral, neutral, neutral]","A woman with a green headscarf, blue shirt and...",The woman is young.
4,"[entailment, entailment, contradiction, entail...","A woman with a green headscarf, blue shirt and...",The woman is very happy.


### tokenizer

In [11]:
# from task1.ipynb
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, reading it only once."""
    cache = getattr(_english_stop_words, "_cache", None)
    if cache is None:
        cache = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cache
    return cache


def tokenization_to_ngram(sentence,n=1,stop_words=None):
    '''
    Tokenize a sentence, drop stop words and return n-gram features.

    Every non-word character is replaced by a space, the text is split on
    whitespace, stop words are removed and the surviving words are lower-cased.

    Parameters
    ----------
    sentence : str
        Raw sentence text.
    n : int, default 1
        Size of the n-gram. With n == 1 the filtered word list itself is returned.
    stop_words : collection of str, optional
        Words to discard. When omitted, NLTK's English stop-word list is used;
        it is loaded on the first call and reused afterwards instead of being
        re-read for every sentence.

    Returns
    -------
    list of str
        For n == 1 the filtered, lower-cased words. For n > 1 each item is n
        consecutive words concatenated WITHOUT a separator, so that an n-gram
        can be treated as a single token. If fewer than n words remain, a
        one-element list holding all remaining words concatenated is returned
        (this is [''] when nothing remains).

    Raises
    ------
    ValueError
        If n is smaller than 1.
    '''
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    if stop_words is None:
        stop_words = _english_stop_words()
    # Raw string: "\w" in a normal string literal is an invalid escape sequence
    words = re.sub(r"[^\w]", " ", sentence).split()
    # NOTE(review): membership is tested BEFORE lower-casing, so capitalised stop
    # words such as "A" or "The" are kept (and emitted as 'a' / 'the'). This
    # order is preserved on purpose because the vocabulary built from these
    # tokens depends on it; confirm whether the filter should be
    # case-insensitive before changing it.
    filtered_sentence = [w.lower() for w in words if w not in stop_words]
    if n == 1:
        return filtered_sentence
    if len(filtered_sentence) < n:
        # Fewer words than n: glue everything that is left into one token
        return ["".join(filtered_sentence)]
    # Slide a window of n words over the sentence and glue each window together
    return ["".join(filtered_sentence[i:i + n])
            for i in range(len(filtered_sentence) - n + 1)]


In [12]:
# Tokenise both sentence columns of every split and gather the token lists in
# one flat corpus, ordered train -> dev -> test with sentence1 before sentence2.
sentence_list = [
    tokens
    for frame in (train, dev, test)
    for column in ("sentence1", "sentence2")
    for tokens in frame[column].apply(tokenization_to_ngram, n=1).to_list()
]
sentence_list[0]

['a', 'person', 'horse', 'jumps', 'broken', 'airplane']

In [13]:
# Number of tokenised sentences gathered in sentence_list
len(sentence_list)

1140304

In [14]:
# Word2Vec has to be trained on the tokenised sentences before any word vector
# can be looked up. sg=1 selects the skip-gram objective; vectors have 50 dims.
model1 = Word2Vec(
    sentences=sentence_list,
    sg=1,
    vector_size=50,
)
model1.vector_size

50

In [16]:
# Longest tokenised sentence in the corpus (0 when the corpus is empty)
maxSenLen = max(map(len, sentence_list), default=0)
print(maxSenLen)

46


In [17]:
def sen2word2vec(sentence_list,model=None,maxlen=None):
    """
    Convert a tokenised sentence into a fixed-size matrix of word vectors.

    Parameters
    ----------
    sentence_list : iterable of str
        Tokens of ONE sentence (despite the name).
    model : optional
        Object exposing `vector_size` and `wv`, where `wv` supports
        `token in wv` and `wv[token]`. Defaults to the notebook-level `model1`,
        looked up at call time so that a retrained model is picked up without
        re-defining this function.
    maxlen : int, optional
        Number of rows of the result. Defaults to the notebook-level
        `maxSenLen`, also looked up at call time.

    Returns
    -------
    numpy.ndarray
        Array of shape (maxlen, model.vector_size). Row i holds the vector of
        token i. Tokens missing from `model.wv` and unused trailing rows stay
        zero. Tokens beyond position `maxlen` are dropped instead of raising
        IndexError.
    """
    if model is None:
        model = model1
    if maxlen is None:
        maxlen = maxSenLen
    vecList=np.zeros((maxlen,model.vector_size))
    word_vectors=model.wv
    # zip with range(maxlen) truncates over-long sentences to the matrix height
    for i,e in zip(range(maxlen),sentence_list):
        if e in word_vectors:
            vecList[i]=word_vectors[e]
    return vecList

# Quick sanity check on a three-token sentence
# ('kjell' is presumably out-of-vocabulary - verify against the trained model)
s=['a','good','kjell']
# print(model1.wv[s])
p=sen2word2vec(s,model1)
# Element-wise comparison: row 0 of the matrix vs. the model's vector for 'a'
print(p[0]==model1.wv['a'])

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]
