In [4]:
import pandas as pd 
import numpy as np
import re

In [5]:
path_train = r'/home/kesci/input/qiu_assignment7108/train.tsv'
path_test = r'/home/kesci/input/qiu_assignment7108/test.tsv'

In [6]:
train = pd.read_csv(path_train,sep = '\t')
test = pd.read_csv(path_test,sep = '\t')

In [7]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [8]:
corpus = train['Phrase'].tolist()

In [9]:
#大小写转换，去除标点符号
for i in range(len(corpus )):
    corpus [i] = corpus [i].lower()
    corpus [i] = re.sub(r'\W',' ',corpus [i])
    corpus [i] = re.sub(r'\s+',' ',corpus [i])

In [10]:
corpus[1:3]

['a series of escapades demonstrating the adage that what is good for the goose',
 'a series']

In [11]:
Y = train['Sentiment']

In [12]:
set(Y)

{0, 1, 2, 3, 4}

In [13]:
##分词
def tokenize(sentences, token='word'):
    """Split sentences into word or char tokens"""
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        return [list(sentence) for sentence in sentences]
    else:
        print('ERROR: unkown token type '+token)

In [14]:
df = tokenize(corpus)

In [15]:
df[22:25]

[['good'], ['for', 'the', 'goose'], ['for']]

In [16]:
##词频

##创建词频字典
wordfreq = {}
for tokens in  df:
    for token in tokens:
        if token not in wordfreq.keys():
            wordfreq[token] = 1
        else:
            wordfreq[token] += 1

In [17]:
##挑选最常出现的200个
import heapq
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)

In [18]:
##对每一个评论建立词向量
sentence_vectors = []
for tokens in df:
    sent_vec = []
    for token in most_freq:
        if token in tokens:
            sent_vec.append(1)
        else:
            sent_vec.append(0)
    sentence_vectors.append(sent_vec)

In [19]:
#生成每个单词在每个文本的tf值
word_idf_values = {}
for token in most_freq:
    doc_containing_word = 0
    for document in df:
        if token in document:
            doc_containing_word += 1
    word_idf_values[token] = np.log(len(corpus)/(1 + doc_containing_word))

In [20]:
word_tf_values = {}#每个单词在每个句子中的tf值
for token in most_freq:
    sent_tf_vector = []
    for document in df:
        doc_freq = 0
        for word in document:
            if token == word:
                  doc_freq += 1
        word_tf = doc_freq/len(document)
        sent_tf_vector.append(word_tf)
    word_tf_values[token] = sent_tf_vector

In [21]:
tfidf_values = []
for token in word_tf_values.keys():
    tfidf_sentences = []
    for tf_sentence in word_tf_values[token]:
        tf_idf_score = tf_sentence * word_idf_values[token]
        tfidf_sentences.append(tf_idf_score)
    tfidf_values.append(tfidf_sentences)

In [22]:
len(tfidf_values)

200

In [23]:
tfidf_values_np = np.array(tfidf_values)

In [24]:
tfidf_values_np.shape

(200, 156060)

### 分割数据集

In [25]:
#选择三七分
train_size = 156060 * 0.7
test_size = 156060 * 0.3

print('train_size is : {},test size is {}'.format(train_size,test_size))

train_size is : 109242.0,test size is 46818.0


In [31]:
train_X = tfidf_values_np[:,:109242]
test_X = tfidf_values_np[:,109242:]
train_Y = Y[:109242]
test_Y = Y[109242:]

print(train_X.shape,len(train_Y))

(200, 109242) 109242


In [59]:
batch_num = 109242//64
batch_num

1706

### 模型训练

In [28]:
#定义accuracy以及softmax计算公式
def getAccuracy(y_hat, y):
        return sum((y_hat_i.argmax(axis=1) == y_i) for y_i, y_hat_i in zip(list(y.T), list(y_hat.T))) / len(list(y.T))
def softmax(x):
    exp_x = np.exp(x)
    partition = np.sum(exp_x,axis = 0) #按列相加
    return exp_x / partition     

In [29]:
#定义梯度计算公式
def partial_w(x, y, y_hat):
    n = len(y)

    gradient = 0
    
    for x_i, y_i, y_hat_i in zip(list(x.T), list(y.T), list(y_hat.T)):
        loss = y_i.reshape(outputs,1) - y_hat_i.reshape(outputs,1)
        gradient += loss * x_i.reshape(inputs,1).T
    return -1 / n * gradient

def partial_b(x, y, y_hat):
    n = len(y)

    gradient = 0
    
    for y_i, y_hat_i in zip(list(y.T), list(y_hat.T)):
        gradient += y_i.reshape(outputs,1) - y_hat_i.reshape(outputs,1)
    
    return -1 / n * gradient

In [33]:
#先用一个小批量进行训练测试
small_X = train_X[:,:64]
small_Y = train_Y[:64]
trying_times = 2000
max_accuracy = 0
outputs =  5  #5个输出类别
inputs = 200    
X = small_X
W = np.random.normal(0,0.1,(outputs,inputs)) 
b = np.zeros(outputs).reshape(outputs,1)
y = small_Y
y_mat = np.eye(outputs)[Y].T

learning_rate = 1e-04


update_time = 0

for i in range(trying_times):
    
    y_hat = softmax(np.dot(W,X) + b) 
    def getAccuracy(y_hat, y):
        return sum((y_hat_i.argmax(axis=0) == y_i) for y_i, y_hat_i in zip(list(y), list(y_hat.T))) / len(list(y.T))
    
    current_accuracy = getAccuracy(y_hat, y)
    if current_accuracy > max_accuracy:
        max_accuracy = current_accuracy
    
    if i % 50 == 0: 
          print('When time is : {},accuracy is : {}'.format(i, max_accuracy))

    #计算梯度
    w_gradient = partial_w(X,y_mat,y_hat)
    
    b_gradient = partial_b(X, y_mat, y_hat )
    
    #更新梯度
    W = W + (-1 * w_gradient) * learning_rate

    b = b + (-1 * b_gradient) * learning_rate

When time is : 0,accuracy is : 0.15625
When time is : 50,accuracy is : 0.453125
When time is : 100,accuracy is : 0.546875
When time is : 150,accuracy is : 0.609375
When time is : 200,accuracy is : 0.671875
When time is : 250,accuracy is : 0.75
When time is : 300,accuracy is : 0.796875
When time is : 350,accuracy is : 0.859375
When time is : 400,accuracy is : 0.859375
When time is : 450,accuracy is : 0.859375
When time is : 500,accuracy is : 0.859375
When time is : 550,accuracy is : 0.859375
When time is : 600,accuracy is : 0.859375
When time is : 650,accuracy is : 0.859375
When time is : 700,accuracy is : 0.875
When time is : 750,accuracy is : 0.875
When time is : 800,accuracy is : 0.890625
When time is : 850,accuracy is : 0.890625
When time is : 900,accuracy is : 0.890625
When time is : 950,accuracy is : 0.890625
When time is : 1000,accuracy is : 0.890625
When time is : 1050,accuracy is : 0.890625
When time is : 1100,accuracy is : 0.890625
When time is : 1150,accuracy is : 0.890625
Wh

In [60]:
#正式训练
epochs = 5
max_accuracy = 0
outputs =  5  #5个输出类别
inputs = 200    
W = np.random.normal(0,0.1,(outputs,inputs)) 
b = np.zeros(outputs).reshape(outputs,1)
y_mat = np.eye(outputs)[Y].T

learning_rate = 1e-04
for i in range(epochs):
    start =0
    end = 64
    for j in range(batch_num):
        X = train_X[:,start:end]
        y = train_Y[start:end]
        y_hat = softmax(np.dot(W,X) + b) 
        current_accuracy = getAccuracy(y_hat, y)
        if current_accuracy > max_accuracy:
            max_accuracy = current_accuracy
        
        #计算梯度
        w_gradient = partial_w(X,y_mat,y_hat)
        
        b_gradient = partial_b(X, y_mat, y_hat )
        
        #更新梯度
        W = W + (-1 * w_gradient) * learning_rate
    
        b = b + (-1 * b_gradient) * learning_rate
        
        start += 64
        end += 64
    print('When epoch is : {},accuracy is : {}'.format(i, max_accuracy))

When epoch is : 0,accuracy is : 0.890625
When epoch is : 1,accuracy is : 0.890625
When epoch is : 2,accuracy is : 0.890625
When epoch is : 3,accuracy is : 0.890625
When epoch is : 4,accuracy is : 0.890625
