# 用TFLearn进行电影评价的情感分析



In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical

In [6]:
reviews = pd.read_csv('reviews.txt', header=None)
labels = pd.read_csv('labels.txt', header=None)
reviews.head(10)

Unnamed: 0,0
0,bromwell high is a cartoon comedy . it ran at ...
1,story of a man who has unnatural feelings for ...
2,homelessness or houselessness as george carli...
3,airport starts as a brand new luxury pla...
4,brilliant over acting by lesley ann warren . ...
5,this film lacked something i couldn t put my ...
6,this is easily the most underrated film inn th...
7,sorry everyone i know this is supposed to b...
8,this is not the typical mel brooks film . it w...
9,when i was little my parents took me along to ...


In [8]:
from collections import Counter

total_counts = Counter()
for i, row in (reviews.iterrows()):
    total_counts.update(row[0].split(" "))

print("Total words in data set: ", len(total_counts))

Total words in data set:  74074


##  取total_counts的前10000个

In [9]:
vocab = sorted(total_counts, key=total_counts.get, reverse=True)[:10000]
print(vocab[:60])

['', 'the', '.', 'and', 'a', 'of', 'to', 'is', 'br', 'it', 'in', 'i', 'this', 'that', 's', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'you', 'on', 't', 'not', 'he', 'are', 'his', 'have', 'be', 'one', 'all', 'at', 'they', 'by', 'an', 'who', 'so', 'from', 'like', 'there', 'her', 'or', 'just', 'about', 'out', 'if', 'has', 'what', 'some', 'good', 'can', 'more', 'she', 'when', 'very', 'up', 'time', 'no']


What's the last word in our vocabulary? We can use this to judge if 10000 is too few. If the last word is pretty common, we probably need to keep more words.

In [10]:
print(vocab[-1], ': ', total_counts[vocab[-1]])

fulfilled :  30


### 将词袋转换为向量

In [75]:
word2idx = {word : i for i, word in enumerate(vocab)}


### 文本转换为向量


In [76]:
def text_to_vector(text):
    te2ve = np.zeros(len(vocab), dtype=np.int_)
    for word in text.split(" "):
        id = word2idx.get(word, None)
        if id == None:
            continue
        else:
            te2ve[id] += 1
                     
    return te2ve
                

If you do this right, the following code should return

```
text_to_vector('The tea is for a party to celebrate '
               'the movie so she has no time for a cake')[:65]
                   
array([0, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0])
```       

In [77]:
text_to_vector('The tea is for a party to celebrate '
               'the movie so she has no time for a cake')[:65]


array([0, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0])

### 将所有reviews转化为词袋向量

In [80]:
word_vectors = np.zeros((len(reviews), len(vocab)), dtype=np.int_)
for ii, (_, text) in enumerate(reviews.iterrows()):
    word_vectors[ii] = text_to_vector(text[0])

In [82]:
# Printing out the first 5 word vectors
word_vectors[:5, :23]

array([[ 18,   9,  27,   1,   4,   4,   6,   4,   0,   2,   2,   5,   0,
          4,   1,   0,   2,   0,   0,   0,   0,   0,   0],
       [  5,   4,   8,   1,   7,   3,   1,   2,   0,   4,   0,   0,   0,
          1,   2,   0,   0,   1,   3,   0,   0,   0,   1],
       [ 78,  24,  12,   4,  17,   5,  20,   2,   8,   8,   2,   1,   1,
          2,   8,   0,   5,   5,   4,   0,   2,   1,   4],
       [167,  53,  23,   0,  22,  23,  13,  14,   8,  10,   8,  12,   9,
          4,  11,   2,  11,   5,  11,   0,   5,   3,   0],
       [ 19,  10,  11,   4,   6,   2,   2,   5,   0,   1,   2,   3,   1,
          0,   0,   0,   3,   1,   0,   1,   0,   0,   0]])

### 得到训练集和测试集

In [83]:
Y = (labels=='positive').astype(np.int_)
records = len(labels)

shuffle = np.arange(records)
np.random.shuffle(shuffle)
test_fraction = 0.9

train_split, test_split = shuffle[:int(records*test_fraction)], shuffle[int(records*test_fraction):]
trainX, trainY = word_vectors[train_split,:], to_categorical(Y.values[train_split], 2)
testX, testY = word_vectors[test_split,:], to_categorical(Y.values[test_split], 2)

In [84]:
trainY

array([[ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.],
       ..., 
       [ 1.,  1.],
       [ 1.,  1.],
       [ 1.,  1.]])

# 搭建神经网络

In [88]:
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    
    #### Your code ####
    net = tflearn.input_data([None, 10000])
    
    net = tflearn.fully_connected(net, 200, activation='ReLU')
    net = tflearn.fully_connected(net, 25, activation='ReLU')
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    return model


## 初始化模型

In [89]:
model = build_model()

## 训练网络

In [90]:
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=10)

Training Step: 1589  | total loss: [1m[32m1.38630[0m[0m | time: 6.537s
| SGD | epoch: 010 | loss: 1.38630 - acc: 0.5302 -- iter: 20224/20250
Training Step: 1590  | total loss: [1m[32m1.38630[0m[0m | time: 7.594s
| SGD | epoch: 010 | loss: 1.38630 - acc: 0.5303 | val_loss: 1.38630 - val_acc: 0.5547 -- iter: 20250/20250
--


## 进行测试

In [91]:
predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.54
