In [1]:
import re

### tokenize

In [2]:
# Characters deleted outright (so "don't" becomes "dont"): backslash,
# apostrophe and double quote.
_REMOVED_CHARS = str.maketrans("", "", "\\'\"")

# Punctuation, tabs and newlines are turned into spaces so that they act as
# word separators. Built once here instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {c: " " for c in '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'}
)


def clean_text(text):
    """
    Applies some pre-processing on the given text.

    Steps:
    - Replacing HTML tags with a space
    - Deleting backslashes, apostrophes and double quotes
    - Stripping surrounding whitespace and lowering text
    - Replacing the remaining punctuation, tabs and newlines with spaces

    Input:
    text: the raw string to clean.

    Returns:
    the cleaned, lower-cased string. Words are separated by one or more
    spaces, so callers are expected to tokenize it (e.g. with .split()).
    """

    # Replace each HTML tag with a space rather than nothing, otherwise the
    # words on either side of a tag are fused ("good<br />bad" -> "goodbad").
    text = re.sub(r'<.*?>', ' ', text)

    # remove the characters [\], ['] and ["]
    text = text.translate(_REMOVED_CHARS)

    # convert text to lowercase
    text = text.strip().lower()

    # replace punctuation characters with spaces
    return text.translate(_PUNCTUATION_TO_SPACE)

In [3]:
# Sanity check: tags, punctuation and casing should all be normalised away.
# Raw string because "\d" is not a valid escape sequence in a normal literal
# (it only works by accident and triggers a warning on recent Pythons).
clean_text(r"/./<div>This is not a sentence././_*<\div>").split()

['this', 'is', 'not', 'a', 'sentence']

### The simplest text vectorization technique is Bag Of Words (BOW). It starts with a list of words called the vocabulary (this is often all the words that occur in the training data).
- Training texts: ["This is a good cat", "This is a bad day"]
- => vocabulary: [this, cat, day, is, good, a, bad]
- New text: "This day is a good day"   -->   [1, 0, 2, 1, 1, 1, 0]
- 可以看出，未出现的单词就是0了
- the that this 都是 stop words

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [9]:
# Toy corpus: two sentences to learn the vocabulary from, one unseen sentence to encode
training_texts = ["This is a good cat", "This is a bad day"]
test_texts = ["This day is a good day"]

# Count-based vectorizer: every text goes through clean_text first and
# English stop words are dropped
vectorizer = CountVectorizer(preprocessor=clean_text, stop_words="english")

# learn the vocabulary from the training sentences only
vectorizer.fit(training_texts)

# vocabulary_ maps term -> column index; invert it and list the terms in column order
inv_vocab = {index: term for term, index in vectorizer.vocabulary_.items()}
vocabulary = [inv_vocab[column] for column in range(len(inv_vocab))]

# encode the unseen sentence and label each count column with its term
pd.DataFrame(
    vectorizer.transform(test_texts).toarray(),
    columns=vocabulary,
    index=["test sentence"],
)

Unnamed: 0,bad,cat,day,good
test sentence,0,0,2,1


### Use Case : IMDB Movie Reviews

In [10]:
import os
import numpy as np
import pandas as pd

In [11]:
def load_train_test_imdb_data(data_dir, seed=None):
    """
    Loads the IMDB train/test datasets from a folder path.

    Input:
    data_dir: path to the "aclImdb" folder. It must contain the sub-folders
        train/neg, train/pos, test/neg and test/pos, each holding one
        review per file.
    seed: optional integer. When given, the rows are shuffled with a
        dedicated generator seeded with this value, so repeated calls
        return the same row order. When None (default), numpy's global
        random state is used.

    Returns:
    train/test datasets as pandas dataframes with the columns 'text'
    (the review) and 'sentiment' (1 for "pos", 0 for "neg"), rows shuffled.
    """

    # np.random (global state) and RandomState both expose shuffle()
    rng = np.random if seed is None else np.random.RandomState(seed)

    frames = {}
    for split in ["train", "test"]:
        rows = []
        for sentiment in ["neg", "pos"]:
            score = 1 if sentiment == "pos" else 0

            path = os.path.join(data_dir, split, sentiment)

            # os.listdir returns entries in arbitrary order; sorting makes the
            # pre-shuffle order (and therefore a seeded shuffle) reproducible.
            for f_name in sorted(os.listdir(path)):
                # Read explicitly as UTF-8 instead of relying on the
                # platform's default encoding.
                with open(os.path.join(path, f_name), "r", encoding="utf-8") as f:
                    rows.append([f.read(), score])

        # Shuffle so the rows are not grouped by sentiment
        rng.shuffle(rows)
        frames[split] = pd.DataFrame(rows, columns=['text', 'sentiment'])

    return frames["train"], frames["test"]

In [12]:
# Dataset location: set the IMDB_DATA_DIR environment variable to point at
# your own copy of the "aclImdb" folder; the fallback is the original
# author's local path.
DATA_DIR = os.environ.get("IMDB_DATA_DIR", "/home/conan/Deep_learning/Data_set/aclImdb")
train_data, test_data = load_train_test_imdb_data(data_dir=DATA_DIR)

In [14]:
# Peek at the first five training rows (head() defaults to n=5)
train_data.head()

Unnamed: 0,text,sentiment
0,I saw this movie Sunday afternoon. I absolutel...,1
1,I love the 80s slasher flicks and I remember w...,0
2,The British Public School system did not evolv...,0
3,There are few movies that appear to provide en...,1
4,Keep in mind I'm a fan of the genre but have o...,1


### Modeling
As an example, let’s train a linear SVM classifier on bag-of-words features.

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

In [17]:
# Bag-of-words features: the vocabulary is learned on the training reviews
# and then reused, unchanged, to encode the test reviews.
# (Single words only; passing ngram_range=(1, 2) would add bigrams.)
vectorizer = CountVectorizer(preprocessor=clean_text, stop_words="english")
training_features = vectorizer.fit_transform(train_data["text"])
test_features = vectorizer.transform(test_data["text"])

# Fit a linear SVM on the training counts, then label the test reviews
model = LinearSVC()
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)

# Fraction of test reviews whose predicted sentiment matches the true label
acc = accuracy_score(test_data["sentiment"], y_pred)

print("Accuracy on the IMDB dataset: {:.2f}".format(100 * acc))

Accuracy on the IMDB dataset: 83.68


### To realize how good this is, a recent state-of-the-art model can get around 95% accuracy. So this isn’t bad at all, but there is still some room for improvement.
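Possible improvements:
- Problem with raw counts: the longer the text, the larger its feature values (word counts) will be, regardless of how informative the words are.
- Use TF-IDF features instead of raw counts, i.e. replace `CountVectorizer` with `TfidfVectorizer`.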
- N-Gram是一种基于统计语言模型的算法。它的基本思想是将文本里面的内容按照字节进行大小为N的滑动窗口操作，形成了长度是N的字节片段序列。