In [1]:
import glob
import numpy as np

In [2]:
#Data preprocessing
#Loading data
def get_data(path_neg, path_pos):
    """Load the negative and positive review files and build their labels.

    Parameters
    ----------
    path_neg : str
        Glob pattern matching the negative-review text files.
    path_pos : str
        Glob pattern matching the positive-review text files.

    Returns
    -------
    corpus : list of str
        Contents of every matched file, all negatives first, then all
        positives. Within each class the files are in sorted path order.
    labels : list of float
        ``0.0`` for each negative document and ``1.0`` for each positive
        document, aligned with ``corpus``.

    Both lists are empty when neither pattern matches any file.
    """
    def _read_all(pattern):
        # Read every file matched by ``pattern`` and return the texts.
        texts = []
        # glob returns matches in arbitrary order; sorting makes the corpus
        # (and anything trained on it) reproducible across runs and machines.
        for file_name in sorted(glob.glob(pattern)):
            with open(file_name, 'r', encoding='utf-8') as handle:
                # read() instead of readline(): a review that spans several
                # lines must not be silently truncated to its first line.
                texts.append(handle.read())
        return texts

    neg_data = _read_all(path_neg)
    pos_data = _read_all(path_pos)

    # Float labels, matching what np.zeros(...).tolist() / np.ones(...).tolist()
    # produced before.
    neg_label = [0.0] * len(neg_data)
    pos_label = [1.0] * len(pos_data)

    corpus = neg_data + pos_data
    labels = neg_label + pos_label

    return corpus, labels

In [6]:
import re
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\songy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
#Normalize and preprocess data
def normalize(corpus):
    """Lower-case, clean and stop-word-filter every document in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input document, in the same order. Each
        string is the document's remaining tokens joined by single spaces.
    """
    # Both helpers are loop-invariant, so build them once rather than once
    # per document.
    tokenizer = RegexpTokenizer(r'\w+')
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned linearly for every single token.
    stopword = set(stopwords.words('english'))

    normalized_corpus = []

    for text in corpus:
        # Convert to lowercase and trim surrounding whitespace
        text = text.lower().strip()

        # Replace HTML line-break tags with a space
        text = re.sub(r"<br />", r" ", text)
        # Collapse runs of spaces
        text = re.sub(' +', ' ', text)
        # Drop a non-word character when it is immediately followed by the
        # same character (e.g. "!!!" becomes "!")
        text = re.sub(r'(\W)(?=\1)', '', text)
        # Put a space in front of sentence punctuation
        text = re.sub(r"([.!?])", r" \1", text)
        # Replace every run of characters other than letters and . ! ? with a space
        text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)

        # Keep only word tokens; this discards the remaining punctuation
        tokens = tokenizer.tokenize(text)

        # Remove stop words
        filtered_tokens = [token for token in tokens if token not in stopword]

        # Regroup into a single space-separated string
        normalized_corpus.append(' '.join(filtered_tokens))

    return normalized_corpus

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
#Extract features
#Bag of Words model
def bow_extractor(corpus, ngram_range=(1, 1)):
    """Fit a bag-of-words ``CountVectorizer`` on ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents used to learn the vocabulary and to be vectorized.
    ngram_range : tuple of (int, int), default (1, 1)
        Passed through to ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(vectorizer, features)``: the fitted vectorizer and the result of
        ``fit_transform`` on ``corpus``.
    """
    bow_vectorizer = CountVectorizer(ngram_range=ngram_range, min_df=1)
    bow_matrix = bow_vectorizer.fit_transform(corpus)
    return bow_vectorizer, bow_matrix
 
#TF-IDF model
def tfidf_extractor(corpus, ngram_range=(1, 1)):
    """Fit a ``TfidfVectorizer`` on ``corpus``.

    The vectorizer is configured with ``min_df=1``, L2 normalisation and
    smoothed IDF weighting.

    Parameters
    ----------
    corpus : iterable of str
        Documents used to learn the vocabulary/IDF and to be vectorized.
    ngram_range : tuple of (int, int), default (1, 1)
        Passed through to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(vectorizer, features)``: the fitted vectorizer and the result of
        ``fit_transform`` on ``corpus``.
    """
    tfidf_options = dict(
        min_df=1,
        norm='l2',
        smooth_idf=True,
        use_idf=True,
        ngram_range=ngram_range,
    )
    tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    return tfidf_vectorizer, tfidf_matrix

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
# Linear SVM trained with SGD (hinge loss) and a liblinear logistic regression.
# A fixed random_state makes repeated runs produce the same model; without it
# the SGD-based classifier can give slightly different scores on every run.
svm = SGDClassifier(loss='hinge', max_iter=100, random_state=42)
lr = LogisticRegression(solver='liblinear', random_state=42)

In [11]:
#Training classifier
def train_predict_evaluate_model(classifier,
                                 train_features, train_labels,
                                 test_features, test_labels):
    """Fit ``classifier`` on the training data and predict the test data.

    Parameters
    ----------
    classifier : object
        Estimator exposing ``fit(features, labels)`` and ``predict(features)``.
    train_features, train_labels
        Training inputs and targets handed to ``classifier.fit``.
    test_features
        Inputs handed to ``classifier.predict``.
    test_labels
        Accepted for call-site symmetry but not used here; scoring is left
        to the caller.

    Returns
    -------
    The value returned by ``classifier.predict(test_features)``.
    """
    # Train on the training split ...
    classifier.fit(train_features, train_labels)
    # ... then hand back the predictions for the held-out split.
    return classifier.predict(test_features)

In [12]:
#Model evaluation
from sklearn import metrics
import numpy as np
 
#Evaluation metrics
def get_metrics(true_labels, predicted_labels):
    """Print and return accuracy, precision, recall and F1 for the predictions.

    Precision, recall and F1 use ``average='weighted'``. Every score is
    rounded to two decimals before it is printed and stored.

    Parameters
    ----------
    true_labels
        Ground-truth labels.
    predicted_labels
        Labels predicted by a classifier, aligned with ``true_labels``.

    Returns
    -------
    dict
        ``{'accuracy': ..., 'precision': ..., 'recall': ..., 'f1': ...}``
        holding the rounded scores as floats. Earlier versions returned
        ``None``; the printed output is unchanged.
    """
    weighted = {'average': 'weighted'}
    # (result key, printed label, scoring function, extra keyword arguments)
    scorers = (
        ('accuracy', 'Accuracy:', metrics.accuracy_score, {}),
        ('precision', 'Precision:', metrics.precision_score, weighted),
        ('recall', 'Recall rate:', metrics.recall_score, weighted),
        ('f1', 'F-measure:', metrics.f1_score, weighted),
    )

    results = {}
    for key, label, scorer, extra in scorers:
        value = np.round(scorer(true_labels, predicted_labels, **extra), 2)
        print(label, value)
        # Store a plain float so the result is easy to compare or serialise.
        results[key] = float(value)
    return results

In [13]:
#Main function 
if __name__ == "__main__":
    # Root of the extracted aclImdb data set. Every glob pattern below is
    # derived from it, so the location only has to be changed in one place.
    data_dir = 'C:/Users/songy/Documents/2020spring data/aclImdb'

    train_corpus, train_labels = get_data(data_dir + '/train/neg/*.txt',
                                          data_dir + '/train/pos/*.txt')
    test_corpus, test_labels = get_data(data_dir + '/test/neg/*.txt',
                                        data_dir + '/test/pos/*.txt')

    # A wrong path makes glob match nothing; fail here with a clear message
    # instead of a confusing error deep inside the vectorizer or classifier.
    if not train_corpus or not test_corpus:
        raise FileNotFoundError(
            'No review files found under ' + data_dir +
            ' (expected train/ and test/ folders with neg/ and pos/ *.txt files)')

    norm_train_corpus = normalize(train_corpus)
    norm_test_corpus = normalize(test_corpus)

    # Bag of Words model features (vocabulary learnt on the training set only)
    bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
    bow_test_features = bow_vectorizer.transform(norm_test_corpus)

    # tfidf model features (vocabulary/IDF learnt on the training set only)
    tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
    tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

    # Classifiers. A fixed random_state keeps the reported scores reproducible.
    # The same two instances are re-fitted for each feature set.
    svm = SGDClassifier(loss='hinge', max_iter=100, random_state=42)
    lr = LogisticRegression(solver='liblinear', random_state=42)

    def _run_experiment(title, classifier, train_features, test_features):
        """Print ``title``, fit and predict, print the metrics.

        Returns ``(predictions, metrics_result)`` where ``metrics_result`` is
        whatever ``get_metrics`` returns.
        """
        print(title)
        predictions = train_predict_evaluate_model(classifier=classifier,
                                                   train_features=train_features,
                                                   train_labels=train_labels,
                                                   test_features=test_features,
                                                   test_labels=test_labels)
        return predictions, get_metrics(test_labels, predictions)

    # Logistic regression on Bag of Words features
    lr_bow_predictions, results1 = _run_experiment(
        "Logistic regression model based on features of Bag of Words model",
        lr, bow_train_features, bow_test_features)

    # SVM on Bag of Words features
    svm_bow_predictions, results2 = _run_experiment(
        "SVM model based on Bag of features of Bag of Words model",
        svm, bow_train_features, bow_test_features)

    # Logistic regression on tfidf features
    lr_tfidf_predictions, results3 = _run_experiment(
        "Logistic regression model based on features of tfidf model",
        lr, tfidf_train_features, tfidf_test_features)

    # SVM on tfidf features
    svm_tfidf_predictions, results4 = _run_experiment(
        "SVM model based on Bag of features of tfidf model",
        svm, tfidf_train_features, tfidf_test_features)

Logistic regression model based on features of Bag of Words model
Accuracy: 0.86
Precision: 0.86
Recall rate: 0.86
F-measure: 0.86
SVM model based on Bag of features of Bag of Words model
Accuracy: 0.85
Precision: 0.85
Recall rate: 0.85
F-measure: 0.85
Logistic regression model based on features of tfidf model
Accuracy: 0.88
Precision: 0.88
Recall rate: 0.88
F-measure: 0.88
SVM model based on Bag of features of tfidf model
Accuracy: 0.88
Precision: 0.88
Recall rate: 0.88
F-measure: 0.88
