In [1]:
import numpy as np
import pandas as pd
import os
import re
import argparse
import random
from glob import glob

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
import pickle
from tqdm.auto import tqdm
# from tqdm.notebook import tqdm
tqdm.pandas()


## Loading Data

In [6]:
def read_folder(path_to_folder):
    """Convert the raw review text files into shuffled NumPy arrays.

    RUN ONCE. For each split (``train`` and ``test``) every ``*.txt`` file
    under ``<path_to_folder>/<split>/pos`` and ``<path_to_folder>/<split>/neg``
    is read, paired with its label (1 for ``pos``, 0 for ``neg``), shuffled
    and saved to ``<path_to_folder>/<split>.npy``. The shape of each saved
    array is printed.

    Note: ``np.array`` coerces the mixed ``(text, label)`` tuples to a single
    string dtype, so the labels end up stored as the strings ``'1'`` / ``'0'``.

    Args:
        path_to_folder: Root directory that contains the ``train`` and
            ``test`` sub-directories.
    """
    for split in ['train', 'test']:
        samples = []
        for class_label in ['pos', 'neg']:
            label = 1 if class_label == 'pos' else 0
            pattern = os.path.join(path_to_folder, split, class_label, '*.txt')
            for fname in glob(pattern):
                # Decode explicitly as UTF-8 instead of relying on the
                # platform's default encoding, and read the whole file so a
                # review that spans several lines is not cut at the first one.
                with open(fname, encoding='utf-8') as fin:
                    samples.append((fin.read(), label))
        random.shuffle(samples)
        samples = np.array(samples)
        print(samples.shape)
        # The output file is simply named after the split.
        np.save(os.path.join(path_to_folder, split + '.npy'), samples)

# The conversion is a one-off: skip it when both arrays already exist, so that
# re-running the notebook neither re-reads every review file nor replaces the
# saved arrays with a differently shuffled copy.
if not all(os.path.exists(os.path.join('aclImdb', split_name + '.npy'))
           for split_name in ('train', 'test')):
    read_folder('aclImdb')

(25000, 2)
(25000, 2)


In [2]:
def _texts_and_labels(samples):
    """Split an array of (text, label) rows into two parallel lists."""
    texts, labels = [], []
    for row in samples:
        texts.append(row[0])
        labels.append(row[1])
    return texts, labels

# Load the arrays saved under aclImdb/ and separate each row into its review
# text (element 0) and its label (element 1).
# NOTE: `train` and `test` are rebound to the lemmatized corpora further down.
train = np.load('aclImdb/train.npy')
train_corpus, y = _texts_and_labels(train)

test = np.load('aclImdb/test.npy')
test_corpus, y_test = _texts_and_labels(test)

## Processing

#### Cleaning

In [3]:
# Punctuation marks and digit runs that are deleted outright. Raw string
# literals hand escapes such as \. and \d to the regex engine untouched; in a
# normal literal Python flags them as invalid escape sequences.
REPLACE_NO_SPACE = re.compile(r'(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(")|(\()|(\))|(\[)|(\])|(\d+)')
# Doubled HTML line breaks, hyphens and slashes are replaced by one space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    """Lower-case every review and strip punctuation, digits and markup.

    Each review is lower-cased, everything matched by ``REPLACE_NO_SPACE`` is
    removed, and everything matched by ``REPLACE_WITH_SPACE`` is replaced by a
    single space. Whitespace is not collapsed afterwards.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A list with one cleaned string per input review, in the same order.
    """
    return [
        REPLACE_WITH_SPACE.sub(SPACE, REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()))
        for line in reviews
    ]

# Clean the train and test corpora with the same rules (train first).
reviews_train_clean, reviews_test_clean = map(preprocess_reviews, (train_corpus, test_corpus))

#### Removing stop words

In [4]:
# nltk.download('stopwords')  # run once if the stopwords corpus is not installed
english_stop_words = stopwords.words('english')

def remove_stop_word(corpus):
    """Drop English stop words from every review.

    Args:
        corpus: Iterable of review strings; each one is tokenised with
            ``str.split()`` (any run of whitespace).

    Returns:
        A list with one string per review: the tokens that are not in
        ``english_stop_words``, re-joined with single spaces.
    """
    # A set turns every membership test into a hash lookup instead of a scan
    # of the whole stop-word list. It is built per call so that later changes
    # to ``english_stop_words`` are still honoured.
    stop_words = set(english_stop_words)
    return [
        ' '.join(word for word in review.split() if word not in stop_words)
        for review in corpus
    ]

removed_train = remove_stop_word(reviews_train_clean)
removed_test = remove_stop_word(reviews_test_clean)

#### Stemming

In [5]:
def get_stemmed_text(corpus):
    """Porter-stem every whitespace-separated token of every review.

    Returns a list holding, for each review, its stems joined by single
    spaces.
    """
    porter = PorterStemmer()
    stemmed_corpus = []
    for review in corpus:
        stems = [porter.stem(token) for token in review.split()]
        stemmed_corpus.append(' '.join(stems))
    return stemmed_corpus

# Stem the stop-word-free corpora (train first, then test).
stemmed_reviews_train, stemmed_reviews_test = map(get_stemmed_text, (removed_train, removed_test))

#### Lemmatization

In [6]:
# nltk.download('wordnet')
def get_lemmatized_text(corpus):
    """Lemmatize every whitespace-separated token of every review.

    Returns a list holding, for each review, its lemmas joined by single
    spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmatized_corpus = []
    for review in corpus:
        lemmas = [wordnet.lemmatize(token) for token in review.split()]
        lemmatized_corpus.append(' '.join(lemmas))
    return lemmatized_corpus

# Final text representation of both splits; from here on `train` and `test`
# hold these lemmatized corpora.
train, test = map(get_lemmatized_text, (stemmed_reviews_train, stemmed_reviews_test))

In [7]:
# Tools
# Preprocessing resources read by `sentiment`. They repeat the definitions
# from the cleaning and stop-word cells. The regexes are raw string literals
# so that escapes such as \. and \d reach the regex engine untouched; in a
# normal literal Python flags them as invalid escape sequences.
REPLACE_NO_SPACE = re.compile(r'(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(")|(\()|(\))|(\[)|(\])|(\d+)')
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "
english_stop_words = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def sentiment(review, vectorizer, model):
    """Predict the sentiment label of a single piece of raw text.

    The text is cleaned, stripped of stop words, stemmed and lemmatized,
    then vectorized and classified.

    Args:
        review: Raw text to classify (a string).
        vectorizer: Fitted vectorizer exposing ``transform``.
        model: Fitted classifier exposing ``predict``.

    Returns:
        The value returned by ``model.predict`` for the one-document feature
        matrix, unchanged.
    """
    # Preprocessing
    cleaned = REPLACE_NO_SPACE.sub(NO_SPACE, review.lower())
    cleaned = REPLACE_WITH_SPACE.sub(SPACE, cleaned)
    # Filter the *cleaned* text. Tokenising the raw `review` here would throw
    # away the lower-casing and punctuation removal done just above.
    removed = ' '.join([word for word in cleaned.split() if word not in english_stop_words])
    stemmed = ' '.join([stemmer.stem(word) for word in removed.split()])
    lemmatized = ' '.join([lemmatizer.lemmatize(word) for word in stemmed.split()])

    # Predicting
    X = vectorizer.transform([lemmatized])
    y = model.predict(X)
    return y

## TF-IDF Representation 

In [8]:
def tfidf(train, test):
    """Fit a TF-IDF vectorizer on the training corpus and vectorize both corpora.

    Args:
        train: Training documents; the vectorizer is fitted on these.
        test: Documents transformed with the vectorizer fitted on ``train``.

    Returns:
        A tuple ``(X, X_test, tfidf_vectorizer)``: the transformed training
        corpus, the transformed test corpus and the fitted vectorizer.
    """
    tfidf_vectorizer = TfidfVectorizer()
    # fit_transform fits and transforms in one pass over `train`; it is the
    # documented equivalent of fit() followed by transform() on the same data.
    X = tfidf_vectorizer.fit_transform(train)
    X_test = tfidf_vectorizer.transform(test)
    return X, X_test, tfidf_vectorizer

X, X_test, vectorizer = tfidf(train, test)

## Classification

#### Training SVM

In [35]:
def training(X, y, C=0.1):
    """Fit a linear SVM classifier.

    Args:
        X: Feature matrix passed to ``LinearSVC.fit``.
        y: Target labels passed to ``LinearSVC.fit``.
        C: Regularization parameter forwarded to ``LinearSVC``. Defaults to
            0.1, the value that was previously hard-coded.

    Returns:
        The fitted ``LinearSVC`` instance.
    """
    # Alternative tried during development: LogisticRegression(C=1)
    model = LinearSVC(C=C)
    model.fit(X, y)
    return model

model = training(X, y)

#### Testing

In [36]:
# Score the fitted model against the test labels.
test_predictions = model.predict(X_test)
print("Final Accuracy: %s" % accuracy_score(y_test, test_predictions))

Final Accuracy: 0.87932


In [38]:
def save_models(model, vectorizer):
    """Pickle the classifier and the vectorizer into the working directory.

    Writes ``sentiment_analyzer_svm.p`` and ``vectorizer.p``, overwriting any
    existing files with those names.

    Args:
        model: Object to store in ``sentiment_analyzer_svm.p``.
        vectorizer: Object to store in ``vectorizer.p``.
    """
    # Context managers close (and therefore flush) each file before the
    # function returns, instead of leaving that to garbage collection.
    with open("sentiment_analyzer_svm.p", "wb") as model_file:
        pickle.dump(model, model_file)
    with open("vectorizer.p", "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)
def load_models():
    """Load the pickled classifier and vectorizer from the working directory.

    Reads ``sentiment_analyzer_svm.p`` and ``vectorizer.p``.

    Returns:
        A tuple ``(model, vectorizer)`` with the two unpickled objects.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by a trusted run of this notebook.
    with open("sentiment_analyzer_svm.p", "rb") as model_file:
        model = pickle.load(model_file)
    with open("vectorizer.p", "rb") as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    return model, vectorizer

# Persist both objects, then rebind the names to the copies read back from
# disk so the following cells work with the stored artefacts.
save_models(model=model, vectorizer=vectorizer)
model, vectorizer = load_models()

## Predicting sentiment of News Articles

In [39]:
def predict_articles():
    """Add an SVM sentiment column to each yearly news CSV.

    For every year from 2016 to 2020, ``news-<year>.csv`` is read, the
    ``article`` value of each row is passed to ``sentiment`` together with
    the module-level ``vectorizer`` and ``model``, the results are stored in
    a ``sentiment-svm`` column, and the file is written back under the same
    name without the index.
    """
    def classify_row(row):
        return sentiment(row['article'], vectorizer, model)

    for year in range(2016, 2021):
        csv_name = 'news-{}.csv'.format(year)
        articles = pd.read_csv(csv_name)
        # pipeline
        # NOTE(review): `sentiment` returns the raw `model.predict` output, so
        # each cell presumably holds a one-element array rather than a bare
        # label - confirm that this is what readers of the CSV expect.
        articles['sentiment-svm'] = articles.progress_apply(classify_row, axis=1)
        articles.to_csv(csv_name, index=False)

predict_articles()

HBox(children=(IntProgress(value=0, max=595028), HTML(value='')))




HBox(children=(IntProgress(value=0, max=635671), HTML(value='')))




HBox(children=(IntProgress(value=0, max=547129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=564579), HTML(value='')))




HBox(children=(IntProgress(value=0, max=189978), HTML(value='')))


