### Outlook
Our *Test score* for IMDB reviews data: **0.89736** 

*Test score* for 20 Newsgroup: **0.699196176314392**

### Packages install

In [None]:
import re
import nltk
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.naive_bayes import MultinomialNB
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
from spellchecker import SpellChecker # pip install pyspellchecker
from nltk.stem import WordNetLemmatizer

### Data: 20 newsgroup

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the train split and then the test split with the same options:
# headers, footers and quotes are passed to `remove`, and shuffling is
# seeded with random_state=42.
twenty_train, twenty_test = (
    fetch_20newsgroups(
        subset=split,
        remove=['headers', 'footers', 'quotes'],
        shuffle=True,
        random_state=42,
    )
    for split in ('train', 'test')
)

### Data: IMDB reviews

In [None]:
from sklearn.datasets import load_files
# Change the path to your dir
# encoding="utf-8" asks load_files to decode each review to text. Without it
# the raw file contents are kept as bytes, and the later str(...) calls would
# turn them into "b'...'" reprs whose escape sequences become junk tokens.
# decode_error="replace" keeps the load going if a file has undecodable bytes.
imdb_train = load_files(
    container_path="......\\551A1\\aclImdb_v1\\aclImdb\\train",
    categories=["pos", "neg"],
    description="IMAD review, Train",
    encoding="utf-8",
    decode_error="replace",
)
imdb_test = load_files(
    container_path="......\\551A1\\aclImdb_v1\\aclImdb\\test",
    categories=["pos", "neg"],
    description="IMAD review, Test",
    encoding="utf-8",
    decode_error="replace",
)

### Data preprocessing

For **20 newsgroup** data, preprocessing procedure is:
1. Remove all non-words
2. Transform the review in lower case
3. Remove all stop words
4. Perform stemming/lemmatization
5. Check and correct spelling

For **IMDB reviews** data, preprocessing procedure is:
1. Remove all non-words
2. Transform the review in lower case
3. Remove all stop words
4. Perform stemming/lemmatization

Spell checking and correction is left out of the IMDB reviews preprocessing because, with it included, the run takes more than 10 hours on 12 cores, which is not practical for a real-world scenario.

In [None]:
def correct_spellings(word):
    """
    Return the spell-corrected form of a single word.

    Args:
        word: the token to check
    Returns:
        `word` itself when the spell checker does not report it as unknown
        or has no correction to offer, otherwise the checker's correction.
    """
    # Build the SpellChecker once and keep it on the function object, so it
    # is not constructed again for every word.
    spell = getattr(correct_spellings, "_spell", None)
    if spell is None:
        spell = correct_spellings._spell = SpellChecker()

    # unknown() expects an iterable of words; a bare string would be walked
    # character by character, so the word is wrapped in a list.
    if not spell.unknown([word]):
        return word

    # NOTE(review): correction() can return None when no candidate exists
    # (depends on the pyspellchecker version) -- fall back to the input.
    return spell.correction(word) or word

def clean(texts, algo):
    """
    Normalise one raw document into a space-separated string of tokens.

    Steps, in order:
    1. Replace every character that is not an ASCII letter with a space
    2. Lower-case the text
    3. Tokenise and drop English stop words
    4. Stem (Porter) or lemmatise (WordNet) the remaining tokens

    Args:
        texts: the raw document as a string
        algo: "stemmer" selects PorterStemmer; any other value selects
            WordNetLemmatizer
    Returns:
        the processed tokens joined by single spaces
    """
    letters_only = re.sub("[^A-Za-z]", " ", texts).lower()
    tokens = word_tokenize(letters_only)
    stop_set = set(stopwords.words("english"))

    # Pick the per-token normaliser once, then apply it in a single pass.
    if algo == "stemmer":
        normalise = PorterStemmer().stem
    else:
        normalise = WordNetLemmatizer().lemmatize

    kept = (token for token in tokens if token not in stop_set)
    return " ".join(normalise(token) for token in kept)


def check_count(data):
    """
    Tally how often each whitespace-separated word occurs across `data`.

    Args:
        data: an iterable of strings; each item is split on whitespace
    Returns:
        a list of (word, count) pairs ordered from most to least frequent
    """
    tally = Counter(word for text in data for word in text.split())
    return tally.most_common()


def remove_freqwords(text, num_removal, counts=None):
    """
    Remove the most frequent words, such as "I", "that", which has a limited
    importance to the model

    Args:
        text: the document to filter; it is converted with str() and split
            on whitespace
        num_removal: how many entries from the top of the ranking to remove
        counts: optional list of (word, count) pairs ordered from most to
            least frequent, e.g. check_count(corpus) computed once for the
            whole corpus. When omitted, check_count(text) is used as before;
            check_count iterates its argument item by item, so a plain
            string is then tallied per character, not per word. Pass
            corpus-level counts to filter on real word frequencies.
    Returns:
        the remaining words joined by single spaces
    """
    if counts is None:
        counts = check_count(text)
    frequent = {word for word, _ in counts[:num_removal]}
    return " ".join(word for word in str(text).split() if word not in frequent)

def remove_rarewords(text, num_removal, counts=None):
    """
    Remove the least frequent words, such as "hmmmmm", "yoooooo", which has a limited
    importance to the model

    Args:
        text: the document to filter; it is converted with str() and split
            on whitespace
        num_removal: how many entries from the bottom of the ranking are
            considered for removal
        counts: optional list of (word, count) pairs ordered from most to
            least frequent, e.g. check_count(corpus) computed once for the
            whole corpus. When omitted, check_count(text) is used as before;
            check_count iterates its argument item by item, so a plain
            string is then tallied per character, not per word. Pass
            corpus-level counts to filter on real word frequencies.
    Returns:
        the remaining words joined by single spaces
    """
    if counts is None:
        counts = check_count(text)
    # Walk the ranking from its tail; only entries seen at most 5 times are
    # treated as rare.
    rare = {word for word, seen in counts[:-num_removal-1:-1] if seen <= 5}
    return " ".join(word for word in str(text).split() if word not in rare)

### Preprocessing IMDB reviews data

This strictly follows the procedure described above.

In [None]:
import pandas as pd
from joblib import Parallel, delayed
# Preprocessing data
# NOTE(review): if imdb_train.data holds bytes, str(doc) yields the "b'...'"
# repr rather than the decoded text -- confirm how the data was loaded.
reviews_data = pd.DataFrame(imdb_train.data, columns = ["review"])
reviews_train = Parallel(n_jobs=-1)(delayed(clean)(str(doc), "lemma") for doc in reviews_data.review)
reviews_data = pd.DataFrame(imdb_test.data, columns = ["review"])
reviews_test = Parallel(n_jobs=-1)(delayed(clean)(str(doc), "lemma") for doc in reviews_data.review)

# Rank words once over the whole cleaned training corpus. Ranking inside each
# review separately would hand check_count a single string, which it walks
# character by character, so no real word frequencies would be used.
word_counts = check_count(reviews_train)
# Remove freq words, such as 'I', 'that': the 5 most frequent words
freq_words = {word for word, _ in word_counts[:5]}
# Remove rare words, such as hmmmmmmmmm, yaaaaaaaap: entries among the 20000
# least frequent that occur at most 5 times
rare_words = {word for word, seen in word_counts[:-20000 - 1:-1] if seen <= 5}
drop_words = freq_words | rare_words
# Only the training reviews are filtered; reviews_test is left as cleaned.
reviews_train = [
    " ".join(word for word in text.split() if word not in drop_words)
    for text in reviews_train
]

### Preprocessing 20 Newsgroups data

In [None]:
news_data = pd.DataFrame(twenty_train.data, columns = ["review"])
# Choose lemma for better model performance
# Stemmer may cause the loss of ~5% accuracy
# Iterate over the newsgroup documents themselves, so the number of cleaned
# documents always matches this dataset and not the IMDB frame reviews_data.
news_train = [clean(str(doc), "lemma") for doc in news_data.review]
news_data = pd.DataFrame(twenty_test.data, columns = ["review"])
news_test = [clean(str(doc), "lemma") for doc in news_data.review]

### Model Setup

Instead of setting up the parameters for each model separately, we build a models() function that returns the pipeline and the grid-search parameters for a given model name.

In [None]:
def models(mod):
    '''
    Build the pipeline and the grid-search space for one model.

    Every pipeline is CountVectorizer -> TfidfTransformer -> classifier.
    The classifier step is named after `mod`, which is why the keys of the
    returned grid are prefixed with "<mod>__".

    PARAMETERS:
    -----------
    mod: str
    model name: rf, ada, svm, dt, logistic, knn, nb

    RETURNS:
    --------
    (model, parameters): tuple
    the unfitted Pipeline and the dict of candidate hyperparameter values

    RAISES:
    -------
    ValueError
    if mod is not one of the supported model names
    '''
    if mod == "rf":
        classifier = RandomForestClassifier()
        parameters = {
        'rf__n_estimators': [1000,5000],
        'rf__max_features':[10, 50],
        'rf__max_depth':[300,1000],
        'rf__n_jobs' : [5]
        }
    elif mod == "ada":
        classifier = AdaBoostClassifier()
        parameters = {
        'ada__n_estimators':[100,500,1000],
        'ada__learning_rate':[1.,0.1],
        'ada__base_estimator': [DecisionTreeClassifier()]}
    elif mod == "svm":
        classifier = LinearSVC()
        parameters = {
        'svm__penalty': ["l1", "l2"],
        'svm__loss':["hinge","squared_hinge"],
        'svm__multi_class':["ovr","crammer_singer"],
        'svm__C': [0.15, 0.77, 1, 10],
        'svm__max_iter':[5000,10000]
        }
    elif mod == "dt":
        classifier = DecisionTreeClassifier()
        parameters = {
        'dt__max_features':["auto", "log2"],
        'dt__splitter': ["best","random"]
        }
    elif mod == "logistic":
        classifier = LogisticRegression()
        parameters = {
        'logistic__penalty':["l2"],
        'logistic__solver':["newton-cg","lbfgs","sag"],
        'logistic__C': [0.1, 0.5, 0.7, 1],
        'logistic__n_jobs':[-1],
        'logistic__max_iter':[100,500]
        }
    elif mod == "knn":
        classifier = KNeighborsClassifier()
        parameters = {
        'knn__n_neighbors':[5,10,15],
        'knn__weights':["uniform","distance"],
        'knn__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
        'knn__n_jobs':[-1]
        }
    elif mod == "nb":
        classifier = MultinomialNB()
        parameters = {
        'nb__alpha':[1.0,0.0],
        'nb__fit_prior':[True, False]
        }
    else:
        # Fail with a clear message instead of reaching the return statement
        # with unbound locals.
        raise ValueError(
            "Unknown model name {!r}; expected one of: "
            "rf, ada, svm, dt, logistic, knn, nb".format(mod))

    # The text-feature steps are identical for every model; only the final
    # step differs.
    model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (mod, classifier)])
    return (model, parameters)

### Model training

The training function is designed to work with every model in our list; it prints the information we use to validate the model, such as the best parameters and the validation results.

In [None]:
def train(train, target, model, cv, parameters = None):
    '''
    Fit a model, optionally with a grid search, and print validation info.

    PARAMETERS:
    -----------
    train: input data

    target: input target

    model: predefined model class

    cv: int
    number of splits for CV

    parameters: dict, optional
    GridSearch parameters; when None the model is fitted as is and
    cross-validated, otherwise it is wrapped in GridSearchCV

    RETURNS:
    --------
    (model, best_params, cv_results): tuple
    with a grid: the fitted GridSearchCV, its best_params_ and cv_results_;
    without a grid: the fitted model, None and the cross-validation scores
    '''
    print("Model: {}".format(model))

    if parameters is None:
        model.fit(train, target)
        # cross_val_score is the helper this file imports.
        score = cross_val_score(model, train, target, cv = cv)
        print("Cross Validation Scores: {}".format(score))
        print("Training accuracy: {}".format(model.score(train, target)))
        # A plain estimator has no grid-search attributes to report.
        return model, None, score

    print("{} starts".format(model))
    model = GridSearchCV(model, parameters, cv=cv, n_jobs=11, verbose = 3)
    model.fit(train, target)
    print("Best params: {}".format(model.best_params_))
    print("Validation results: {}".format(model.cv_results_))
    # Fitted attributes of GridSearchCV carry a trailing underscore.
    return model, model.best_params_, model.cv_results_

### Training model

We take LinearSVC and IMDB data as our example to illustrate the training process

In [None]:
import time
model, parameters = models("svm")
# train() parameters setup
# The inputs get their own names: binding the data to `train` would replace
# the train() function and make the call below fail.
train_texts = reviews_train
train_target = imdb_train.target
cv = 5
start = time.time()
svm, best_params, cv_results = train(train_texts, train_target, model, cv, parameters = parameters)
print("Training time: {:.1f} s".format(time.time() - start))

### Evaluating model

After training the model, we would like to evaluate the model performance

In [None]:
def evaluate(model, test, target):
    """
    Print the test accuracy and the classification report of a fitted model.

    Args:
        model: fitted estimator exposing predict()
        test: the test inputs passed to model.predict
        target: the true labels for `test`
    Returns:
        None; the results are only printed
    """
    # Predict once and reuse the result for both reports.
    predictions = model.predict(test)
    print("Test accuracy: {}".format(np.mean(predictions == target)))
    print("Classification report: {}".format(classification_report(target, predictions)))
    return

In [None]:
# Print test accuracy and the classification report for the model returned
# by train(), using the cleaned IMDB test reviews and their labels.
evaluate(svm, reviews_test, imdb_test.target)

### Best Hyperparameters for Each Model

With the training process done, we would like to highlight the best-performing model for each dataset.

In [None]:
# Display the best hyperparameters returned by train() in the training cell.
best_params

### Grid Search Results

So far, after cleaning the words and grid searching the hyperparameters, LinearSVC() reaches the highest test accuracy across all models for both the IMDB and 20 Newsgroup data.


* **IMDB** Best SVM parameters: {'svm__C': 0.1, 'svm__loss': 'hinge', 'svm__max_iter': 1000, 'svm__multi_class': 'ovr', 'svm__penalty': 'l2'}

    *Test score*: 0.8639



* **20 Newsgroup** Best SVM parameters: {'svm__C': 0.1, 'svm__loss': 'hinge', 'svm__max_iter': 2000, 'svm__multi_class': 'ovr', 'svm__penalty': 'l2'}

    *Test score*: 0.67950

### Best Model across all Best Models

We select the models that fit the two datasets best from hundreds of candidate models. Then we apply different CountVectorizer() parameters to seek better performance. After another grid search, for both the IMDB and 20 Newsgroup data, LinearSVC() reaches the highest test accuracy across all models.

* **IMDB** Best SVM parameters: {'svm__C': 10, 'svm__loss': 'hinge', 'svm__max_iter': 5000, 'svm__multi_class': 'ovr', 'svm__penalty': 'l2', 'vect__analyzer': 'word', 'vect__binary': True, 'vect__ngram_range': (1, 2), 'vect__strip_accents': 'unicode'}

    *Test score*: **0.89736** 



* **20 Newsgroup** Best SVM parameters: {'svm__C': 0.77, 'svm__loss': 'hinge', 'svm__max_iter': 5000, 'svm__multi_class': 'ovr', 'svm__penalty': 'l2', 'vect__analyzer': 'word', 'vect__binary': True, 'vect__ngram_range': (1, 1), 'vect__strip_accents': None}

    *Test score*: **0.699196176314392**

In [None]:
# Grid for another search that also covers CountVectorizer options
parameters = dict(
    # CountVectorizer ("vect" step) candidates
    vect__ngram_range=[(1, 1), (1, 2), (2, 2)],
    vect__binary=[True],
    vect__analyzer=['word'],
    vect__strip_accents=['unicode', None],
    # LinearSVC ("svm" step) candidates
    svm__C=[0.15, 0.77, 1, 10],
    svm__loss=['hinge'],
    svm__max_iter=[5000],
    svm__multi_class=['ovr'],
    svm__penalty=['l2'],
)

### Conclusion

So far, we have used a pipeline of NLP preprocessing techniques, machine learning classification models, grid search for the best hyperparameters, and model selection to guide our tuning and fitting. Both datasets reach their highest accuracy with the SVM model, though the best parameters differ slightly.