# Text Classification
## This notebook outlines the usage of NLP Feature extraction (CountVectorizer, TfidfVectorizer, word2vec, doc2vec) in classification of text documents

### Import all the necessary libraries

In [1]:
from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from pprint import pprint
import pandas as pd


### Fetch documents for these 2 categories

In [2]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [3]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


In [4]:
train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
print(f"{len(train_data.filenames)} documents")
print(f"{len(train_data.target_names)} categories")
print()

857 documents
2 categories



In [5]:
type(train_data.data)

list

In [6]:
train_data.target_names

['alt.atheism', 'talk.religion.misc']

In [7]:
train_df = pd.DataFrame([train_data.data, train_data.target.tolist()]).T
train_df.columns = ['text', 'target']
train_df.head(20)

Unnamed: 0,text,target
0,"Benedikt Rosenau writes, with great authority:...",0
1,\n[...stuff deleted...]\n\nComputers are an ex...,0
2,\n[deletia]\n\n\n In the deletions somewh...,0
3,...\n\n\tBefore or after his kids were shot?\n...,1
4,\n,1
5,"\n(Deletion)\n \nNo cookies, Charlie. The cla...",0
6,\n(Deletion)\n \nNo it in the way it is usual...,0
7,\n\n[...stiff deleted...]\n\n\n[...stiff delet...,1
8,(excess stuff deleted...)\n\n \n\nI know of...,0
9,[deletions]\n[deletions]\n\n As you have prese...,0


In [8]:
test_data = fetch_20newsgroups(subset='test',
                                    remove=('headers', 'footers', 'quotes'),
                                    categories=categories)
len(test_data.target)

570

In [9]:
test_df = pd.DataFrame([test_data.data, test_data.target.tolist()]).T
test_df.columns = ['text', 'target']
test_df.head(20)

Unnamed: 0,text,target
0,\nThrow away the FAQ. We can all just ask Mr...,0
1,": At the risk of beginning a cascade, I'll sta...",0
2,\n: But how do we know that you're representin...,0
3,"\n Thanks for the etymology lesson, but I act...",1
4,\n\n\n\nI think this should be illuminating to...,0
5,\n\n\n\n\nBzzt. Thank you for playing.\n\nYou'...,1
6,\n\nNo. Zeno's paradox is resolved by showing...,1
7,\nYou can tell your friend from me that I was ...,0
8,"\nJesus did and so do I.\n\nPeace be with you,",1
9,In the King James Version of the Bible there a...,1


### Preprocess the text

In [10]:
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.utils import simple_preprocess

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')

STOPWORDS = set(stopwords.words('english'))
STOPWORDS.update(["github", "action", "pull", "request"])

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def preprocess(df):
    
    def remove_urls(text):
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', text)

    # Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
    def remove_emoji(text):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    def remove_html(text):
        html_pattern = re.compile('<.*?>')
        return html_pattern.sub(r'', text)

    def remove_punctuation(text):
        return text.translate(str.maketrans('', '', '!"#$%&\'()*+,./:;<=>?@[\]^_`{|}~'))

    def remove_digits(text):
        return text.translate(str.maketrans('', '', string.digits))

    def lemmatize_words(text):
        pos_tagged_text = nltk.pos_tag(text.split())
        return " ".join(
            [lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in STOPWORDS])
    

    df = df.str.lower()  # lowercase
    df = df.apply(lambda text: remove_urls(text))  # remove urls
    df = df.apply(lambda text: remove_emoji(text))  # remove emojis
    df = df.apply(lambda text: remove_html(text))  # remove html
    df = df.apply(lambda text: remove_punctuation(text))  # remove punctuation
    #df = df.apply(lambda text: remove_digits(text))  # remove digits
    df = df.apply(lambda text: remove_stopwords(text))  # remove stopwords
    df = df.apply(lambda text: lemmatize_words(text))  # lemmatize
    #df = df.apply(lambda text: nltk.word_tokenize(text))

    return df

In [12]:
train_processed = preprocess(train_df["text"])
train_corpus = train_processed.tolist()
train_corpus

['benedikt rosenau write great authority contradictory property language correct thing define contradictory language exist object definition reality amend thing describe contradictory language exist weve come something plainly false failure description merely failures description im objectivist remember',
 'stuff delete computer excellent exampleof evolution without creator create computer create sand go silicon go integrated circuit go processor board take thing put together interesting way like plant create oxygen use light photosynthesis much big leap talk something create everything nothing find unfathomable resort believe creator much simpler alternative exist simply incapable understanding beginning -- even beginning thats ok present keep perfectly busy',
 'deletia deletion somewhere mention something chop hand punishment theft saudi arabia assume wouldnt know assume do people fit requirement muslim find highly likely would please try convince bobby mozumder muslim chop people ha

In [14]:
test_processed = preprocess(test_df["text"])

### Define text feature extractors 

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from gensim.models import Word2Vec
from gensim.models import Doc2Vec
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [24]:
#takes processed Series as input
def TF_IDF(train, test):
    train_corpus = train.tolist()
    test_corpus = test.tolist()
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_corpus).toarray()
    X_test = vectorizer.transform(test_corpus).toarray()
    return X_train, X_test

In [17]:
def CountVect(train, test):
    train_corpus = train.tolist()
    test_corpus = test.tolist()
    count_vect = CountVectorizer()
    X_train = count_vect.fit_transform(train_corpus)
    X_test = count_vect.transform(test_corpus)
    #print(X_train.shape[0])
    return X_train, X_test

In [55]:
def word2vect(train, test):
    
    #tokenize first
    train_corpus = train.apply(lambda text: nltk.word_tokenize(text))
    test_corpus = test.apply(lambda text: nltk.word_tokenize(text))
    corpus = train_corpus.tolist() + test_corpus.tolist()
    #len(corpus)
    model = Word2Vec()
    model.build_vocab(corpus)  # prepare the model vocabulary
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)  # train word vectors

    #model.wv.index_to_key
    #model.wv.most_similar("king")

    words = set(model.wv.index_to_key)
    
    X_train = train_corpus
    X_test = test_corpus
    
    arr=np.array([model.wv[i] for i in words])
    X_train_vect = np.array([arr for ls in X_train])
    X_test_vect = np.array([arr for ls in X_test])  

#     X_train_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X_train])
#     X_test_vect = np.array([np.array([model.wv[i] for i in ls if i in words]) for ls in X_test])  
           
    #Why is the length of the sentence different than the length of the sentence vector?
    # for i, v in enumerate(X_train_vect):
    #     print(len(X_train.iloc[i]), len(v))      

    # Compute sentence vectors by averaging the word vectors
           
    X_train_vect_avg = []
    for v in X_train_vect:
        if v.size:
            X_train_vect_avg.append(v.mean(axis=0))
        else:
            X_train_vect_avg.append(np.zeros(100, dtype=float))

    X_test_vect_avg = []
    for v in X_test_vect:
        if v.size:
            X_test_vect_avg.append(v.mean(axis=0))
        else:
            X_test_vect_avg.append(np.zeros(100, dtype=float))
    #for negaitve values
    scaler = MinMaxScaler()
    X_norm = scaler.fit_transform(X_train_vect_avg)
    X_test = scaler.transform(X_test_vect_avg)
    return X_norm, X_test

In [20]:
def doc2vect(train, test):
    
    #tokenize first
    train_corpus = train.apply(lambda text: nltk.word_tokenize(text))
    test_corpus = test.apply(lambda text: nltk.word_tokenize(text))
    corpus = train_corpus.tolist() + test_corpus.tolist()
    
    import multiprocessing
    num_cores = multiprocessing.cpu_count()
    from gensim.models.doc2vec import TaggedDocument,Doc2Vec

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]
    model = Doc2Vec(documents,vector_size=5, window=2, min_count=1, workers=num_cores)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count,epochs=10)

    def vector_for_learning(model, input_docs):
        sents = input_docs
        targets, feature_vectors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in sents])
        return targets, feature_vectors

    train_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_corpus)]
    test_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(test_corpus)]

    _, X_train = vector_for_learning(model, train_documents)
    _, X_test = vector_for_learning(model, test_documents)
     #for negaitve values
    scaler = MinMaxScaler()
    X_norm = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_norm, X_test

### Benchmarking with classifiers and feature extractors

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [56]:
feat_extractors = [TF_IDF, CountVect, word2vect, doc2vect]
classifiers = [MultinomialNB, LogisticRegression,svm.SVC, DecisionTreeClassifier]
for clf  in classifiers:
    results =[]
    for fn in feat_extractors:
        X_train, X_test = fn(train_processed, test_processed)
        model = clf()
        model.fit(X_train, train_data.target)
        pred = model.predict(X_test)
        score = accuracy_score(test_data.target, pred)
        #print(f"{clf.__name__} | {fn.__name__}  | {round(score, 3)} ")
        results.append(score)

    for fn, acc in zip(feat_extractors, results):
        print(f"For classifier: {clf.__name__} | feature extractor: {fn.__name__}  | Accuracy: {round(acc, 3)} ")
    print("\n")



For classifier: MultinomialNB | feature extractor: TF_IDF  | Accuracy: 0.633 
For classifier: MultinomialNB | feature extractor: CountVect  | Accuracy: 0.712 
For classifier: MultinomialNB | feature extractor: word2vect  | Accuracy: 0.56 
For classifier: MultinomialNB | feature extractor: doc2vect  | Accuracy: 0.561 


For classifier: LogisticRegression | feature extractor: TF_IDF  | Accuracy: 0.691 
For classifier: LogisticRegression | feature extractor: CountVect  | Accuracy: 0.668 
For classifier: LogisticRegression | feature extractor: word2vect  | Accuracy: 0.56 
For classifier: LogisticRegression | feature extractor: doc2vect  | Accuracy: 0.596 


For classifier: SVC | feature extractor: TF_IDF  | Accuracy: 0.684 
For classifier: SVC | feature extractor: CountVect  | Accuracy: 0.586 
For classifier: SVC | feature extractor: word2vect  | Accuracy: 0.56 
For classifier: SVC | feature extractor: doc2vect  | Accuracy: 0.596 


For classifier: DecisionTreeClassifier | feature extracto