# Tutorial - Text Mining - Classification - NLTK

We will predict the category of discussion posts in a newsgroup.

**The unit of analysis is a discussion post**

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the newsgroup posts; the path is relative to the kernel's working directory.
news = pd.read_csv('news.csv')

In [None]:
# Preview the first five rows to confirm the file loaded as expected.
news.head(5)

## Assign the "target" variable

This is a multi-class classification problem. There are three categories we will predict:<br>
Whether a post is "graphics," "hockey," or "medical" related

In [None]:
# Class label for each post, taken from the 'newsgroup' column.
target = news['newsgroup']

## Assign the "text" (input) variable

In [None]:
# Check for missing values: the cleaning loops further down pass every TEXT value
# to re.sub, which only accepts strings, so this count should be zero.

news[['TEXT']].isna().sum()

In [None]:
# Raw post text used as the model input.
input_data = news['TEXT']

## Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the posts for testing; the fixed seed keeps the split reproducible.
train_set, test_set, train_y, test_y = train_test_split(
    input_data,
    target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Sanity check: training text and training labels should have the same number of rows.
train_set.shape, train_y.shape

In [None]:
# Sanity check: test text and test labels should have the same number of rows.
test_set.shape, test_y.shape

## NLTK

In order to use NLTK, you first need to install it. You can start the Anaconda Prompt and enter the following to do so: `pip install nltk`

NLTK gives you more control than the Keras Tokenizer or scikit-learn (but the idea is the same).

In [None]:
import nltk
from nltk.corpus import stopwords
import re

# Resources used by the cleaning cells below: the stop-word list, WordNet for
# lemmatization, and the Punkt models behind word_tokenize.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of
# 'punkt', and some releases need 'omw-1.4' next to WordNet. Without them
# word_tokenize / WordNetLemmatizer raise LookupError on a fresh install.
nltk.download('punkt_tab')
nltk.download('omw-1.4')

In [None]:
# Look at the raw training posts before any cleaning is applied.
train_set

In [None]:
#Create a blank list

new_train = []

# Build the stop-word set and the lemmatizer once, outside the loop. Calling
# stopwords.words('english') inside the comprehension repeats the lookup (and a
# linear list scan) for every token, which makes the loop needlessly slow.
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()


# For each row in train_set, we will read the text, tokenize it, remove stopwords, lemmatize it,
# and save it to the new list

for text in train_set:
    # Replace punctuation with spaces and lower-case the post
    text = re.sub(r'[!"#$%&()*+,-./:;<=>?[\]^_`{|}~]', ' ', text).lower()

    words = nltk.tokenize.word_tokenize(text)
    # Keep alphabetic tokens longer than two characters that are not stop words
    words = [w for w in words if w.isalpha() and len(w) > 2 and w not in stop_words]

    words = [lemmatizer.lemmatize(w) for w in words]
    new_train.append(' '.join(words))

In [None]:
# new_train holds one cleaned, space-joined string per training post.
# Show the item count and a short preview instead of dumping the whole list.

len(new_train), new_train[:5]

In [None]:
# Put the raw training text and its cleaned version side by side in one dataframe

train_set_df = train_set.to_frame().assign(new_text=new_train)

train_set_df

In [None]:
# Let's do the same for test data

new_test = []

# Build the stop-word set and the lemmatizer once, outside the loop. Calling
# stopwords.words('english') inside the comprehension repeats the lookup (and a
# linear list scan) for every token, which makes the loop needlessly slow.
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

for text in test_set:
    # Replace punctuation with spaces and lower-case the post
    text = re.sub(r'[!"#$%&()*+,-./:;<=>?[\]^_`{|}~]', ' ', text).lower()

    words = nltk.tokenize.word_tokenize(text)
    # Keep alphabetic tokens longer than two characters that are not stop words
    words = [w for w in words if w.isalpha() and len(w) > 2 and w not in stop_words]

    words = [lemmatizer.lemmatize(w) for w in words]
    new_test.append(' '.join(words))


# Keep the raw test text and its cleaned version side by side
test_set_df = pd.DataFrame(test_set)

test_set_df['new_text'] = new_test

test_set_df

## Use Scikit-Learn to create the term-by-doc matrix

In [None]:
# TfidfVectorizer includes pre-processing, tokenization, filtering stop words
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(
    stop_words='english',
    max_features=500,  # cap the number of term columns at 500
)

# Fit on the training text only, then build the training term matrix
train_x_tfidf = tfidf_vect.fit_transform(train_set_df['new_text'])

In [None]:
# Perform the TfidfVectorizer transformation
# Be careful: we reuse the vectorizer that was fitted on the train set to transform the
# test set. Otherwise, the test data features will be very different and will NOT match
# the train set!!!

test_x_tfidf = tfidf_vect.transform(test_set_df['new_text'])


In [None]:
# Both matrices should have the same number of columns, since they share one fitted vectorizer.
train_x_tfidf.shape, test_x_tfidf.shape

### We are not reducing the term matrix with SVD here, but you can if you want; it is a matter of preference.
### Also, note that we limited the columns to 500 (by selecting the most commonly occurring terms)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier 

from sklearn.metrics import accuracy_score

In [None]:
# Seed the forest so its randomised training gives the same model (and the same
# accuracy numbers) on every run; 42 matches the seed used for the train/test split.
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)

rnd_clf.fit(train_x_tfidf, train_y)



## Accuracy

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Train accuracy: score the random forest on the rows it was fitted on
train_y_pred = rnd_clf.predict(train_x_tfidf)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)

print('Train acc: {}'.format(train_acc))

In [None]:
# Test accuracy: score the random forest on the held-out posts
test_y_pred = rnd_clf.predict(test_x_tfidf)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)

print('Test acc: {}'.format(test_acc))

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

#Usually created on test set
# Predict with the random forest explicitly: test_y_pred is reassigned by the SGD
# section, so relying on it would show the wrong model's matrix whenever cells are
# run out of order. Rows are the true classes, columns the predicted classes.
confusion_matrix(test_y, rnd_clf.predict(test_x_tfidf))

## Stochastic Gradient Descent Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Seed the classifier so repeated runs give the same model; 42 matches the seed
# used for the train/test split. Training stops after at most 100 epochs, or
# earlier once the improvement falls below tol.
sgd_clf = SGDClassifier(max_iter=100, tol=1e-3, random_state=42)


In [None]:
# Fit the SGD classifier on the same TF-IDF training matrix used for the random forest.
sgd_clf.fit(train_x_tfidf, train_y)

## Accuracy

In [None]:
# Train accuracy: score the SGD classifier on the rows it was fitted on
train_y_pred = sgd_clf.predict(train_x_tfidf)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)

print('Train acc: {}'.format(train_acc))

In [None]:
# Test accuracy: score the SGD classifier on the held-out posts
test_y_pred = sgd_clf.predict(test_x_tfidf)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)

print('Test acc: {}'.format(test_acc))

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

#Usually created on test set
# Predict with the SGD classifier explicitly: test_y_pred is also assigned in the
# random forest section, so relying on it would show the wrong model's matrix
# whenever cells are run out of order. Rows are the true classes, columns the
# predicted classes.
confusion_matrix(test_y, sgd_clf.predict(test_x_tfidf))