[Reference: Part 1](https://towardsdatascience.com/introduction-to-nlp-part-1-preprocessing-text-in-python-8f007d44ca96) <br>
[Reference: Part 2](https://towardsdatascience.com/introduction-to-nlp-part-2-difference-between-lemmatisation-and-stemming-3789be1c55bc) <br>
[Reference: Part 3](https://towardsdatascience.com/introduction-to-nlp-part-3-tf-idf-explained-cedb1fc1f7dc)<br>
[Reference: Part 4](https://towardsdatascience.com/introduction-to-nlp-part-4-supervised-text-classification-model-in-python-96e9709b4267) <br>
[Reference: Converting CategorizedPlaintextCorpusReader into a dataframe (Stack Overflow)](https://stackoverflow.com/questions/46109166/converting-categorizedplaintextcorpusreader-into-dataframe)



In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import movie_reviews, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Fetch the NLTK resources this notebook relies on: the stopword lists,
# the WordNet data and the movie_reviews corpus.
import nltk
nltk.download('stopwords') 
nltk.download('wordnet')
nltk.download('movie_reviews')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

# Part 1: Preprocessing text in Python
- Tokenise
- Normalise
- Remove stopwords
- Count vectorise
- Transform to tf-idf representation

In [3]:
# Two short example documents that serve as the toy corpus for Part 1
part1 = """We are gathered here today on this joyous occasion to celebrate the special love that Monica and Chandler share. It is a love based on giving and receiving as well as having and sharing. And the love that they give and have is shared and received. And
through this having and giving and sharing and receiving, we too can share and love and have... and receive."""
part2 = """When I think of the love these two givers and receivers share I cannot help but envy the lifetime ahead of having and loving and giving and receiving."""

In [4]:
# Put both documents into a single-column dataframe, one row per document
X_train = pd.DataFrame({'speech': [part1, part2]})

In [5]:
def preprocess_text(text):
    """Tokenise, normalise and filter a document into a list of keywords.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    list of str
        Lowercased, verb-lemmatised tokens with English stopwords removed,
        in their original order (duplicates are kept).
    """
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)

    # Lowercase and lemmatise
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Remove stopwords. The list is fetched once and held in a set; calling
    # stopwords.words('english') inside the comprehension would re-evaluate it
    # for every single lemma.
    stop_words = set(stopwords.words('english'))
    keywords = [lemma for lemma in lemmas if lemma not in stop_words]
    return keywords

STEP 1. TOKENISE

In [6]:
# Import module
from nltk.tokenize import RegexpTokenizer

# Create an instance of RegexpTokenizer for alphanumeric tokens
# (the pattern r'\w+' keeps runs of word characters, so punctuation is dropped)
tokeniser = RegexpTokenizer(r'\w+')

# Tokenise 'part1' string; case is left untouched at this stage
tokens = tokeniser.tokenize(part1)
print(tokens)

['We', 'are', 'gathered', 'here', 'today', 'on', 'this', 'joyous', 'occasion', 'to', 'celebrate', 'the', 'special', 'love', 'that', 'Monica', 'and', 'Chandler', 'share', 'It', 'is', 'a', 'love', 'based', 'on', 'giving', 'and', 'receiving', 'as', 'well', 'as', 'having', 'and', 'sharing', 'And', 'the', 'love', 'that', 'they', 'give', 'and', 'have', 'is', 'shared', 'and', 'received', 'And', 'through', 'this', 'having', 'and', 'giving', 'and', 'sharing', 'and', 'receiving', 'we', 'too', 'can', 'share', 'and', 'love', 'and', 'have', 'and', 'receive']


STEP 2. NORMALISE



In [7]:
# Import module
from nltk.stem import WordNetLemmatizer

# Create an instance of WordNetLemmatizer
lemmatiser = WordNetLemmatizer()

# Lowercase and lemmatise tokens.
# pos='v' asks the lemmatiser to treat every token as a verb
# (e.g. 'are' -> 'be', 'gathered' -> 'gather' in the output below).
lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
print(lemmas)

['we', 'be', 'gather', 'here', 'today', 'on', 'this', 'joyous', 'occasion', 'to', 'celebrate', 'the', 'special', 'love', 'that', 'monica', 'and', 'chandler', 'share', 'it', 'be', 'a', 'love', 'base', 'on', 'give', 'and', 'receive', 'as', 'well', 'as', 'have', 'and', 'share', 'and', 'the', 'love', 'that', 'they', 'give', 'and', 'have', 'be', 'share', 'and', 'receive', 'and', 'through', 'this', 'have', 'and', 'give', 'and', 'share', 'and', 'receive', 'we', 'too', 'can', 'share', 'and', 'love', 'and', 'have', 'and', 'receive']


In [8]:
# Check how many words we have before stopword removal
len(lemmas)

66

STEP 3. REMOVE STOPWORDS

In [9]:
# Import module
from nltk.corpus import stopwords

# Check out how many English stopwords there are
print(len(stopwords.words('english')))

# See first 5 stopwords (shown as the cell's output)
stopwords.words('english')[:5]

179


['i', 'me', 'my', 'myself', 'we']

In [10]:
# Fetch the stopword list once and keep it in a set; calling
# stopwords.words('english') inside the comprehension would re-evaluate it
# for every lemma.
stop_words = set(stopwords.words('english'))

# Keep only the lemmas that are not stopwords (order and duplicates preserved)
keywords = [lemma for lemma in lemmas if lemma not in stop_words]
print(keywords)

['gather', 'today', 'joyous', 'occasion', 'celebrate', 'special', 'love', 'monica', 'chandler', 'share', 'love', 'base', 'give', 'receive', 'well', 'share', 'love', 'give', 'share', 'receive', 'give', 'share', 'receive', 'share', 'love', 'receive']


In [11]:
# Check how many words we have left after stopword removal
len(keywords)

26

STEP 4. COUNT VECTORISE

In [12]:
# Count how often each keyword occurs. Counter makes a single pass over the
# list (list.count would rescan it once per unique word) and yields the keys
# in first-occurrence order instead of arbitrary set order.
from collections import Counter

dict(Counter(keywords))

{'base': 1,
 'celebrate': 1,
 'chandler': 1,
 'gather': 1,
 'give': 3,
 'joyous': 1,
 'love': 4,
 'monica': 1,
 'occasion': 1,
 'receive': 4,
 'share': 5,
 'special': 1,
 'today': 1,
 'well': 1}

In [13]:
# Import module
from sklearn.feature_extraction.text import CountVectorizer
# Create an instance of CountVectorizer that uses preprocess_text as its analyzer
vectoriser = CountVectorizer(analyzer=preprocess_text)
# Fit to the data and transform to feature matrix.
# NOTE: X_train is rebound from the raw dataframe to the feature matrix here, so
# this cell cannot be re-run without first re-creating X_train from part1/part2.
X_train = vectoriser.fit_transform(X_train['speech'])

In [14]:
# Convert sparse matrix to dataframe
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)

# Save mapping on which index refers to which terms
# (vocabulary_ maps term -> column index, so invert it)
col_map = {index: term for term, index in vectoriser.vocabulary_.items()}

# Rename all columns in a single call rather than one in-place rename per column
X_train = X_train.rename(columns=col_map)
X_train

Unnamed: 0,ahead,base,cannot,celebrate,chandler,envy,gather,give,givers,help,joyous,lifetime,love,monica,occasion,receive,receivers,share,special,think,today,two,well
0,0,1,0,1,1,0,1,3,0,0,1,0,4,1,1,4,0,5,1,0,1,0,1
1,1,0,1,0,0,1,0,1,1,1,0,1,2,0,0,1,1,1,0,1,0,1,0


STEP 5. TRANSFORM TO TF-IDF REPRESENTATION

In [15]:
# Import module
from sklearn.feature_extraction.text import TfidfTransformer

# Create an instance of TfidfTransformer
transformer = TfidfTransformer()

# Fit to the data and transform to tf-idf
X_train = pd.DataFrame(transformer.fit_transform(X_train).toarray(), columns=X_train.columns)
X_train

Unnamed: 0,ahead,base,cannot,celebrate,chandler,envy,gather,give,givers,help,joyous,lifetime,love,monica,occasion,receive,receivers,share,special,think,today,two,well
0,0.0,0.151773,0.0,0.151773,0.151773,0.0,0.151773,0.323963,0.0,0.0,0.151773,0.0,0.431951,0.151773,0.151773,0.431951,0.0,0.539939,0.151773,0.0,0.151773,0.0,0.151773
1,0.28235,0.0,0.28235,0.0,0.0,0.28235,0.0,0.200894,0.28235,0.28235,0.0,0.28235,0.401788,0.0,0.0,0.200894,0.28235,0.200894,0.0,0.28235,0.0,0.28235,0.0


In [16]:
# Rebuild the raw single-column dataframe, one row per document
X_train = pd.DataFrame({'speech': [part1, part2]})

In [17]:
# Create an instance of TfidfVectorizer that uses preprocess_text as its analyzer
vectoriser = TfidfVectorizer(analyzer=preprocess_text)

# Fit to the data and transform to feature matrix
X_train = vectoriser.fit_transform(X_train['speech'])

# Convert sparse matrix to dataframe
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)

# Save mapping on which index refers to which words
# (vocabulary_ maps word -> column index, so invert it)
col_map = {index: word for word, index in vectoriser.vocabulary_.items()}

# Rename all columns in a single call rather than one in-place rename per column
X_train = X_train.rename(columns=col_map)
X_train

Unnamed: 0,ahead,base,cannot,celebrate,chandler,envy,gather,give,givers,help,joyous,lifetime,love,monica,occasion,receive,receivers,share,special,think,today,two,well
0,0.0,0.151773,0.0,0.151773,0.151773,0.0,0.151773,0.323963,0.0,0.0,0.151773,0.0,0.431951,0.151773,0.151773,0.431951,0.0,0.539939,0.151773,0.0,0.151773,0.0,0.151773
1,0.28235,0.0,0.28235,0.0,0.0,0.28235,0.0,0.200894,0.28235,0.28235,0.0,0.28235,0.401788,0.0,0.0,0.200894,0.28235,0.200894,0.0,0.28235,0.0,0.28235,0.0


More efficient way

In [18]:
# Rebuild the raw single-column dataframe, one row per document
X_train = pd.DataFrame({'speech': [part1, part2]})

In [19]:
# Import module
from sklearn.feature_extraction.text import TfidfVectorizer

# Create an instance of TfidfVectorizer that uses preprocess_text as its analyzer
vectoriser = TfidfVectorizer(analyzer=preprocess_text)

# Fit to the data and transform to tf-idf in one step
# (no separate CountVectorizer + TfidfTransformer stage)
X_train = vectoriser.fit_transform(X_train['speech'])

# Part 2: Difference between lemmatisation and stemming

In [20]:
# Import packages
import pandas as pd
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

# Instantiate stemmers and lemmatiser.
# These module-level instances are the ones normalise_text() below uses.
porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatiser = WordNetLemmatizer()

# Create function that normalises text using all three techniques
def normalise_text(words):
    """Run every word through both stemmers and the lemmatiser.

    Returns a dataframe indexed by the input words with one column per
    technique: 'Porter', 'Lancaster' and 'Lemmatiser'.
    """
    # One callable per output column; the lemmatiser treats each word as a verb
    normalisers = {
        'Porter': porter.stem,
        'Lancaster': lancaster.stem,
        'Lemmatiser': lambda word: lemmatiser.lemmatize(word, pos='v'),
    }
    # Build each column in full, then assemble the table in one go
    table = {name: [normalise(word) for word in words]
             for name, normalise in normalisers.items()}
    return pd.DataFrame(table, index=words, columns=list(normalisers))

In [21]:
# Compare the three techniques on two sample words
normalise_text(["stemming", "lemmatisation"])

Unnamed: 0,Porter,Lancaster,Lemmatiser
stemming,stem,stem,stem
lemmatisation,lemmatis,lem,lemmatisation


# Part 3: TF-IDF explained

In [22]:
# Two raw example documents used as the training corpus for Part 3
d1 = 'I thought, I thought of thinking of thanking you for the gift'
d2 = 'She was thinking of going to go and get you a GIFT!'

In [23]:
# Put both documents into a single-column dataframe, one row per document
X_train = pd.Series([d1, d2], name='text').to_frame()

In [24]:
def preprocess_text(text):
    """Tokenise, normalise and filter a document into a list of keywords.

    Same logic as the definition in Part 1.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    list of str
        Lowercased, verb-lemmatised tokens with English stopwords removed,
        in their original order (duplicates are kept).
    """
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)

    # Lowercase and lemmatise
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Remove stopwords. The list is fetched once and held in a set; calling
    # stopwords.words('english') inside the comprehension would re-evaluate it
    # for every single lemma.
    stop_words = set(stopwords.words('english'))
    keywords = [lemma for lemma in lemmas if lemma not in stop_words]
    return keywords

In [25]:
# Hand-written token lists showing what preprocessing leaves of each document.
# (The comma previously sat inside the string literal -- 'thank,' 'gift' -- so
# Python concatenated the two adjacent literals into the single item 'thank,gift'.)
# Note: this rebinds d1/d2 from the raw strings defined earlier in Part 3.
d1 = ['think', 'think', 'think', 'thank', 'gift']
d2 = ['think', 'go', 'go', 'get', 'gift']

In [26]:
# Create an instance of TfidfVectorizer that uses preprocess_text as its analyzer
vectoriser = TfidfVectorizer(analyzer=preprocess_text)

# Fit to the data and transform to feature matrix
X_train = vectoriser.fit_transform(X_train['text'])

# Convert sparse matrix to dataframe
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)

# Save mapping on which index refers to which words
# (vocabulary_ maps word -> column index, so invert it);
# the test-set cell below reuses this mapping
col_map = {index: word for word, index in vectoriser.vocabulary_.items()}

# Rename all columns in a single call rather than one in-place rename per column
X_train = X_train.rename(columns=col_map)
X_train

Unnamed: 0,get,gift,go,thank,think
0,0.0,0.288972,0.0,0.40614,0.866917
1,0.407824,0.29017,0.815648,0.0,0.29017


In [27]:
# Two unseen documents to run through the already-fitted vectoriser
d3 = "He thinks he will go!"
d4 = "They don’t know what to buy!"

# Create dataframe
X_test = pd.DataFrame({'text': [d3, d4]})

# Transform to feature matrix (transform only -- the vectoriser is not refitted)
X_test = vectoriser.transform(X_test['text'])

# Convert sparse matrix to dataframe
X_test = pd.DataFrame.sparse.from_spmatrix(X_test)

# Add column names to make it more readable, reusing the index -> word mapping
# built from the fitted vectoriser; one rename call covers every column
X_test = X_test.rename(columns=col_map)
X_test

Unnamed: 0,get,gift,go,thank,think
0,0.0,0.0,0.814802,0.0,0.579739
1,0.0,0.0,0.0,0.0,0.0


In [28]:
# Hand-written token lists for the two test documents (rebinds d3/d4 from the raw strings)
d3 = ['think', 'go'] # the vectoriser is familiar with these terms
d4 = ['know', 'buy'] # the vectoriser is not familiar with these terms

# Part 4: Supervised text classification model in Python

## 1. Prepare Data

In [29]:
# Pair every review's raw text with its sentiment tag. File ids have the form
# '<tag>/<filename>'; only the tag part is needed, so the filename is not kept.
reviews = [(fileid.split('/')[0], movie_reviews.raw(fileid))
           for fileid in movie_reviews.fileids()]
sample = pd.DataFrame(reviews, columns=['target', 'document'])
print(f'Dimensions: {sample.shape}')
sample.head()

Dimensions: (2000, 2)


Unnamed: 0,target,document
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [30]:
# Check how many reviews carry each sentiment tag
sample['target'].value_counts()

pos    1000
neg    1000
Name: target, dtype: int64

In [31]:
# Encode the sentiment labels as integers: 'pos' -> 1, anything else -> 0.
# The dtype guard makes the cell safe to re-run: once the column is numeric a
# second pass would compare integers against 'pos' and zero out every label.
if not pd.api.types.is_numeric_dtype(sample['target']):
    sample['target'] = np.where(sample['target'] == 'pos', 1, 0)
sample['target'].value_counts()

1    1000
0    1000
Name: target, dtype: int64

## 2. Split Data

In [32]:
# Hold out 30% of the reviews for testing; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(sample['document'], sample['target'], test_size=0.3, random_state=123)

print(f'Train dimensions: {X_train.shape, y_train.shape}')
print(f'Test dimensions: {X_test.shape, y_test.shape}')

# Check out target distribution in each partition
print(y_train.value_counts())
print(y_test.value_counts())

Train dimensions: ((1400,), (1400,))
Test dimensions: ((600,), (600,))
1    700
0    700
Name: target, dtype: int64
1    300
0    300
Name: target, dtype: int64


In [33]:
def preprocess_text(text):
    """Tokenise, normalise and filter a document into a list of keywords.

    Same logic as the definitions in Parts 1 and 3.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    list of str
        Lowercased, verb-lemmatised tokens with English stopwords removed,
        in their original order (duplicates are kept).
    """
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)

    # Lowercase and lemmatise
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Remove stopwords. The list is fetched once and held in a set; calling
    # stopwords.words('english') inside the comprehension would re-evaluate it
    # for every single lemma of every review.
    stop_words = set(stopwords.words('english'))
    keywords = [lemma for lemma in lemmas if lemma not in stop_words]
    return keywords
    
# Create an instance of TfidfVectorizer that uses preprocess_text as its analyzer
vectoriser = TfidfVectorizer(analyzer=preprocess_text)

# Fit to the training documents only and transform them to a feature matrix
X_train_tfidf = vectoriser.fit_transform(X_train)
# Rows = training documents, columns = vocabulary terms
X_train_tfidf.shape

(1400, 27676)

## 3. Let's do modelling

Let’s build a baseline model using a Stochastic Gradient Descent classifier. I have chosen this classifier because it is fast and works well with sparse matrices. Using 5-fold cross-validation, let’s fit the model to the data and evaluate it.

In [34]:
# Baseline: SGD classifier with default settings, seeded for repeatability,
# scored with 5-fold cross-validation on the training tf-idf matrix
sgd_clf = SGDClassifier(random_state=123)
sgf_clf_scores = cross_val_score(sgd_clf, X_train_tfidf, y_train, cv=5)

print(sgf_clf_scores)
# Report the mean score with a +/- band of two standard deviations across folds
print("Accuracy: %0.2f (+/- %0.2f)" % (sgf_clf_scores.mean(), sgf_clf_scores.std() * 2))

[0.82857143 0.85       0.84285714 0.81785714 0.81428571]
Accuracy: 0.83 (+/- 0.03)


In [35]:
# Collect cross-validated predictions for every training document and
# compare them with the true labels (rows = actual, columns = predicted)
sgf_clf_pred = cross_val_predict(sgd_clf, X_train_tfidf, y_train, cv=5)
print(confusion_matrix(y_train, sgf_clf_pred))

[[580 120]
 [117 583]]


In [36]:
# Hyperparameter grid for the SGD classifier: 2 x 2 x 3 x 3 = 36 combinations,
# each evaluated with 5-fold cross-validation on the training tf-idf matrix.
# NOTE(review): newer scikit-learn releases appear to spell these options
# 'log_loss' and None rather than 'log' and 'none' -- confirm against the
# installed version before re-running.
grid = {'fit_intercept': [True,False],
        'early_stopping': [True, False],
        'loss' : ['hinge', 'log', 'squared_hinge'],
        'penalty' : ['l2', 'l1', 'none']}
search = GridSearchCV(estimator=sgd_clf, param_grid=grid, cv=5)
search.fit(X_train_tfidf, y_train)
# Show the best-scoring combination
search.best_params_

{'early_stopping': False,
 'fit_intercept': False,
 'loss': 'log',
 'penalty': 'l1'}

In [37]:
# Re-score the tuned estimator with 5-fold cross-validation.
# NOTE(review): this uses the same training data the grid search selected on,
# so the figure may be somewhat optimistic.
grid_sgd_clf_scores = cross_val_score(search.best_estimator_, X_train_tfidf, y_train, cv=5)
print(grid_sgd_clf_scores)
# Report the mean score with a +/- band of two standard deviations across folds
print("Accuracy: %0.2f (+/- %0.2f)" % (grid_sgd_clf_scores.mean(), grid_sgd_clf_scores.std() * 2))

[0.85       0.85714286 0.83571429 0.84285714 0.82857143]
Accuracy: 0.84 (+/- 0.02)


## 4. Finalize the model

In [38]:
# Chain the tf-idf vectoriser and the tuned classifier so that raw documents
# can be passed straight in; fitting is done on the raw training text
pipe = Pipeline([('vectoriser', vectoriser),
                 ('classifier', search.best_estimator_)])
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectoriser',
                 TfidfVectorizer(analyzer=<function preprocess_text at 0x7f0377222d90>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents...
                ('classifier',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=False, l1_ratio=0.15,
                               learning_rate='optimal', loss='log',
                               max_iter=1000, 

In [39]:
# Evaluate the fitted pipeline on the held-out test documents
y_test_pred = pipe.predict(X_test)
print("Accuracy: %0.2f" % (accuracy_score(y_test, y_test_pred)))
# Rows = actual class, columns = predicted class
print(confusion_matrix(y_test, y_test_pred))

Accuracy: 0.85
[[249  51]
 [ 37 263]]
