## Data preprocess using Natural Language Toolkit

### Import libraries

In [1]:
import pandas as pd
import numpy as np
# module provides regular expression matching operations (https://www.w3schools.com/python/python_regex.asp)
import re
# module implements binary protocols for serializing and de-serializing a Python object
import pickle


import string

# (https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/)
# natural language toolkit (https://realpython.com/nltk-nlp-python/)
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# filter common words
from nltk.corpus import stopwords
# split text into words
from nltk.tokenize import word_tokenize
# nltk.download('averaged_perceptron_tagger')
# reduce words to their root by cutting common suffixes and prefixes (ex. goes -> goe)
from nltk.stem.porter import PorterStemmer
# reduce words to their root by transformin to initial form with ectual meaning (ex. goes -> go )
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# nltk.download('omw-1.4')

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV

# # Silent warnings
# import warnings
# warnings.filterwarnings('ignore')
# warnings.simplefilter('ignore')

In [2]:
raw_df = pd.read_csv('eda_news_dataset.csv', index_col=False)
raw_df.sample(3)

Unnamed: 0,text,target
18931,- The U.S. Drug Enforcement Administration’s ...,0
5264,"Voting rights expert, Ari Berman, sat down wit...",1
25409,- A behind-the-scenes congressional battle to...,0


In [3]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38589 entries, 0 to 38588
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    38589 non-null  object
 1   target  38589 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 603.1+ KB


In [4]:
print(raw_df.shape)
raw_df.isnull().sum()

(38589, 2)


text      0
target    0
dtype: int64

### Clean text of stopwords (common used (english) words) and punctuation, fold to lowercase, lemmatize to stem

In [5]:
# simplifies position of speech tags
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))    
    # output of string.punctuation !"#$%&'()*+, -./:;<=>?@[\]^_`{|}~
    punctuation = string.punctuation
    lemma = WordNetLemmatizer()
    
    # tokenize text into words, clean of punctuation and stopwords
    text = word_tokenize(text)
    text = [word.casefold() for word in text if word not in (punctuation or stop_words)]  
    
    # tokenize text including position of speech
    text = nltk.pos_tag(text)    
    text = list(map(lambda x: (x[0], pos_tagger(x[1])), text))
    
    # lemmatize words with respect to position of speech
    text_ = []    
    for word, tag in text:
        if tag is None:
            text_.append(word)
        else:
            text_.append(lemma.lemmatize(word, tag))
    text = ' '.join(text_) 
    return text   

In [6]:
# test = "- Indonesia will buy 11 Sukhoi fighters jets'  goes dogs revealed "
# test = preprocess_text(test)
# test

In [9]:
raw_df.text = raw_df.text.map(lambda text_: preprocess_text(text_))
raw_df.sample(3)

Unnamed: 0,text,target
18996,u.s. health secretary tom price ’ s use of pri...,0
13089,it s almost like our community agitator in chi...,1
8934,on tuesday s edition of the nightly show democ...,1


### Save nltk preprocessed dataframe

In [10]:
raw_df.to_csv('preprocessed_news_dataset.csv', index=False)
raw_df.head()

Unnamed: 0,text,target
0,donald trump just couldn t wish all americans ...,1
1,house intelligence committee chairman devin nu...,1
2,on friday it be reveal that former milwaukee s...,1
3,on christmas day donald trump announce that he...,1
4,pope francis use his annual christmas day mess...,1
