**TSWA PRACTICAL 7 - Text Normalization**

In [None]:
!pip install nltk



In [None]:
import functools
import re
import string

import nltk

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.corpus import wordnet

In [None]:
# Download the NLTK resources used below: tokenizer models, the stopword list, the POS tagger
# model and WordNet.
# NOTE: recent NLTK releases load 'punkt_tab' and 'averaged_perceptron_tagger_eng' in place of
# the older 'punkt' / 'averaged_perceptron_tagger' packages. The pip install above is unpinned,
# so both generations are requested to keep word_tokenize and pos_tag working on either.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import contractions

In [None]:
def contraction_remover(text):
    """Expand the shortened (contracted) words in ``text``.

    The text is split on whitespace, every word is passed through
    ``contractions.fix`` and the results are joined back with single spaces,
    so any run of whitespace in the input becomes one space.

    Args:
        text (str): Input text.

    Returns:
        str: The re-joined text; '' when the input is empty or whitespace-only.
    """
    # The expanded text used to be assembled and then dropped, so every call returned None.
    return ' '.join(contractions.fix(word) for word in text.split())

In [None]:
def _clean_article_text(text):
    """Apply the character-level cleaning steps to one article and return the cleaned string."""
    text = re.sub(r'<.*?>', '', text)              # HTML tags
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', '', text)   # URLs
    text = re.sub(r'\S+@\S+', '', text)            # email addresses
    text = re.sub(r'\d{10}', '', text)             # phone numbers (10 consecutive digits)

    # Handle negation: glue "not" to the following word ("not good" -> "not_good").
    # The previous pattern r'\bnot\b(\w+)' could never match, because a word boundary
    # right after "not" cannot be followed directly by a word character.
    # Underscores already in the text are dropped first (they were always deleted by the
    # punctuation step), so the only underscores left are the negation markers.
    text = text.replace('_', '')
    text = re.sub(r'\bnot\s+(\w+)', r'not_\1', text)

    # Remove special characters and punctuation, keeping the negation underscore.
    # The earlier second pass over string.punctuation is gone: after this substitution it
    # had nothing left to remove.
    text = re.sub(r'[^a-zA-Z0-9_\s]', '', text)

    # Remove tokens made of digits only.
    text = re.sub(r'\b\d+\b', '', text)
    return text


def normalize_corpus(df):
    """Normalize the text in ``df['Article']`` and store the results on ``df``.

    Steps, in order: strip HTML tags, lowercase, remove URLs / email addresses /
    10-digit numbers, mark negation ("not x" -> "not_x"), remove punctuation and
    numeric tokens, tokenize, drop English stopwords, stem, lemmatize.

    The frame is modified in place (callers rely on this) and also returned.
    Three columns are written:

    * ``'Preprocess_Article'`` - cleaned text before tokenization
    * ``'tokens'``             - list of stemmed and lemmatized tokens
    * ``'Clean Article'``      - the tokens joined with single spaces

    ``get_wordnet_pos`` must be defined before this function is called.

    Args:
        df (pandas.DataFrame): Frame with an ``'Article'`` column of text.

    Returns:
        pandas.DataFrame: The same ``df`` object, with the columns above added.
    """
    # Missing or non-string articles are treated as empty text instead of crashing re.sub.
    articles = df['Article'].fillna('').astype(str)
    df['Preprocess_Article'] = articles.apply(_clean_article_text)

    # Tokenization
    df['tokens'] = df['Preprocess_Article'].apply(word_tokenize)

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def _normalize_tokens(tokens):
        # Stopword removal -> stemming -> lemmatization, one pass per document.
        stems = [stemmer.stem(tok) for tok in tokens if tok not in stop_words]
        return [lemmatizer.lemmatize(tok, get_wordnet_pos(tok)) for tok in stems]

    df['tokens'] = df['tokens'].apply(_normalize_tokens)

    # Convert tokens into a single string
    df['Clean Article'] = df['tokens'].apply(' '.join)

    return df

In [None]:
# Helper function to map POS tag to WordNet POS tag
@functools.lru_cache(maxsize=None)
def get_wordnet_pos(word):
  """Return the WordNet POS constant to use when lemmatizing ``word``.

  The word is tagged on its own (no sentence context) with ``nltk.pos_tag``; the
  first letter of the resulting tag selects adjective (J), noun (N), verb (V) or
  adverb (R). Any other tag falls back to noun.

  Results are memoized per distinct word: the answer depends only on the word,
  and normalize_corpus asks for every token occurrence in the corpus, so without
  the cache the tagger runs again for each repeat of the same word.

  Args:
    word (str): A single token.

  Returns:
    One of wordnet.ADJ, wordnet.NOUN, wordnet.VERB, wordnet.ADV.
  """
  tag = nltk.pos_tag([word])[0][1][0].upper()
  tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
  return tag_dict.get(tag, wordnet.NOUN)

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Fetch data
# All 20 newsgroups, shuffled, with headers, footers and quoted replies stripped from each post.
# (A preceding plain fetch_20newsgroups(subset='all') call was removed: its result was
# overwritten by this one straight away, so it only loaded the dataset a second time.)
data = fetch_20newsgroups(subset='all', shuffle=True, remove=('headers', 'footers', 'quotes'))

# Map each integer class index to its newsgroup name.
data_labels_map = dict(enumerate(data.target_names))

In [None]:
# Create objects for each package
import numpy as np
#import text_normalizer as tn
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Assemble one DataFrame row per newsgroup post: the raw text, its class index
# and the newsgroup name looked up from that index.
corpus = data.data
target_labels = data.target
target_names = [data_labels_map[label_id] for label_id in target_labels]
data_df = pd.DataFrame(
    {
        'Article': corpus,
        'Target Label': target_labels,
        'Target Name': target_names,
    }
)
print(data_df.shape)
data_df.head(10)

In [None]:
# Column dtypes and non-null counts of the frame before normalize_corpus adds its columns.
data_df.info()

In [None]:
# normalize_corpus writes its new columns onto data_df itself and returns that same frame,
# so the result is not assigned here; as the cell's last expression it is simply displayed.
normalize_corpus(data_df)

In [None]:
# Same summary again, now including the columns written by normalize_corpus.
data_df.info()

In [None]:
# Keep the raw text, the normalized text and the two label columns, then preview the result.
selected_columns = ['Article', 'Clean Article', 'Target Label', 'Target Name']
data_df = data_df[selected_columns]
data_df.head(10)

In [None]:
# Turn cells that are empty or contain only whitespace into NaN so they show up as missing.
data_df = data_df.replace(to_replace=r'^(\s?)+$', value=np.nan, regex=True)
data_df.info()

In [None]:
# Discard every row that has a missing value, then renumber the index from 0.
data_df = data_df.dropna()
data_df = data_df.reset_index(drop=True)
data_df.info()

In [None]:
# Save the cleaned documents to a CSV file so they can be reused without re-running the
# normalization.
data_df.to_csv('clean_newsgroups.csv', index=False)

In [None]:
# Data - training and testing
from sklearn.model_selection import train_test_split

In [None]:
# Split text, class indices and class names with the same row partition:
# 33% of the rows go to the test side, with a fixed seed for repeatability.
(train_corpus, test_corpus,
 train_label_nums, test_label_nums,
 train_label_names, test_label_names) = train_test_split(
    np.array(data_df['Clean Article']),
    np.array(data_df['Target Label']),
    np.array(data_df['Target Name']),
    test_size=0.33,
    random_state=42,
)

In [None]:
# Create the dictionary for train and test data
from collections import Counter

In [None]:
# Number of documents per newsgroup name in the training split (trd) and the test split (tsd).
trd, tsd = (
    dict(Counter(label_names))
    for label_names in (train_label_names, test_label_names)
)

In [None]:
# Training-split document count per newsgroup name.
trd

In [None]:
# Test-split document count per newsgroup name.
tsd

In [None]:
# Per-newsgroup document counts in both splits, largest training classes first.
# tsd.get(key, 0) keeps the table buildable when a class present in the training split has
# no document in the test split; a plain tsd[key] lookup would raise KeyError there.
(pd.DataFrame([[key, trd[key], tsd.get(key, 0)] for key in trd],
              columns=['Target Label', 'Train Count', 'Test Count'])
 .sort_values(by=['Train Count', 'Test Count'], ascending=False))