### Text Classfication using CNN

In [1]:
import numpy as np
import pandas as pd
import re
import os
import datetime
import string
import  nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Dropout,Input, Flatten, Embedding
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Model
from sklearn.metrics import roc_auc_score, f1_score
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau


Using TensorFlow backend.


#### Preprocessing

In [2]:
def process_email(text_list):
    # .* Zero or more characters of any type. 
    em = [] # for each doc
    preprocess_email = ""  # for each doc
    # temp=[]
    #extracting email
    emails_list = [email for email in re.findall(r'[\w\-\.]+@[\w\.-]+\b', text_list)]
    for email in emails_list:
        temp=[] #extracting email 
        str = ""
        str += email.split('@')[1]      #taking text after @
        temp = str.split('.')     #a list containing words split by "."
        if 'com' in temp:
            temp.remove('com')
            
        for word in temp:            #removing words less than 3
            if len(word)>2:
                em.append(word)
                
    for word in set(em):            #joining all the words in a string
        preprocess_email+=word
        preprocess_email+=' '
    #removing the email
    text_list =  re.sub(r'[\w\-\.]+@[\w\.-]+\b',' ', text_list)
    return preprocess_email, text_list

In [3]:
#extracting subject
def process_subject(text_list):
    temp1 = re.findall(r'^Subject.*$',text_list, re.MULTILINE)
    sub = temp1[0]
    sub = sub[7:]   #Truncate Subject
    for i in string.punctuation:   #remove all the non-alphanumeric
        sub = sub.replace(i, " ")
        sub = re.sub(r"re","", sub, flags=re.IGNORECASE) #removing Re
        sub = re.sub(r".*:","", sub, flags=re.IGNORECASE) #removing Re
        sub = re.sub(r"\s+"," ", sub, flags=re.IGNORECASE) #removing Re
        # sub = sub.lower()  #lower-casing
    listy = re.sub(r'Subject.*$'," ", text_list, flags=re.MULTILINE)
    listy = re.sub(r"Write to:.*$"," ", listy, flags=re.MULTILINE)           
    listy = re.sub(r"From:.*$"," ", listy, flags=re.MULTILINE)               
    listy = re.sub(r"or:", " ", listy,flags=re.MULTILINE)
    listy = re.sub(r"\s+"," ", listy, flags=re.IGNORECASE) #removing Re
    
    return sub, listy

In [4]:
def chunking(text_file):
    chunks=[]
    chunks=(list(ne_chunk(pos_tag(word_tokenize(text_file)))))
   
    for i in chunks:
        if type(i)==nltk.tree.Tree:
            if i.label() == "GPE":
                j = i.leaves()
                if len(j)>1:   #if new_delhi or bigger name
                    gpe = "_".join([term for term, pos in j])
                    text_file = re.sub(rf'{j[1][0]}',gpe, text_file, flags=re.MULTILINE)              #replacing delhi with new_delhi
                    text_file = re.sub(rf'\b{j[0][0]}\b',"",text_file, flags=re.MULTILINE)       #deleting new, \b is important
            if i.label()=="PERSON":           # deleting Ramesh         
                for term, pog in i.leaves():
                    text_file = re.sub(re.escape(term),"",text_file, flags=re.MULTILINE)
    return text_file

In [5]:
def process_text(text_list):
 
    # delete brackets
    text_list = re.sub(r"\(.*?\)","",text_list, flags=re.MULTILINE)            
    text_list = re.sub(r"<.*?>","", text_list, flags=re.MULTILINE) 
    # remove "\n", "\t", "-", "\"
    text_list = re.sub(r"\w+:","", text_list, flags=re.MULTILINE) #removing Ree
    text_list = re.sub(r"[\n\t\-\\\/]"," ", text_list, flags=re.MULTILINE)
    text_list = re.sub(r'[~^0-9$]',"", text_list, flags=re.MULTILINE)
    # decontration (from donor choose assignment)
    # specific
    text_list = re.sub(r"won't", "will not", text_list)
    text_list = re.sub(r"can\'t", "can not", text_list)
    # general
    text_list = re.sub(r"n\'t", " not", text_list)
    text_list = re.sub(r"\'re", " are", text_list)
    text_list = re.sub(r"\'s", " is", text_list)
    text_list = re.sub(r"\'d", " would", text_list)
    text_list = re.sub(r"\'ll", " will", text_list)
    text_list = re.sub(r"\'t", " not", text_list)
    text_list = re.sub(r"\'ve", " have", text_list)
    text_list = re.sub(r"\'m", " am", text_list)
    
    text_list = re.sub(r"\b[a-zA-Z]{1,2}_([a-zA-Z]+)",r"\1",text_list) #d_berlin to berlin

    text_list = re.sub(r"\b_([a-zA-z]+)_\b",r"\1",text_list) #replace _word_ to word
    text_list = re.sub(r"\b_([a-zA-z]+)\b",r"\1",text_list) #replace_word to word
    text_list = re.sub(r"\b([a-zA-z]+)_\b",r"\1",text_list) #replace word_ to word

    text_list =  chunking(text_list)
    
    text_list = re.sub(r'\b\w{1,2}\b'," ",text_list) #remove words <2
    text_list = re.sub(r"\b\w{15,}\b"," ",text_list) #remove words >15
    text_list = re.sub(r"[^a-zA-Z_]"," ",text_list)  #keep only alphabets and _                                       
    text_list = re.sub(r" {2,}", " ", text_list, flags=re.MULTILINE) # REMOVE THE EXTRA SPACES

    text_list = text_list.lower()
    return text_list


In [6]:
def preprocess(Input_Text):
  processed_text = []
  processed_subject = []
  processed_email = []
  for i in range(len(Input_Text)):
    p_email, Input_Text[i] = process_email(Input_Text[i])
    p_subject, Input_Text[i] = process_subject(Input_Text[i])
    p_Text = process_text(Input_Text[i])
    processed_email.append(p_email)
    processed_subject.append(p_subject)
    processed_text.append(p_Text)

  data['processed_email'] = processed_email
  data['processed_subject'] = processed_subject
  data['processed_text'] = processed_text



In [7]:
allLines = []
labels = []
filenum = []
path = './documents/documents/'
text_from_files = []
fileList = os.listdir(path)
for filename in fileList:
    text_in_each_files= []
    clas, num = filename.split('_')
    labels.append(clas)
    filenum.append(num.split('.')[0])
    temp_file = open(os.path.join('./documents/documents/'+ filename), 'r+',
     encoding="utf8", errors='ignore')
    text_from_files.append(temp_file.read())

list_tuples = list(zip(labels, filenum))      
data = pd.DataFrame(list_tuples, columns=['Label', 'File_num'])
data.head()
 
num_classes = len(set(list(labels)))  
num_classes

20

In [8]:
preprocess(text_from_files)
data.head()

Unnamed: 0,Label,File_num,processed_email,processed_subject,processed_text
0,alt.atheism,49960,netcom mantis,Atheist sources,archive atheism resources alt atheism archive...
1,alt.atheism,51060,mantis,Introduction to Atheism,rchive atheism introduction atheism archive i...
2,alt.atheism,51119,edu mimsy umd dbstu1 tu-bs,Gospel Dating,article well has quite different not necessar...
3,alt.atheism,51120,unh edu kepler mantis,university violating separation of church state,recently ras have been ordered post religious...
4,alt.atheism,51121,Ibm org harder ccr-p Com Watson ibm watson ida,soc motss et al Princeton axes matching funds...,article however hate economic terrorism and p...


In [9]:
en = LabelEncoder()
df = pd.DataFrame(data.iloc[:, 2:5], en.fit_transform(data.iloc[:,0]), columns=['processed_email', 'processed_subject', 'processed_text', 'label']).stack().reset_index()
df.rename(columns = {'level_0':'label', 'level_1':'column_name_stacked', 0:'column_data_stacked'}, inplace = True)
df.head()

Unnamed: 0,label,column_name_stacked,column_data_stacked
0,0,processed_email,netcom mantis
1,0,processed_subject,Atheist sources
2,0,processed_text,archive atheism resources alt atheism archive...
3,0,processed_email,netcom mantis
4,0,processed_subject,Atheist sources


In [12]:
df.to_csv('CNN_DOC_CLASSIFICATION_stacked_data.csv')


#### REFERENCE
https://towardsdatascience.com/how-i-preprocessed-text-data-using-regular-expressions-for-my-text-classification-task-cnn-cb206e7274ed

https://stackoverflow.com/questions/43151775/how-to-have-parallel-convolutional-layers-in-keras/

http://ai.intelligentonlinetools.com/ml/document-classification-using-convolutional-neural-network/

https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

https://stackoverflow.com/questions/71357014/running-a-fine-tune-model-for-my-cnn-value-error