# Data Cleaning
This file contains all the functions required to clean text data.

In [1]:
import pandas as pd
import numpy as np
import re
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype
import copy
import spacy

In [2]:
# Load the spaCy 'en_core_web_sm' model into the module-level `nlp` object.
# NOTE(review): no function visible in this file references `nlp` - confirm it is still needed.
nlp=spacy.load('en_core_web_sm')
# import spacy 
# spacy.load('en_core_web_sm')

In [3]:
def removeWhiteSpace(data):
    """Strip leading/trailing whitespace from column labels and string cells.

    The frame is modified in place and the same object is returned.

    Only string values are touched: non-string column labels and non-string
    or missing cells are left exactly as they are.

    Args:
        data: pandas DataFrame to clean.

    Returns:
        The DataFrame that was passed in, after stripping.
    """
    # Index.str.strip() raises when no label is a string and turns the
    # non-string labels of a mixed Index into NaN, so strip label by label.
    if any(isinstance(label, str) for label in data.columns):
        data.columns = data.columns.map(
            lambda label: label.strip() if isinstance(label, str) else label
        )

    for col in data.columns:
        column = data[col]
        if not isinstance(column, pd.Series):
            # Duplicate labels select a DataFrame; such columns are skipped.
            continue
        if column.dtype == object:
            # Element-wise strip: Series.str.strip() would replace non-string
            # cells with NaN, and is_string_dtype() can reject an object
            # column just because it contains missing values.
            data[col] = column.map(
                lambda value: value.strip() if isinstance(value, str) else value
            )
        elif is_string_dtype(column):
            data[col] = column.str.strip()
    return data
    

In [17]:
# Patterns used by preProcessColumn, compiled once. Raw strings are used so
# that "\w", "\d" and "\s" are not parsed as (invalid) string escapes.
_EXTRA_SPACES_RE = re.compile(r' +')
_MAIL_ID_RE = re.compile(r'[\w\.-]+@[\w\.-]+')
_HTTP_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_WWW_URL_RE = re.compile(r'\s*(www\.[^:\/\n]+\.com)\s*')
_MARKUP_CHARS_RE = re.compile(r'[<^>}*>]')
_DIGIT_RE = re.compile(r'\d')
# Everything except letters, digits and the two quote characters.
_NON_TEXT_RE = re.compile(r"""[^a-zA-Z\d"']+""")


def preProcessColumn(data,fillna=True,removeNumbers=True,
                     removeSpecialCharacters=True,handleCamelCase=False,
                    removeMailids=True,removeUrls=True):
    """Clean one text column and return it as a Series of strings.

    Steps, in the order they are applied:
      1. optionally replace missing values with "No text", then cast to str;
      2. optionally split camel-case words via camelCaseSplit;
      3. collapse runs of spaces into a single space;
      4. optionally replace mail ids with a space;
      5. optionally replace http(s) URLs, "www...com" hosts and the
         characters < ^ > } * with a space;
      6. replace underscores with spaces;
      7. optionally delete digits;
      8. optionally replace every run of characters other than letters,
         digits and quotes with one space, then delete apostrophes.

    Space collapsing (step 3) runs before the substitutions, so consecutive
    spaces introduced by steps 4-6 remain unless step 8 is enabled.

    Args:
        data: pandas Series holding the raw column.
        fillna: replace missing values with "No text" before casting.
        removeNumbers: delete digit characters.
        removeSpecialCharacters: apply step 8.
        handleCamelCase: apply step 2.
        removeMailids: apply step 4.
        removeUrls: apply step 5.

    Returns:
        A new Series with the cleaned text.
    """
    if fillna:
        data = data.fillna("No text")
    data = data.astype(str)

    # splitting data based on camel case
    if handleCamelCase:
        data = data.map(camelCaseSplit)

    # remove extra spaces
    data = data.map(lambda text: _EXTRA_SPACES_RE.sub(' ', text))

    if removeMailids:
        data = data.map(lambda text: _MAIL_ID_RE.sub(' ', text))
    if removeUrls:
        data = data.map(lambda text: _HTTP_URL_RE.sub(' ', text))
        data = data.map(lambda text: _WWW_URL_RE.sub(' ', text))
        data = data.map(lambda text: _MARKUP_CHARS_RE.sub(' ', text))

    # replace _ with " "
    data = data.map(lambda text: text.replace('_', ' '))

    # remove numbers
    if removeNumbers:
        data = data.map(lambda text: _DIGIT_RE.sub('', text))

    # remove special chars
    if removeSpecialCharacters:
        data = data.map(lambda text: _NON_TEXT_RE.sub(' ', text))
        # drop apostrophes (double quotes are kept)
        data = data.map(lambda text: text.replace("'", ''))

    return data
        

In [5]:
import pandas as pd

In [6]:
# Zero-width boundaries inside a camel-case word: lower->Upper ("camel|Case")
# and the end of an acronym followed by a capitalised word ("HTTP|Response").
_CAMEL_BOUNDARY_RE = re.compile(r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])')


def camelCaseSplit(identifier):
    """Insert a space at every camel-case boundary of `identifier`.

    The previous implementation could not run: its pattern had unbalanced
    parentheses and it called the non-existent `Match.grop`. Substituting a
    space at each boundary gives the result that splitting and re-joining
    was meant to produce, and leaves all other characters (including
    newlines) untouched.

    Args:
        identifier: the string to split.

    Returns:
        The string with a single space inserted at each boundary, e.g.
        "camelCaseString" -> "camel Case String".
    """
    return _CAMEL_BOUNDARY_RE.sub(' ', identifier)

In [7]:
def preProcessData(data,textColumns,customStopWords,fillna=True,
                   removeNumbers=True,removeSpecialCharacters=True,
                   handleCamelCase=False,removeStopwords=True,
                  removeMailids=True,removeUrls=True):
    """Clean every column of `data` whose name appears in `textColumns`.

    The frame first goes through removeWhiteSpace, so column names are
    compared against `textColumns` after stripping. Each matching column is
    replaced by the output of preProcessColumn and, when `removeStopwords`
    is set, by the output of removeStopwordsFunc applied to that result.

    Args:
        data: DataFrame to clean.
        textColumns: collection of column names holding free text.
        customStopWords: extra stop words handed to removeStopwordsFunc.
        fillna, removeNumbers, removeSpecialCharacters, handleCamelCase,
        removeMailids, removeUrls: forwarded unchanged to preProcessColumn.
        removeStopwords: whether to run removeStopwordsFunc afterwards.

    Returns:
        The frame returned by removeWhiteSpace, with the text columns cleaned.
    """
    data = removeWhiteSpace(data)

    # Per-column options, forwarded as-is to preProcessColumn.
    columnOptions = {
        'fillna': fillna,
        'removeNumbers': removeNumbers,
        'removeSpecialCharacters': removeSpecialCharacters,
        'handleCamelCase': handleCamelCase,
        'removeMailids': removeMailids,
        'removeUrls': removeUrls,
    }

    for name in data.columns:
        if name not in textColumns:
            continue
        data[name] = preProcessColumn(data[name], **columnOptions)
        if not removeStopwords:
            continue
        data[name] = removeStopwordsFunc(data[name], customStopWords=customStopWords)

    return data

In [8]:
## Stop word removal
import nltk
from nltk.corpus import stopwords

In [9]:
from nltk.tokenize import word_tokenize

In [10]:
# Fetch the NLTK 'punkt' package; the recorded output shows it was already up to date.
# NOTE(review): presumably needed by word_tokenize in removeStopwordsFunc - confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/dheekshitha-
[nltk_data]     vibha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:

def removeStopwordsFunc(data,customStopWords=("a",)):
    """Remove stop words from every string of a text column.

    Each value is tokenised with word_tokenize; tokens whose lower-cased
    form is an NLTK English stop word or one of `customStopWords` are
    dropped, and the remaining tokens are joined with single spaces.

    Args:
        data: pandas Series of strings.
        customStopWords: extra words to remove. May be an iterable of words,
            a single word given as a string, or None for no extra words.
            Matching is case-insensitive.

    Returns:
        A new Series with the filtered text.
    """
    stop_words = set(stopwords.words('english'))

    if customStopWords is not None:
        if isinstance(customStopWords, str):
            # A bare string would otherwise be added character by character.
            customStopWords = [customStopWords]
        # Tokens are lower-cased before the lookup below, so the custom words
        # must be lower-cased too or mixed-case entries could never match.
        stop_words.update(
            word.lower() if isinstance(word, str) else word
            for word in customStopWords
        )

    def dropStopWords(text):
        return " ".join(
            token for token in word_tokenize(text)
            if token.lower() not in stop_words
        )

    return data.map(dropStopWords)

In [12]:
# Fetch the NLTK 'wordnet' package; the recorded output shows it was already up to date.
# NOTE(review): presumably needed by the WordNetLemmatizer used in lemmatize_text - confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/dheekshitha-
[nltk_data]     vibha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
#lemmatize the data
from nltk.stem import WordNetLemmatizer 
# Module-level helpers used by lemmatize_text: a whitespace tokenizer and a
# WordNet lemmatizer, created once here rather than on every call.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer() 
def lemmatize_text(text):
    """Return the lemmatized whitespace-separated tokens of `text` as a list.

    The text is split with the module-level `w_tokenizer` and every token is
    passed to `lemmatizer.lemmatize` with its default arguments.
    """
    tokens = w_tokenizer.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))

In [14]:
# Quick check of lemmatize_text; the recorded output for this call is ['production'].
lemmatize_text("productions")

['production']

In [15]:
# import re

In [16]:
# x="this @bsi stuoud@gmaf, https://www.ccs-labs.org/teaching/rn/animations/propagation/"
# re.sub('[\w\.-]+@[\w\.-]+','',x)
# re.sub('[<^>}*>]',"",x)
# re.sub('[^a-zA-Z.]'," ",x)