In [1]:
import numpy as np
import pandas as pd
import re
from os.path import join
from tqdm import tqdm

In [2]:
# Location of the pre-trained GloVe text file. The pickle path is defined here
# but is not referenced anywhere else in the visible notebook.
file_embedding_txt = join('E://DM//NLP//WordVec', 'glove.840B.300d.txt')
file_embedding_pickle = join('E://DM//NLP//TMP_MEMORY', 'glove.840B.300d.pickle')
# Columns of train.csv that are selected into y_train.
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# fillna() returns a NEW Series and leaves the frame untouched, so its result
# has to be used. Previously the result was discarded, which left any missing
# comment as NaN (a float) and would make the later str-only steps
# (e.g. text.translate) fail on that row.
X_train = train["comment_text"].fillna("fillna").str.lower()
X_test = test["comment_text"].fillna("fillna").str.lower()
y_train = train[labels]

print('X_train: ' + str(X_train.shape))
print('y_train: ' + str(y_train.shape))
print('X_test: ' + str(X_test.shape))
print('y_train: ' + str(list(y_train.columns)))

X_train: (159571,)
y_train: (159571, 6)
X_test: (153164,)
y_train: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [3]:
def ReadWord2Vec(path=None, dim=300):
    """Read a text embedding file into a ``{token: vector}`` dictionary.

    Every line is split on single spaces. The LAST ``dim`` fields are parsed as
    the vector and everything in front of them is re-joined (with spaces) as the
    token, so a token that itself contains a space no longer breaks the float
    conversion. For ordinary "token v1 ... v300" lines the result is the same
    as taking the first field as the token.

    Args:
        path: file to read; defaults to the notebook-level ``file_embedding_txt``.
        dim: number of vector components expected per line (default 300).

    Returns:
        dict mapping each token (str) to a float32 numpy array of length ``dim``.

    Raises:
        ValueError: if a line has fewer than ``dim + 1`` fields or a vector
            field is not a number. The earlier ``assert len(values)>=300``
            let a line with only 299 numbers through.
    """
    if path is None:
        path = file_embedding_txt
    embeddings_index = {}
    with open(path, encoding='utf8') as f:
        for line_no, line in enumerate(tqdm(f), start=1):
            values = line.rstrip().split(' ')
            if len(values) < dim + 1:
                raise ValueError(
                    'line {} has {} fields, expected at least {}'.format(line_no, len(values), dim + 1)
                )
            # Token = everything before the trailing `dim` numbers.
            word = ' '.join(values[:-dim])
            coefs = np.asarray(values[-dim:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index
embeddings_index = ReadWord2Vec()

2196018it [04:51, 7529.72it/s]


In [4]:
# `merge` is the whole corpus: the training comments followed by the test
# comments, re-labelled 0..N-1 so that a label equals a position.
corpus_parts = [X_train, X_test]
merge = pd.concat(corpus_parts)
merge = merge.reset_index(drop=True)

# Quick look: truncated preview, container type, size, then the first five
# comments in full.
print(merge.head(5))
print(type(merge))
print(len(merge))
for row_label in range(5):
    print(merge.loc[row_label])

0    explanation\nwhy the edits made under my usern...
1    d'aww! he matches this background colour i'm s...
2    hey man, i'm really not trying to edit war. it...
3    "\nmore\ni can't make any real suggestions on ...
4    you, sir, are my hero. any chance you remember...
Name: comment_text, dtype: object
<class 'pandas.core.series.Series'>
312735
explanation
why the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27
d'aww! he matches this background colour i'm seemingly stuck with. thanks.  (talk) 21:51, january 11, 2016 (utc)
hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.
"
more
i can't make any real suggest

#### get rid of '\t \n \r'

In [5]:
# Characters to neutralise (newline, tab, carriage return) and an equally long
# run of spaces to map them onto, one-to-one.
intab = '\n\t\r'
outtab = ' ' * len(intab)
trans_dict = str.maketrans(intab, outtab)
def _blank_out_line_breaks(text):
    """Run one comment through the translation table built above."""
    return text.translate(trans_dict)

merge = merge.apply(_blank_out_line_breaks)
print(merge.head(5))

0    explanation why the edits made under my userna...
1    d'aww! he matches this background colour i'm s...
2    hey man, i'm really not trying to edit war. it...
3    " more i can't make any real suggestions on im...
4    you, sir, are my hero. any chance you remember...
Name: comment_text, dtype: object


#### Get rid of spam

In [6]:
# A long run of glued-together words taken from one comment; used to locate
# that comment so the effect of the spam filter can be inspected.
spam_example = 'dloljewsloldongsloldongsloldongsloldongsloldongsloldongsloldongsloldongsloldongsloldongsloljewsloldongsloldongsloldongsloldongsloldongsloldongsloldongs'
# Plain substring search (regex=False), one boolean per comment.
spam_mask = merge.str.contains(spam_example, regex=False)
comments_with_spam = merge.loc[spam_mask]
# Label of the first comment containing the example.
spam_example_id = comments_with_spam.index[0]
print('###########################')
print(merge.loc[spam_example_id])

###########################
"    == tip for trolls. ==    1: don't fuck with me.   2: you are gay and will always be gay.    yhbt. yhl.hand.:dloldongsloljewsloldongsyhbt. yhl.hand.:dloljewsyhbt. yhl.hand.:dlolfagsloljewswoploldongsyhbt. yhl.hand.:dloldongsloldongsloldongsloldongsyhbt. yhl.hand.:dloljewsloldongsloldongsloldongsloldongsloldongsloldongsloldongsloldongsloldongsloldongsloljewsloldongsloldongsloldongsloldongsloldongsloldongsloldongs   jesuitx. did.wtc.:d"


In [7]:
# A space-delimited token whose length reaches MAX_LENGTH_SPAM is treated as
# spam and blanked out.
MAX_LENGTH_SPAM = 30

def RemoveSPAM(text):
    """Replace every token of length >= MAX_LENGTH_SPAM by a single space.

    Tokens are the pieces obtained by splitting ``text`` on single spaces; the
    pieces are joined back with single spaces, so a removed token leaves extra
    spaces behind.
    """
    kept = []
    for word in text.split(' '):
        if len(word) < MAX_LENGTH_SPAM:
            kept.append(word)
        else:
            kept.append(' ')
    return ' '.join(kept)
# Filter the whole corpus, then re-print the comment located earlier to see
# what is left of it.
merge = merge.apply(RemoveSPAM)
print(merge.loc[spam_example_id])


"    == tip for trolls. ==    1: don't fuck with me.   2: you are gay and will always be gay.    yhbt.   yhl.hand.:dloljewsyhbt.         jesuitx. did.wtc.:d"


#### Text Normalization
* replace full-width characters with half-width characters. for example, convert 'ｄａｙ' to 'day'. convert '１２３' to '123'

```python
import unicodedata
foo = u'１２３４５６７８９０'
unicodedata.normalize('NFKC', foo)
```

In [8]:
import unicodedata

def Normalize(text):
    """Return ``text`` in Unicode NFKC form (e.g. full-width digits become ASCII digits)."""
    return unicodedata.normalize('NFKC', text)
# NFKC-normalise every comment and preview the result.
merge = merge.apply(Normalize)
print(merge.head(5))

0    explanation why the edits made under my userna...
1    d'aww! he matches this background colour i'm s...
2    hey man, i'm really not trying to edit war. it...
3    " more i can't make any real suggestions on im...
4    you, sir, are my hero. any chance you remember...
Name: comment_text, dtype: object


#### remove IP address

In [9]:
# Four dot-separated groups of one to three digits.
pattern_ip = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'

def RemoveIP(text):
    """Replace each match of ``pattern_ip`` in ``text`` by the token ' ip '."""
    return re.sub(pattern_ip, ' ip ', text)
# Substitute IP-looking strings in every comment and preview the result.
merge = merge.apply(RemoveIP)
print(merge.head(5))

0    explanation why the edits made under my userna...
1    d'aww! he matches this background colour i'm s...
2    hey man, i'm really not trying to edit war. it...
3    " more i can't make any real suggestions on im...
4    you, sir, are my hero. any chance you remember...
Name: comment_text, dtype: object


#### how to deal with hyperLink

* one reason hyperlinks should not be removed is that the words inside a link may carry semantic meaning
* another reason is that a regular expression covering all possible cases is too complicated, see [this link](https://stackoverflow.com/questions/161738/what-is-the-best-regular-expression-to-check-if-a-string-is-a-valid-url)

#### remove unwanted unicode alphabet

In [10]:
import regex

# Pass "a": delete any character that is neither Latin script nor in the
# Punctuation / Number / Separator categories.
_NOT_LATIN_SCRIPT = regex.compile(r'[^\p{Latin}\p{Punctuation}\p{Number}\p{Separator}]')
# Pass "b": delete any character that is outside the Basic Latin and Latin-1
# Supplement blocks and also not Punctuation / Number / Separator.
_NOT_LATIN1_BLOCKS = regex.compile(r'[^\p{InBasic_Latin}\p{InLatin-1_Supplement}\p{Punctuation}\p{Number}\p{Separator}]')

def RemoveNonLatin_a(text):
    """Drop characters rejected by pass "a" (script-based filter)."""
    return _NOT_LATIN_SCRIPT.sub(u'', text)

def RemoveNonLatin_b(text):
    """Drop characters rejected by pass "b" (block-based filter)."""
    return _NOT_LATIN1_BLOCKS.sub(u'', text)

def RemoveNonLatin(text):
    """Apply pass "b" first, then pass "a", to ``text``."""
    after_block_filter = RemoveNonLatin_b(text)
    return RemoveNonLatin_a(after_block_filter)

merge = merge.apply(RemoveNonLatin)
print(merge.head(5))

0    explanation why the edits made under my userna...
1    d'aww! he matches this background colour i'm s...
2    hey man, i'm really not trying to edit war. it...
3    " more i can't make any real suggestions on im...
4    you, sir, are my hero. any chance you remember...
Name: comment_text, dtype: object


##### this implementation also works well with hyper link urls
* todo: would it be meaningful to explicitly append a hyperlink token after each hyperlink, for example: 'www.google.com ###hyperlink###'?

#### get rid of apostrophe

In [11]:
# Contraction -> expansion table, keyed by the exact lower-case token.
# Fixes relative to the earlier table:
#   * "i'd" was listed twice ("i would", then "i had"); in a dict literal the
#     last entry silently wins. A single entry is kept, "i would", matching
#     every other "'d" entry in this table.
#   * "didn't" was listed twice with the same value; the duplicate is removed.
#   * "we'll" mapped to " will", which dropped the subject; it is now "we will".
APPO = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "i would",
"i'll" : "i will",
"i'm" : "i am",
"isn't" : "is not",
"it's" : "it is",
"it'll": "it will",
"i've" : "i have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
# Only applies when the whole token is exactly "'re" (lookup is per token).
"'re": " are",
"wasn't": "was not",
"we'll": "we will",
"tryin'": "trying"
}
def ConvertText(text):
    """Expand contractions in ``text`` using the APPO table.

    ``text`` is split on single spaces and each piece that is exactly a key of
    APPO is replaced by its expansion; every other piece is kept as is. A piece
    with punctuation attached (e.g. "don't,") is therefore left unchanged.

    Returns:
        The pieces joined back with single spaces.
    """
    return ' '.join(APPO.get(word, word) for word in text.split(' '))
# Expand contractions in every comment and preview the result.
merge = merge.apply(ConvertText)
print(merge.head(5))

0    explanation why the edits made under my userna...
1    d'aww! he matches this background colour i am ...
2    hey man, i am really not trying to edit war. i...
3    " more i cannot make any real suggestions on i...
4    you, sir, are my hero. any chance you remember...
Name: comment_text, dtype: object


#### get rid of punctuations

In [12]:
def RemovePunctuations(text):
    """Replace every character in the Unicode Punctuation category by a space."""
    return regex.sub(r'[\p{Punctuation}]', u' ', text)

merge = merge.apply(RemovePunctuations)
print(merge.head(5))

0    explanation why the edits made under my userna...
1    d aww  he matches this background colour i am ...
2    hey man  i am really not trying to edit war  i...
3      more i cannot make any real suggestions on i...
4    you  sir  are my hero  any chance you remember...
Name: comment_text, dtype: object


####  数据集的文本是不是全部在embeddings_index中？如果不全是，哪些文本不在embeddings_index中，它们长什么样？

In [13]:
def ComputeSetOfWords(pd_series):
    """Collect the distinct space-delimited tokens found across a Series of strings.

    Each element is split on single spaces (so the empty string can appear as a
    token when spaces are adjacent) and all pieces are gathered into one set.
    """
    vocabulary = set()
    for text in pd_series:
        vocabulary.update(text.split(' '))
    return vocabulary
# Tokens present in the corpus vs. tokens that have an embedding vector.
word_set = ComputeSetOfWords(merge)
vector_set = set(embeddings_index)
corpus_vocab_size = len(word_set)
embedding_vocab_size = len(vector_set)
print('number of words in the doc {}'.format(corpus_vocab_size))
print('number of words in the word2vec file {}'.format(embedding_vocab_size))

number of words in the doc 299082
number of words in the word2vec file 2196017


#### 一般认为word2vec中的词汇是比较全的，如果在训练数据集中的某个词没有出现在word2vec的词汇中，那么分析下这个词到底是什么会比较有意义

In [14]:
# Corpus tokens that have no embedding vector.
diff = word_set - vector_set
print(len(diff))
# Show at most the first 800 of them (set order, so the selection is arbitrary).
show = list(diff)[:800]
print(show)

153537


['', 'gunbirdriver', 'vandd', 'odetts', 'rabinal', 'reallavergne', 'yearssince', 'sozialisticheskuyu', 'resnik', 'rockdiedout', 'docuemnts', 'ameriturd', '5000korean', '١٩٩٣', 'registrirajo', 'pooopieee', 'politikospeak', 'nickct', 'poogwalah', 'reeived', 'upnot', 'obozrenie', 'notochords', 'northumberlandshire', 'disctracting', 'qiyamah', 'dekodiranje', 'doputaju', 'ostalocutanje', 'metalsucks', 'worlview', 'konotacje', 'starfire777', 'tueruwfyurf', 'unblockdude', 'cammatti', 'childlish', 'demmenie', 'nüfusu', 'samavedic', 'asgher', 'tål', 'combattive', 'borralo', 'fu9k', 'consideirng', 'channabasappa', 'palono', 'gardenian', 'itsme1234', 'litthe', 'fraunce', 'furme', 'duplom', 'zeitschrift', 'yeahhhhhhhhhhhh', 'fdzvdsf', 'permes', 'faiththat', 'kressenstein', 'staatsanwaltschaft', 'gramostea', 'musephil2006', 'nacionalnoj', 'algonquin7', 'butterpig', 'auckjg', 'choujuu', 'penpad', 'accordigly', 'nornalup', 'helkar', 'senatvs', 'discusseddirect', 'yahawah', 'noev', 'boulez', 'traugott

#### other text preprocessing
it would be better if the following transformation can be performed:
- 'pic005' ->  'pic 005'.
- 'seba1978' ->  'seba 1978'
- '23help' ->  '23 help'

In [15]:
# Disabled step: the code below is wrapped in a string literal, so none of it
# runs. It holds two regex substitutions that would insert a space between a
# run of Latin letters and an adjacent run of numbers (in either order).
# Because the string is the last expression of the cell, the notebook echoes
# it as the cell's output.
'''
SeperateWordNumber = lambda text: regex.sub(r'([\p{Latin}]+)([\p{Number}]+)', r'\g<1> \g<2>', text)
SeperateNumberWord = lambda text: regex.sub(r'([\p{Number}]+)([\p{Latin}]+)', r'\g<1> \g<2>', text)
merge = merge.apply(SeperateWordNumber)
merge = merge.apply(SeperateNumberWord)
print(merge[:5])
'''

"\nSeperateWordNumber = lambda text: regex.sub(r'([\\p{Latin}]+)([\\p{Number}]+)', r'\\g<1> \\g<2>', text)\nSeperateNumberWord = lambda text: regex.sub(r'([\\p{Number}]+)([\\p{Latin}]+)', r'\\g<1> \\g<2>', text)\nmerge = merge.apply(SeperateWordNumber)\nmerge = merge.apply(SeperateNumberWord)\nprint(merge[:5])\n"

#### get rid of stop words

In [16]:
from nltk.corpus import stopwords

# NLTK's English stop-word list, minus one-character entries, as a set for
# fast membership tests.
eng_stopwords = {word for word in stopwords.words("english") if len(word) > 1}

def RemoveStopWords(text):
    """Replace each space-delimited token that is in ``eng_stopwords`` by a single space."""
    pieces = []
    for word in text.split(' '):
        pieces.append(' ' if word in eng_stopwords else word)
    return ' '.join(pieces)

merge = merge.apply(RemoveStopWords)
print(merge.head(5))

0    explanation     edits made     username hardco...
1    d aww    matches   background colour i   seemi...
2    hey man  i   really   trying   edit war       ...
3        i cannot make   real suggestions   improve...
4       sir      hero    chance   remember   page  ...
Name: comment_text, dtype: object


#### get rid of  long words

In [17]:
# Longest token (in characters) that is still kept.
MAX_LENGH = 20

def RemoveLongWords(text):
    """Replace every space-delimited token longer than MAX_LENGH by a single space."""
    pieces = []
    for word in text.split(' '):
        too_long = len(word) > MAX_LENGH
        pieces.append(' ' if too_long else word)
    return ' '.join(pieces)
# Blank out over-long tokens in every comment and preview the result.
merge = merge.apply(RemoveLongWords)
print(merge.head(5))

0    explanation     edits made     username hardco...
1    d aww    matches   background colour i   seemi...
2    hey man  i   really   trying   edit war       ...
3        i cannot make   real suggestions   improve...
4       sir      hero    chance   remember   page  ...
Name: comment_text, dtype: object


#### get rid of numbers

In [18]:
def RemoveNumber(text):
    """Replace every space-delimited token for which ``str.isdigit()`` is true by a single space."""
    pieces = []
    for word in text.split(' '):
        pieces.append(' ' if word.isdigit() else word)
    return ' '.join(pieces)
# Blank out all-digit tokens in every comment and preview the result.
merge = merge.apply(RemoveNumber)
print(merge.head(5))

0    explanation     edits made     username hardco...
1    d aww    matches   background colour i   seemi...
2    hey man  i   really   trying   edit war       ...
3        i cannot make   real suggestions   improve...
4       sir      hero    chance   remember   page  ...
Name: comment_text, dtype: object


In [19]:
# Recompute the corpus vocabulary after the cleaning steps above and list the
# tokens that still have no embedding vector.
word_set = ComputeSetOfWords(merge)
diff = word_set - vector_set
print(len(diff))
# At most the first 800 entries, in set order.
show = list(diff)[:800]
print(show)

150691


['', 'gunbirdriver', 'vandd', 'odetts', 'rabinal', 'reallavergne', 'yearssince', 'sozialisticheskuyu', 'resnik', 'rockdiedout', 'docuemnts', 'ameriturd', '5000korean', 'registrirajo', 'pooopieee', 'politikospeak', 'nickct', 'poogwalah', 'reeived', 'upnot', 'obozrenie', 'notochords', 'northumberlandshire', 'disctracting', 'qiyamah', 'dekodiranje', 'doputaju', 'ostalocutanje', 'metalsucks', 'worlview', 'konotacje', 'starfire777', 'tueruwfyurf', 'unblockdude', 'cammatti', 'childlish', 'demmenie', 'nüfusu', 'samavedic', 'asgher', 'tål', 'combattive', 'borralo', 'fu9k', 'consideirng', 'channabasappa', 'palono', 'gardenian', 'itsme1234', 'litthe', 'fraunce', 'furme', 'duplom', 'zeitschrift', 'yeahhhhhhhhhhhh', 'fdzvdsf', 'permes', 'faiththat', 'kressenstein', 'staatsanwaltschaft', 'gramostea', 'musephil2006', 'nacionalnoj', 'algonquin7', 'butterpig', 'auckjg', 'choujuu', 'penpad', 'accordigly', 'nornalup', 'helkar', 'senatvs', 'discusseddirect', 'yahawah', 'noev', 'boulez', 'traugott', 'saty

#### combine whitespaces
strange chars were replaced with whitespaces in the previous steps. we need to combine multiple whitespaces into one whitespace. for example 'The   fox jumped   over    the log' -> 'The fox jumped over the log'

In [20]:
def RemoveMultipleSpace(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    tokens = text.split()
    return ' '.join(tokens)
# Collapse whitespace in every comment and preview the result.
merge = merge.apply(RemoveMultipleSpace)
print(merge.head(5))

0    explanation edits made username hardcore metal...
1    d aww matches background colour i seemingly st...
2    hey man i really trying edit war guy constantl...
3    i cannot make real suggestions improvement i w...
4                        sir hero chance remember page
Name: comment_text, dtype: object


#### 验证数据中的字符集

In [21]:
def ComputeSetOfAlphabets(pd_series):
    """Collect the distinct characters that occur anywhere in a Series.

    Each element is converted with ``str()`` first, then its characters are
    added to one shared set, which is returned.
    """
    characters = set()
    for item in pd_series:
        characters.update(str(item))
    return characters

def ShowUnicodeCodePoint(character):
    """Print a character three ways, one per line: as is, its ``repr``, and its code point."""
    print(character, repr(character), sep='\n')
    print(ord(character))
    
# All characters left in the cleaned corpus, ordered by code point; the last
# one is the character with the highest code point.
alphabet_set = ComputeSetOfAlphabets(merge)
alphabet = sorted(alphabet_set)
# print(alphabet)
highest_character = alphabet[-1]
ShowUnicodeCodePoint(highest_character)


〇
'〇'
12295


#### Save

In [22]:
# `merge` holds the train comments first and the test comments after them, so
# it is cut at len(train) to put the cleaned text back into copies of the frames.
n_train_rows = len(train)

train_clean = train.copy()
train_clean["comment_text"] = merge.iloc[:n_train_rows]

test_clean = test.copy()
# Labels are restarted at 0 so they line up with the test frame.
cleaned_test_text = merge.iloc[n_train_rows:].reset_index(drop=True)
test_clean["comment_text"] = cleaned_test_text

In [23]:
# Output directory and the two CSV files written into it.
root = 'E:\\DM\\NLP\\TMP_MEMORY'
path_train_clean = join(root, 'train_clean.csv')
path_test_clean = join(root, 'test_clean.csv')
# Same writer settings for both frames: UTF-8, header row, no index column.
for frame, destination in ((train_clean, path_train_clean), (test_clean, path_test_clean)):
    frame.to_csv(destination, encoding='utf-8', index=False, header=True)

#### Sum Up

In [24]:
# The preprocessing steps of this notebook, collected in execution order as two
# parallel lists: NAMES[i] is the label of the callable FUNCTIONS[i].
FUNCTIONS = []
NAMES = []

def _add_step(label, transform):
    """Register one step: append its label to NAMES and its callable to FUNCTIONS."""
    NAMES.append(label)
    FUNCTIONS.append(transform)

# lower case first
_add_step('lower case first', lambda text: text.lower())

# get rid of '\t \n \r'
intab = '\n\t\r'
outtab = ' ' * len(intab)
trans_dict = str.maketrans(intab, outtab)
_add_step('get rid of \\t \\n \\r', lambda text: text.translate(trans_dict))

# Get rid of spam: tokens of length >= MAX_LENGTH_SPAM become a single space
MAX_LENGTH_SPAM = 30
_add_step(
    'Get rid of spam',
    lambda text: ' '.join(word if len(word) < MAX_LENGTH_SPAM else ' ' for word in text.split(' ')),
)

# Text Normalization
import unicodedata
_add_step('Text Normalization', lambda text: unicodedata.normalize('NFKC', text))

# remove IP address
pattern_ip = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
_add_step('remove IP address', lambda text: re.sub(pattern_ip, ' ip ', text))

# remove unwanted unicode alphabet (block-based pass "b" first, then script-based pass "a")
import regex

def RemoveNonLatin_a(text):
    """Delete characters that are neither Latin script nor Punctuation/Number/Separator."""
    return regex.sub(r'[^\p{Latin}\p{Punctuation}\p{Number}\p{Separator}]', u'', text)

def RemoveNonLatin_b(text):
    """Delete characters outside Basic Latin / Latin-1 Supplement that are not Punctuation/Number/Separator."""
    return regex.sub(r'[^\p{InBasic_Latin}\p{InLatin-1_Supplement}\p{Punctuation}\p{Number}\p{Separator}]', u'', text)

_add_step('remove unwanted unicode alphabet', lambda text: RemoveNonLatin_a(RemoveNonLatin_b(text)))

# get rid of apostrophe
def ConvertText(text):
    """Swap each space-delimited token that is a key of APPO for its mapped value."""
    return ' '.join(APPO.get(word, word) for word in text.split(' '))

_add_step('get rid of apostrophe', ConvertText)

# get rid of punctuations
_add_step('get rid of punctuations', lambda text: regex.sub(r'[\p{Punctuation}]', u' ', text))

# get rid of stop words (one-character stop words are not removed)
from nltk.corpus import stopwords
eng_stopwords = {word for word in stopwords.words("english") if len(word) > 1}
_add_step(
    'get rid of stop words',
    lambda text: ' '.join(' ' if word in eng_stopwords else word for word in text.split(' ')),
)

# get rid of long words: tokens longer than MAX_LENGH become a single space
MAX_LENGH = 20
_add_step(
    'get rid of long words',
    lambda text: ' '.join(' ' if len(word) > MAX_LENGH else word for word in text.split(' ')),
)

# get rid of numbers
_add_step(
    'get rid of numbers',
    lambda text: ' '.join(' ' if word.isdigit() else word for word in text.split(' ')),
)

# combine whitespaces
_add_step('combine whitespaces', lambda text: ' '.join(text.split()))


In [25]:
def GetLength(text):
    """Return the number of pieces obtained by splitting ``text`` on single spaces."""
    return len(text.split(' '))

# Length of every cleaned comment, longest first. A new sorted Series is built
# instead of sorting in place, so re-running the cell gives the same result.
length = merge.apply(GetLength).sort_values(ascending=False)
# length_distribute = length.value_counts(sort=True)

# The 200 longest comments (positional slice of the sorted Series).
extream_examples = length.iloc[:200]

# The index labels of the sorted slice are the row ids of those comments.
# The earlier loop searched `length[length==example].index[0]` for each length
# VALUE, so all comments sharing a length resolved to the same id (duplicates
# in example_ids) and every lookup rescanned the whole Series.
example_ids = list(extream_examples.index)

# To inspect one of them, compare the raw and the cleaned text, e.g.:
#     print(merge[example_ids[0]])

In [28]:
##16
##15
# (The two markers above are leftover notes; their meaning is not evident from the notebook.)
# Rebuild the corpus as it was before cleaning (lower-cased only), pick one of
# the long examples by id and keep its first 150 characters for the trace below.
origin = pd.concat([X_train, X_test]).reset_index(drop=True)
picked_id = example_ids[195]
original_comment = origin.loc[picked_id]
text = original_comment[:150]
print(text)

pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit w

In [27]:
# Run `text` through the registered steps one after another, printing the
# step label and the intermediate result after each step.
print(text)
for name, func in zip(NAMES, FUNCTIONS):
    print('####################', name, sep='\n')
    text = func(text)
    print(text)

pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit 
####################
lower case first
pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit 
####################
get rid of \t \n \r
pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit 
####################
Get rid of spam
pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit 
####################
Text Normalization
pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit 
####################
remove IP address
pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit 
####################
remove unwanted unicode alphabet
pelican shit willy on wheels pelican shit willy on wheels pelican shit willy on wheels pelican shit 
####################
get rid of apostrophe
p