In [2]:
# Import packages
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import os
import re
from collections import Counter
import random
import pickle
import string

# Fetch the NLTK resources used by the preprocessing cells. The last call is
# the cell's final expression, so its return value is what the cell displays.
nltk.download('stopwords')  # stop-word lists, read later via stopwords.words('english')
nltk.download('punkt')  # presumably the tokenizer data behind word_tokenize — confirm
nltk.download('wordnet')  # presumably the lexical data behind WordNetLemmatizer — confirm
nltk.download('averaged_perceptron_tagger')  # presumably the tagger model behind pos_tag — confirm


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Mount Google Drive at /content/drive (this import only exists on Colab).
# NOTE(review): the data-loading cell reads from "../input/" rather than from
# /content/drive — confirm whether this mount is still required.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Read Data 

In [3]:
data_path = "../input/"
train_csv = f"{data_path}train.csv"
df = pd.read_csv(train_csv)
# Per column: True if at least one value is missing
df.isna().any()

id               False
comment_text     False
toxic            False
severe_toxic     False
obscene          False
threat           False
insult           False
identity_hate    False
dtype: bool

In [None]:
# Missing-data check: True marks a column containing at least one null
df.isna().any()

id               False
comment_text     False
toxic            False
severe_toxic     False
obscene          False
threat           False
insult           False
identity_hate    False
dtype: bool

## Feature Engineering

In [None]:
# reference: https://towardsdatascience.com/how-i-improved-my-text-classification-model-with-feature-engineering-98fbe6c13ef3

def feature(df):
    """Add hand-crafted text statistics derived from ``df['comment_text']``.

    The columns are written into ``df`` itself and the same frame is returned,
    so both ``feature(df)`` and ``df = feature(df)`` work.

    Columns added:
        word_count, char_count (characters excluding spaces), word_density,
        total_length, capitals, capitals_prop, num_exclamation_marks,
        num_question_marks, num_punctuation (any of ``.,;:``),
        num_symbols (any of ``*&$%``), num_unique_words, prop_unique_words.

    Args:
        df: DataFrame with a ``comment_text`` column of strings.

    Returns:
        The same DataFrame, with the columns listed above added.

    Notes:
        ``capitals_prop`` and ``prop_unique_words`` are ratios whose
        denominator is zero for empty (or whitespace-only) comments. Those
        rows get ``0.0`` instead of raising ``ZeroDivisionError`` or
        producing NaN.
    """
    comments = df['comment_text']

    df['word_count'] = comments.apply(lambda x: len(x.split()))
    df['char_count'] = comments.apply(lambda x: len(x.replace(" ", "")))
    # +1 keeps the denominator non-zero for comments with no non-space characters
    df['word_density'] = df['word_count'] / (df['char_count'] + 1)
    df['total_length'] = comments.apply(len)
    df['capitals'] = comments.apply(lambda comment: sum(1 for c in comment if c.isupper()))

    # Mask zero lengths to NaN before dividing, then map those rows to 0.0
    nonzero_length = df['total_length'].where(df['total_length'] > 0)
    df['capitals_prop'] = (df['capitals'] / nonzero_length).fillna(0.0)

    df['num_exclamation_marks'] = comments.apply(lambda x: x.count('!'))
    df['num_question_marks'] = comments.apply(lambda x: x.count('?'))
    df['num_punctuation'] = comments.apply(lambda x: sum(x.count(w) for w in '.,;:'))
    df['num_symbols'] = comments.apply(lambda x: sum(x.count(w) for w in '*&$%'))
    df['num_unique_words'] = comments.apply(lambda x: len(set(x.split())))

    # Same zero-denominator guard for comments that contain no words
    nonzero_words = df['word_count'].where(df['word_count'] > 0)
    df['prop_unique_words'] = (df['num_unique_words'] / nonzero_words).fillna(0.0)
    return df

In [None]:
# Add the engineered feature columns; feature() writes them into df and
# returns the same frame.
df = feature(df)

## Text Preprocessing

In [6]:
# Sample pre-tokenized tweet used to try out the text-processing helpers
tweet_tokens = [
    '#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris',
    'for', 'being', 'top', 'engaged', 'members',
    'in', 'my', 'community', 'this', 'week', ':)',
]

In [7]:
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_sentence(tokens):
    """Lemmatize a list of tokens, choosing the WordNet POS from each token's tag.

    Each token is tagged with ``pos_tag``. Tags starting with ``'NN'`` are
    lemmatized as nouns (``'n'``), tags starting with ``'VB'`` as verbs
    (``'v'``), and every other tag as adjectives (``'a'``).

    Args:
        tokens: sequence of token strings.

    Returns:
        A list of lemmatized tokens, in the same order as the input.
    """
    def wordnet_pos(tag):
        # Map a tagger tag onto the POS code passed to the lemmatizer
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, wordnet_pos(tag)) for word, tag in pos_tag(tokens)]

# Try the POS-aware lemmatizer on the sample tweet tokens
print(lemmatize_sentence(tweet_tokens))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [14]:
" ".join(tweet_tokens)

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [17]:
# NOTE(review): process_text, lemmatizer and stop_words are defined in a cell
# further down the notebook; on a fresh kernel that cell must run first.
print(process_text(" ".join(tweet_tokens), lemmatizer, stop_words))

followfriday franceinte milipolparis top engaged member community week


In [4]:
# Sample sentence for checking the cleaning pipeline.
# NOTE(review): process_text, lemmatizer and stop_words are defined in a cell
# further down the notebook; on a fresh kernel that cell must run first.
t = "You are gay or antisemmitian?"
print(process_text(t, lemmatizer, stop_words))

gay antisemmitian


In [5]:
# process_text already returns a str (it ends with ' '.join(...)), so str()
# here only affects how the cell displays the value.
# NOTE(review): depends on `t` from the previous cell and on process_text,
# which is defined in a cell further down the notebook.
str(process_text(t, lemmatizer, stop_words))

'gay antisemmitian'

In [3]:
# Shared preprocessing resources; both are passed explicitly to process_text()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # general English stop words, held as a set

def process_text(text, lemmatizer, stop_words):
    '''
    Clean one piece of text: tokenize it, lower-case each token, strip punctuation characters,
    keep only purely alphabetic tokens, drop stop words, lemmatize what is left, and join the
    result back into a single space-separated string.

    Args:
    ------
        text (string): the text data to be processed
        lemmatizer: object exposing ``lemmatize(word)``; applied to every kept token
        stop_words: collection of words to discard; each lower-cased, punctuation-stripped
            token is tested against it with ``in``

    Returns:
    --------
        Returns processed text (string)
    '''
    strip_punctuation = str.maketrans('', '', string.punctuation)
    kept = []
    for token in word_tokenize(text):
        cleaned = token.lower().translate(strip_punctuation)
        # isalpha() is False for '' and for tokens that still contain digits or symbols
        if cleaned.isalpha() and cleaned not in stop_words:
            kept.append(lemmatizer.lemmatize(cleaned))
    return ' '.join(kept)

# Build the cleaned-text column. The frame previews and the sentiment cells
# further down read df['processed_text'], so this step must actually run on a
# fresh kernel. process_text takes the lemmatizer and stop-word set explicitly.
df['processed_text'] = df['comment_text'].apply(lambda x: process_text(x, lemmatizer, stop_words))

In [None]:
# Preview the first ten rows
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,word_count,char_count,word_density,total_length,capitals,capitals_prop,num_exclamation_marks,num_question_marks,num_punctuation,num_symbols,num_unique_words,prop_unique_words,processed_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,43,223,0.191964,264,17,0.064394,0,1,6,0,41,0.953488,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,17,95,0.177083,112,8,0.071429,1,0,5,0,17,1.0,daww match background colour seemingly stuck t...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,42,192,0.217617,233,4,0.017167,0,0,4,0,39,0.928571,hey man really trying edit war guy constantly ...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,113,511,0.220703,622,11,0.017685,0,0,6,0,82,0.725664,ca nt make real suggestion improvement wondere...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,13,55,0.232143,67,2,0.029851,0,1,3,0,13,1.0,sir hero chance remember page
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0,13,54,0.236364,65,1,0.015385,0,0,2,0,12,0.923077,congratulation well use tool well talk
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,8,37,0.210526,44,37,0.840909,0,0,0,0,8,1.0,cocksucker piss around work
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,20,95,0.208333,115,4,0.034783,0,0,3,0,20,1.0,vandalism matt shirvington article reverted pl...
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,83,390,0.212276,472,7,0.014831,0,1,9,0,70,0.843373,sorry word nonsense offensive anyway intending...
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,12,59,0.2,70,2,0.028571,0,0,0,0,12,1.0,alignment subject contrary dulithgow


In [None]:
# write to file
# Use the "cleaned_train.csv" name that the next section reads back; the
# previous name had no extension, so the reload could not find this file.
# The row index is written as well (to_csv default) and comes back as the
# 'Unnamed: 0' column that is dropped after reloading.
# NOTE(review): `dir` is not assigned in any visible cell and shadows the
# builtin — confirm it holds the output directory prefix.
df.to_csv(dir+"cleaned_train.csv")


## Sentiment, Polarity, Subjectivity

In [None]:
# Load the cleaned training data from disk and preview it.
# NOTE(review): `dir` is not assigned in any visible cell — confirm where it is set.
df = pd.read_csv(dir + "cleaned_train.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,word_count,char_count,word_density,total_length,capitals,capitals_prop,num_exclamation_marks,num_question_marks,num_punctuation,num_symbols,num_unique_words,prop_unique_words,processed_text
0,0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,43,223,0.191964,264,17,0.064394,0,1,6,0,41,0.953488,explanation edits made username hardcore metal...
1,1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,17,95,0.177083,112,8,0.071429,1,0,5,0,17,1.0,daww match background colour seemingly stuck t...
2,2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,42,192,0.217617,233,4,0.017167,0,0,4,0,39,0.928571,hey man really trying edit war guy constantly ...
3,3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,113,511,0.220703,622,11,0.017685,0,0,6,0,82,0.725664,ca nt make real suggestion improvement wondere...
4,4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,13,55,0.232143,67,2,0.029851,0,1,3,0,13,1.0,sir hero chance remember page


In [None]:
# Keep only rows that have processed text, remove the stray index column that
# an earlier to_csv wrote, and renumber the rows.
# - drop(columns=...) replaces the positional axis argument (`drop(label, 1)`),
#   which recent pandas versions no longer accept.
# - errors='ignore' lets this cell re-run against a file saved with
#   index=False, which has no 'Unnamed: 0' column.
# - Chaining avoids an in-place drop on a filtered slice of the frame.
df = (
    df[df['processed_text'].notna()]
    .drop(columns='Unnamed: 0', errors='ignore')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,word_count,char_count,word_density,total_length,capitals,capitals_prop,num_exclamation_marks,num_question_marks,num_punctuation,num_symbols,num_unique_words,prop_unique_words,processed_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,43,223,0.191964,264,17,0.064394,0,1,6,0,41,0.953488,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,17,95,0.177083,112,8,0.071429,1,0,5,0,17,1.0,daww match background colour seemingly stuck t...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,42,192,0.217617,233,4,0.017167,0,0,4,0,39,0.928571,hey man really trying edit war guy constantly ...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,113,511,0.220703,622,11,0.017685,0,0,6,0,82,0.725664,ca nt make real suggestion improvement wondere...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,13,55,0.232143,67,2,0.029851,0,1,3,0,13,1.0,sir hero chance remember page


In [None]:
# polarity
from textblob import TextBlob
# Coerce every entry to str, then score each cleaned comment with TextBlob
df['processed_text'] = df['processed_text'].apply(str)
df['textblob'] = df['processed_text'].apply(lambda cleaned: TextBlob(cleaned).sentiment)

In [None]:
# Split the stored sentiment result into its first and second elements
sentiment = df['textblob']
df['polarity'] = sentiment.apply(lambda scores: scores[0])
df['subjectivity'] = sentiment.apply(lambda scores: scores[1])

In [None]:
# Remove the intermediate sentiment column now that it has been split out.
# drop(columns=...) replaces the positional axis argument (`drop(label, 1)`),
# which recent pandas versions no longer accept.
df = df.drop(columns='textblob')

In [None]:
# Save the frame to cleaned_train.csv; index=False keeps the row index out of
# the file.
# NOTE(review): `dir` is not assigned in any visible cell — confirm where it is set.
df.to_csv(dir+"cleaned_train.csv", index=False)