In [1]:
import re
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style("dark")

import warnings
warnings.filterwarnings("ignore")

from nltk.corpus import stopwords as sp
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup

In [2]:
train=pd.read_csv("New_Train(Cmnt_len 20-600).csv")
train.head(1)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131756 entries, 0 to 131755
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             131756 non-null  object
 1   comment_text   131756 non-null  object
 2   toxic          131756 non-null  int64 
 3   severe_toxic   131756 non-null  int64 
 4   obscene        131756 non-null  int64 
 5   threat         131756 non-null  int64 
 6   insult         131756 non-null  int64 
 7   identity_hate  131756 non-null  int64 
 8   clean          131756 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 9.0+ MB


## Feature Engineering 

In [4]:
def feature_engineering(df, sparse=0): 
    
    # Comment length
    df.insert(2, 'length', df.comment_text.apply(lambda x: len(x)))
    

    # Capitalization percentage
    def pct_caps(s):
        return sum([1 for c in s if c.isupper()]) / (sum(([1 for c in s if c.isalpha()])) + 1)    
    df.insert(3, 'caps', df.comment_text.apply(lambda x: pct_caps(x)))

    # Mean Word length 
    def word_length(s):
        s = s.split(' ')
        return np.mean([len(w) for w in s if w.isalpha()])
    df.insert(4, 'word_length', df.comment_text.apply(lambda x: word_length(x)))

    # Average number of exclamation points
    df.insert(5, 'exclamation', df.comment_text.apply(lambda s: len([c for c in s if c == '!'])))

    # Average number of question marks 
    df.insert(6, 'question', df.comment_text.apply(lambda s: len([c for c in s if c == '?'])))
    
    # Normalize
    for label in ['length', 'caps', 'word_length', 'question', 'exclamation']:
        minimum = df[label].min()
        diff = df[label].max() - minimum
        df[label] = df[label].apply(lambda x: (x-minimum) / (diff))

    # Strip IP Addresses
    ip = re.compile('(([2][5][0-5]\.)|([2][0-4][0-9]\.)|([0-1]?[0-9]?[0-9]\.)){3}'
                    +'(([2][5][0-5])|([2][0-4][0-9])|([0-1]?[0-9]?[0-9]))')
    def strip_ip(s, ip):
        try:
            found = ip.search(s)
            return s.replace(found.group(), ' ')
        except:
            return s

    df.comment_text = df.comment_text.apply(lambda x: strip_ip(x, ip))
    
    return df

In [5]:
# Features are added in the train and returned to train_features  

train_features = feature_engineering(train)
train_features.head(2)

Unnamed: 0,id,comment_text,length,caps,word_length,exclamation,question,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0.420415,0.083521,0.04375,0.0,0.004785,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.157439,0.108351,0.065625,0.002309,0.0,0,0,0,0,0,0,1


<h2>Text Cleaning</h2>

- Preprocessing:
    - LowerCase
    - Removing html tags 
    - Removing URL's
    - Removing Punctuations
    - Performing stemming
    - Removing Stopwords
    - Expanding contractions etc.

In [6]:
STOP_WORDS = sp.words("english")

def preprocess(x):
    x = str(x).lower()
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'")\
                        .replace("won't", "will not").replace("'s", " own")\
                        .replace("’", "'").replace("cannot", "can not")\
                        .replace("n't", " not").replace("'s", " is")\
                        .replace("'ve", " have").replace("i'm", "i am")\
                        .replace("'re", " are").replace("%", " percent ")\
                        .replace("₹", " rupee ").replace("$", " dollar ")\
                        .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)
    
    # Remove URL's
    x = re.sub(r"https?://\S+|www\.\S+", "", x)  
    
    # Remove special special characters, including symbols, emojis,and other graphic characters:
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    x=emoji_pattern.sub(r'', x)
    
    porter = PorterStemmer()
    pattern = re.compile('\W')
    
    if type(x) == type(''):
        x = re.sub(pattern, ' ', x)
    
    
    if type(x) == type(''):
        x = porter.stem(x)
        example1 = BeautifulSoup(x)
        x = example1.get_text()
                  
    return x

In [7]:
# Storing the Cleaned Comment Text in Processed_Comment

Processed_Comment = train["comment_text"].apply(lambda x:preprocess(x))

In [8]:
# Replacing comment_text with Cleaned_Comment

df = train.drop(["comment_text"],axis=1)
df.insert(1, "Cleaned_Comment", Processed_Comment)
df["Cleaned_Comment"].fillna("unknown",inplace=True)
df.head(2)

Unnamed: 0,id,Cleaned_Comment,length,caps,word_length,exclamation,question,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean
0,0000997932d777bf,explanation why the edits made under my userna...,0.420415,0.083521,0.04375,0.0,0.004785,0,0,0,0,0,0,1
1,000103f0d9cfb60f,d aww he matches this background colour i am ...,0.157439,0.108351,0.065625,0.002309,0.0,0,0,0,0,0,0,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131756 entries, 0 to 131755
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               131756 non-null  object 
 1   Cleaned_Comment  131756 non-null  object 
 2   length           131756 non-null  float64
 3   caps             131756 non-null  float64
 4   word_length      131291 non-null  float64
 5   exclamation      131756 non-null  float64
 6   question         131756 non-null  float64
 7   toxic            131756 non-null  int64  
 8   severe_toxic     131756 non-null  int64  
 9   obscene          131756 non-null  int64  
 10  threat           131756 non-null  int64  
 11  insult           131756 non-null  int64  
 12  identity_hate    131756 non-null  int64  
 13  clean            131756 non-null  int64  
dtypes: float64(5), int64(7), object(2)
memory usage: 14.1+ MB


In [10]:
# Creating a new dataset "Train_Feature-Engineering_Cleaned-Cmnt.csv"
# containing Cleaned_Comment as well as extra Features

df.to_csv("Train_Feature-Engineering_Cleaned-Cmnt.csv",index=False)