# Springboard--DSC Program

# Capstone Project 1 - Data Wrangling 
### by Ellen A. Savoye

The data was collected from the Kaggle competition 'Jigsaw Unintended Bias in Toxicity Classification' via the Kaggle API. Of the 7 files in the zipped data, we will focus on the 'train' data. The original 'train' data comprises 45 columns containing toxicity and identity labels, comments, and metadata. 

### Import packages and data

In [28]:
# !pip install kaggle
# !pip install spacy
# !pip install spacymoji
# !pip install emot 

In [5]:
import pandas as pd
import numpy as np

# libraries for NLP
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata
import string
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

# libraries for getting and moving data
import os
from os import path
import shutil
from zipfile import ZipFile

from kaggle.api.kaggle_api_extended import KaggleApi

In [6]:
# Shared resources for text pre-processing.

# The legacy `parse` / `tag` / `entity` keyword arguments are not part of the
# current `spacy.load` signature (recent spaCy releases reject unknown keyword
# arguments), so the model is loaded with its default pipeline.
nlp = spacy.load('en_core_web_sm')
tokenizer = ToktokTokenizer()

# Keep the negations 'no' and 'not' out of the stopword list so they remain
# in the text. Filtering (instead of list.remove) does not raise if a word is
# absent from the NLTK list.
negation_words = {'no', 'not'}
stopword_list = [
    word
    for word in nltk.corpus.stopwords.words('english')
    if word not in negation_words
]

In [7]:
# Set directories
# src: folder the Kaggle archive is expected to land in after the download
#      (presumably the notebook's working directory - confirm).
# dst: folder that holds the project data.
src = "C:\\Users\\ellen\\Documents\\GitHub\\Data_Science_Career_Track\\Capstone_1\\Code\\"
dst = "C:\\Users\\ellen\\Documents\\GitHub\\Data_Science_Career_Track\\Capstone_1\\Data\\"

kaggle_comp_name = 'jigsaw-unintended-bias-in-toxicity-classification'
zipfile_name = kaggle_comp_name + '.zip'

# Files already present in the Data folder
csv_file = [i for i in os.listdir(dst) if i.startswith("train") and path.isfile(path.join(dst, i))]
zip_file = [i for i in os.listdir(dst) if i.startswith("jigsaw") and path.isfile(path.join(dst, i))]

# Download only when the competition archive is missing. A membership test
# (rather than indexing zip_file[0]) also works when the folder holds no
# jigsaw file at all, which is exactly the case that needs a download.
if zipfile_name in zip_file:
    print('Data is already downloaded')
else:
    # Import data from Kaggle API
    api = KaggleApi()
    api.authenticate()
    api.competition_download_files(kaggle_comp_name)

    # Move the jigsaw zip file to the Data folder
    for f in os.listdir(src):
        if f.startswith("jigsaw") and path.isfile(path.join(src, f)):
            shutil.move(path.join(src, f), dst)

# Extract train.csv unless it is already there. csv_file is a list of file
# names, so test membership instead of comparing the list to a string.
if 'train.csv' not in csv_file:
    with ZipFile(path.join(dst, zipfile_name), 'r') as zipObj:
        # Show the archive contents, then pull out only the train data
        print(zipObj.namelist())
        zipObj.extract('train.csv', path=dst)

Data is already downloaded


In [8]:
# Load the training set from the Data folder
csv_filename = 'train.csv'
train_data = pd.read_csv(dst + csv_filename, low_memory=False)

# Report how many rows were loaded
row_count = train_data.shape[0]
print("Total rows: {0}".format(row_count))

# List the available column headers
print(train_data.columns.tolist())

Total rows: 1804874
['id', 'target', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability', 'jewish', 'latino', 'male', 'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity', 'other_religion', 'other_sexual_orientation', 'physical_disability', 'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit', 'identity_annotator_count', 'toxicity_annotator_count']


The metadata from the Civil Comments platform is contained in the following columns: created_date, publication_id, parent_id, article_id, rating, funny, wow, sad, likes, and disagree.

I will be keeping these fields in for further exploration and possible use.

In [9]:
# function for counting null records:
def num_missing(x):
    """Return the number of null entries in ``x``.

    ``x`` must provide ``isnull()`` (e.g. a pandas Series). The count uses the
    vectorised ``.sum()`` of the boolean mask rather than the built-in
    ``sum``, which would iterate the mask element by element in Python.
    """
    return x.isnull().sum()

# Count the null records of every column and print them under a heading
print("Missing values per column:", train_data.apply(num_missing, axis=0), sep="\n")

Missing values per column:
id                                           0
target                                       0
comment_text                                 0
severe_toxicity                              0
obscene                                      0
identity_attack                              0
insult                                       0
threat                                       0
asian                                  1399744
atheist                                1399744
bisexual                               1399744
black                                  1399744
buddhist                               1399744
christian                              1399744
female                                 1399744
heterosexual                           1399744
hindu                                  1399744
homosexual_gay_or_lesbian              1399744
intellectual_or_learning_disability    1399744
jewish                                 1399744
latino                           

In [10]:
# Share of missing values per column, as a percentage of all rows
# (easier to compare than raw record counts)

null_counts = train_data.isna().sum()
percent_missing = null_counts * 100 / train_data.shape[0]
print(percent_missing.round(1))

id                                      0.0
target                                  0.0
comment_text                            0.0
severe_toxicity                         0.0
obscene                                 0.0
identity_attack                         0.0
insult                                  0.0
threat                                  0.0
asian                                  77.6
atheist                                77.6
bisexual                               77.6
black                                  77.6
buddhist                               77.6
christian                              77.6
female                                 77.6
heterosexual                           77.6
hindu                                  77.6
homosexual_gay_or_lesbian              77.6
intellectual_or_learning_disability    77.6
jewish                                 77.6
latino                                 77.6
male                                   77.6
muslim                          

In [11]:
# Data type of each column
# (left as the cell's final bare expression so the notebook displays it)

train_data.dtypes

id                                       int64
target                                 float64
comment_text                            object
severe_toxicity                        float64
obscene                                float64
identity_attack                        float64
insult                                 float64
threat                                 float64
asian                                  float64
atheist                                float64
bisexual                               float64
black                                  float64
buddhist                               float64
christian                              float64
female                                 float64
heterosexual                           float64
hindu                                  float64
homosexual_gay_or_lesbian              float64
intellectual_or_learning_disability    float64
jewish                                 float64
latino                                 float64
male         

In [12]:
# Distinct values found in the 'rating' column
train_data['rating'].unique()

array(['rejected', 'approved'], dtype=object)

In [13]:
# Dimensions of the training frame as (rows, columns)
train_data.shape

(1804874, 45)

There are 45 columns in the train dataframe. Of those 45, only three columns, 'comment_text', 'created_date', and 'rating', are objects. The remaining 42 are either float64 or int64 and are non-categorical. 'Comment_text' is a categorical column containing the individual comments that we need to analyze. 'Created_date' contains the original date the comments were created. 'Rating' is a categorical column containing two values: rejected or approved. 

In [14]:
# Preview the first five raw comments
train_data['comment_text'].head(5)

0    This is so cool. It's like, 'would you want yo...
1    Thank you!! This would make my life a lot less...
2    This is such an urgent design problem; kudos t...
3    Is this something I'll be able to install on m...
4                 haha you guys are a bunch of losers.
Name: comment_text, dtype: object

In [15]:
# Distinct toxicity scores present in the 'target' column
# (wrap the call in sorted(...) to view them in ascending order)

train_data['target'].unique()

array([0.        , 0.89361702, 0.66666667, ..., 0.87726476, 0.01116838,
       0.87008821])

Toxicity and identity labels range from 0.0-1.0. The value represents the fraction of raters who believed the label fit the comment. Toxicity labels do not have any missing values. According to the competition details, a subset of comments have been labeled with a variety of identity attributes that have been mentioned in the comment. As such, every identity label is missing ~78% of the values per column. The subset comprises approximately 22% of the data. 

Two examples of how labeling works are as follows:
Example 1: 
    - Comment: I'm a white woman in my late 60's and believe me, they are not too crazy about me either!!
    - Toxicity Labels: All 0.0
    - Identity Mention Labels: female: 1.0, white: 1.0 (all others 0.0)

Example 2: 
    - Comment: Continue to stand strong LGBT community. Yes, indeed, you'll overcome and you have.
    - Toxicity Labels: All 0.0
    - Identity Mention Labels: homosexual_gay_or_lesbian: 0.8, bisexual: 0.6, transgender: 0.3 (all others 0.0)


'Target' is the toxicity label. 'Severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', and 'sexual_explicit' are toxicity sub types. All toxicity labels can be converted to categorical variables by using >= 0.5 as a positive indicator (1). 

Aside from 'id', 'comment_text', 'identity_annotator_count' and 'toxicity_annotator_count', the same conversion can be applied to the remaining identity columns. 'Id' is a unique identifier for each comment but may not hold value to keep in the data frame. 'Identity_annotator_count' and 'toxicity_annotator_count' are metadata columns from Jigsaw and may not hold value either. However, I'm not removing them until I do my exploratory data analysis to determine if they offer valuable insights. 

'Comment_text' will need to be cleaned, vectorized, and eventually used to create a design matrix.  

In [16]:
# Checking for blank string records
# A single vectorised comparison replaces the per-cell lambda passed to
# applymap (slow on a frame this size, and deprecated in recent pandas).
# The result is still the (row indices, column indices) of cells equal to ''.
np.where(train_data == '')

(array([], dtype=int64), array([], dtype=int64))

In [17]:
# Lower end of each column's range (computed for every column of the frame)
print('Minimum value: ')
train_data.min()

Minimum value: 


id                                                                                 59848
target                                                                                 0
comment_text                           Canada is north of the USA border,  its colde...
severe_toxicity                                                                        0
obscene                                                                                0
identity_attack                                                                        0
insult                                                                                 0
threat                                                                                 0
asian                                                                                  0
atheist                                                                                0
bisexual                                                                               0
black                

In [18]:
# Upper end of each column's range (computed for every column of the frame)
print('Maximum value: ')
train_data.max()

Maximum value: 


id                                                           6334010
target                                                             1
comment_text                                         🤣gotta love it!
severe_toxicity                                                    1
obscene                                                            1
identity_attack                                                    1
insult                                                             1
threat                                                             1
asian                                                              1
atheist                                                            1
bisexual                                                           1
black                                                              1
buddhist                                                           1
christian                                                          1
female                            

what can be deleted from messages field?
-stop words, punctuation, emoji, 

lemmatization or stemming
output of wrangling - corpus can be part of a data frame with other cleaned up columns

- Why removing meta-data columns? There has to be a data-science driven reason ...
    1) can't delete columns from dataset without approval from owner of data
    2) just because I don't see a connection doesn't mean there isn't one
    3) how can we use the information that we're given to create a potential feature that measures the level of intention - like or don't like

- Null values implies values that are missing. Is it possible to have values that are missing that are not null?
    1) cases where values are not null but can still be not good
 
- How would we check for data quality?
    1) max, min, range, content, etc.

- There seem to be many columns with about 80% of values that are missing. How do we deal with this?
    1) Could limit the data to the ~20% of the data with identifiers
    2) concentrate on comment_text and response variable initially?
    
    a) would it make sense to compute them myself?

- Do we know what columns are categorical and what columns are non-categorical? 
    1) I would say the identity labels are categorical
    2) target is the response
    3) what would comment_text be?
    non-categorical predominantly - continuous metric
    categorical - comment_text, refers to a class

- Think about ways of persisting the dataset as an output of the wrangling phase, which will be an input to the next phase (storytelling). (Hint ... you can use something like Pickle, or perhaps generate a CSV file)
    1) pickle the data

emoji - encoded to a string as a token (possibility); do search on how to deal with emoji in NLP

packages - nltk, spacy

check if summation of identity labels across rows is equal to 1.0

# NLP Things

unicode remove, freq count is an option, every emoji maps to a unicode character and maybe can map it to a text entity. 


frequency stopwords, manual inspection
think about & determine - punc, stop words, conjunctions

In [19]:
# Add a lower-cased copy of every comment as a new column

train_data['comment_lower'] = train_data.comment_text.str.lower()

In [20]:
# Preview the first five lower-cased comments
train_data.comment_lower.head(5)

0    this is so cool. it's like, 'would you want yo...
1    thank you!! this would make my life a lot less...
2    this is such an urgent design problem; kudos t...
3    is this something i'll be able to install on m...
4                 haha you guys are a bunch of losers.
Name: comment_lower, dtype: object

Instead of removing emojis or emoticons, we'll convert them into words.

In [33]:
# function to convert emoji and emoticons to words

# Cache of the compiled pattern / label table for the most recently used map.
_EMOJI_REPLACER = {}


def _emoji_replacer(emoji_map):
    """Return ``(pattern, names)`` for ``emoji_map``.

    The pair is rebuilt only when a different mapping object is passed;
    in-place changes to an already cached mapping are not detected.
    """
    if _EMOJI_REPLACER.get("source") is not emoji_map:
        names = {
            emoji: "_".join(label.replace(",", "").replace(":", "").split())
            for emoji, label in emoji_map.items()
            if emoji  # an empty key would match between every character
        }
        # Longest keys first so a multi-code-point emoji wins over any
        # shorter emoji it starts with.
        ordered = sorted(names, key=len, reverse=True)
        pattern = re.compile("|".join(map(re.escape, ordered))) if ordered else None
        _EMOJI_REPLACER.update(source=emoji_map, pattern=pattern, names=names)
    return _EMOJI_REPLACER["pattern"], _EMOJI_REPLACER["names"]


def convert_emojis(text, emoji_map=None):
    """Replace every emoji in ``text`` with its underscore-joined description.

    Parameters
    ----------
    text : str
        Text to convert.
    emoji_map : dict, optional
        Maps an emoji string to its description. Commas and colons are
        stripped from the description and its words are joined with ``_``.
        Defaults to ``UNICODE_EMO``.

    Returns
    -------
    str
        ``text`` with each emoji replaced. All keys are matched in one pass,
        longest key first, so a composite emoji is not split by a shorter key.
    """
    if emoji_map is None:
        emoji_map = UNICODE_EMO
    pattern, names = _emoji_replacer(emoji_map)
    if pattern is None:
        return text
    return pattern.sub(lambda found: names[found.group(0)], text)
# Converting emoticons to words
def convert_emoticons(text, emoticon_map=None):
    """Replace every emoticon in ``text`` with its underscore-joined description.

    Parameters
    ----------
    text : str
        Text to convert.
    emoticon_map : dict, optional
        Maps a regular-expression pattern for an emoticon to its description.
        Commas are stripped from the description and its words are joined
        with ``_``. Defaults to ``EMOTICONS``.

    Returns
    -------
    str
        ``text`` with each matched emoticon replaced.
    """
    if emoticon_map is None:
        emoticon_map = EMOTICONS
    # Apply longer patterns first so that a short emoticon pattern cannot
    # consume the beginning of a longer emoticon.
    for pattern in sorted(emoticon_map, key=len, reverse=True):
        label = "_".join(emoticon_map[pattern].replace(",", "").split())
        # A callable replacement inserts the label literally; a plain string
        # would be parsed as a template (backslashes, group references).
        text = re.sub(u'(' + pattern + ')', lambda _found, label=label: label, text)
    return text


In [None]:
# create new comment field with converted emoji and emoticon

# Chain the two conversions: emoticons first, then emojis on that result.
# Reading 'comment_lower' in both statements would make the second assignment
# overwrite (and so discard) the emoticon conversion.
# NOTE(review): the input is already lower-cased; if the emoticon table holds
# patterns with capital letters they cannot match here - confirm.
train_data['comment_transform'] = train_data['comment_lower'].apply(convert_emoticons)
train_data['comment_transform'] = train_data['comment_transform'].apply(convert_emojis)

In [None]:
# Largest value in the converted column (string comparison)
train_data.comment_transform.max()

In [None]:
# Strip accents by reducing the text to plain ASCII characters

def remove_accented_chars(text):
    """Return ``text`` with accented letters replaced by their base letters.

    The text is decomposed with NFKD so accents become separate combining
    marks, then every character that is not ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# Build the cleaned column from the raw comments with accents stripped
train_data.loc[:, 'comment_text_cleaned'] = train_data.loc[:, 'comment_text'].apply(remove_accented_chars)

In [None]:
# removing contractions 

# Cache of the compiled pattern / lookup table for the most recently used map.
_CONTRACTION_MATCHER = {}


def _contraction_matcher(contraction_mapping):
    """Return ``(pattern, lowered)`` for ``contraction_mapping``.

    ``pattern`` matches any key case-insensitively, longest key first;
    ``lowered`` maps the lower-cased keys to their expansions. The pair is
    rebuilt only when a different mapping object is passed; in-place changes
    to an already cached mapping are not detected.
    """
    if _CONTRACTION_MATCHER.get("source") is not contraction_mapping:
        # Longest keys first so a longer contraction is not pre-empted by a
        # shorter key it starts with; escape keys so they match literally.
        keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)
        pattern = None
        if keys:
            pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in keys)),
                                 flags=re.IGNORECASE|re.DOTALL)
        lowered = {key.lower(): value for key, value in contraction_mapping.items()}
        _CONTRACTION_MATCHER.update(source=contraction_mapping, pattern=pattern, lowered=lowered)
    return _CONTRACTION_MATCHER["pattern"], _CONTRACTION_MATCHER["lowered"]


def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand the contractions found in ``text`` and drop remaining apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict
        Maps a contraction to its expanded form. Matching ignores case.

    Returns
    -------
    str
        ``text`` with every matched contraction replaced by its expansion
        (keeping the first character as written in ``text``) and with all
        remaining ``'`` characters removed.
    """
    contractions_pattern, lowered = _contraction_matcher(contraction_mapping)

    def expand_match(contraction):
        match = contraction.group(0)
        # Prefer the exact spelling, then fall back to a case-insensitive lookup
        expanded_contraction = contraction_mapping.get(match) or lowered.get(match.lower())
        if not expanded_contraction:
            # No usable expansion: leave the matched text as it is
            return match
        # Keep the capitalisation of the first character as written
        return match[0] + expanded_contraction[1:]

    expanded_text = text
    if contractions_pattern is not None:
        expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text.replace("'", "")

# Expand contractions in the cleaned comments, in place
train_data.loc[:, 'comment_text_cleaned'] = train_data.loc[:, 'comment_text_cleaned'].apply(expand_contractions)

In [None]:
# Show the cleaned comments
print(train_data['comment_text_cleaned'])

In [None]:
# Checking for blank string records
# A single vectorised comparison replaces the per-cell lambda passed to
# applymap (slow on a frame this size, and deprecated in recent pandas).
# The result is still the (row indices, column indices) of cells equal to ''.
np.where(train_data == '')

In [None]:
# change characters to lower case - DO LAST

# The vectorised .str.lower() is the same call used to build 'comment_lower'
# and leaves missing values as NaN instead of raising on them.
train_data.loc[:,'comment_text_cleaned'] = train_data.loc[:,'comment_text_cleaned'].str.lower()