# About the Data

* The data was taken from the CSV file `complaints.csv`.


* The complaints are for the products:<br>

  1. 'Bank account or service'
  2. 'Checking or savings account'
  3. 'Consumer Loan'
  4. 'Credit card or prepaid card'
  5. 'Credit reporting, credit repair services, or other personal consumer reports' 
  6. 'Debt collection'
  7. 'Money transfer/s, virtual currency, or money service'
  8. 'Mortgage'
  9. 'Payday loan, title loan, or personal loan'
  10. 'Student loan'
  11. 'Vehicle loan or lease'


* The data cleaning was done using the spaCy library.


## Next Steps
* Fine-tune DistilBERT on the sample data, keeping only records in which at most 10 percent of the words are masked.

## Google Drive access

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# sample CSV referenced below can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the sample CSV on the mounted Google Drive.
# Edit this to match where the file is stored in your own Drive.
sample_data_path = '/content/drive/MyDrive/SAMPLE_21_APRIL_2022.csv'

# Loading the sample dataset

In [None]:
# Read the sample CSV into a DataFrame, then summarise its columns and dtypes
import pandas as pd
sample_df = pd.read_csv(filepath_or_buffer=sample_data_path)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54994 entries, 0 to 54993
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   consumer_complaint_narrative   54994 non-null  object 
 1   product                        54994 non-null  object 
 2   split_words_whitespaces        54994 non-null  object 
 3   number_of_words                54994 non-null  int64  
 4   number_of_charachters          54994 non-null  int64  
 5   charachters_by_words           54994 non-null  int64  
 6   number_of_unique_words         54994 non-null  int64  
 7   potenial_mask_words            54994 non-null  object 
 8   number_of_potenial_mask_words  54994 non-null  int64  
 9   potenial_mask_words_BY_words   54994 non-null  float64
dtypes: float64(1), int64(5), object(4)
memory usage: 4.2+ MB


In [None]:
import spacy
# Load the spaCy "en_core_web_sm" English pipeline with its default components
# (nothing is disabled or excluded in this call).
nlp = spacy.load("en_core_web_sm")

In [None]:
from tqdm import tqdm
# Register progress_apply on pandas objects; a later cell uses it for the
# text transformation step.
tqdm.pandas()
# Parse every narrative into a spaCy Doc.
# The whole column is streamed through ONE nlp.pipe call so spaCy can batch
# the texts internally. Calling nlp.pipe on a one-element list per row (as was
# done before) pays the per-call overhead for every record and never batches.
# The resulting list is in row order, so positional assignment lines up with
# the DataFrame rows.
sample_df['spacy_doc'] = list(
    tqdm(
        nlp.pipe(sample_df['consumer_complaint_narrative']),
        total=len(sample_df),
    )
)
print("\n\nSpacy Doc Completed")

100%|██████████| 54994/54994 [33:29<00:00, 27.36it/s]



Spacy Doc Completed





In [None]:
# Re-inspect the frame to confirm the 'spacy_doc' column was added
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54994 entries, 0 to 54993
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   consumer_complaint_narrative   54994 non-null  object 
 1   product                        54994 non-null  object 
 2   split_words_whitespaces        54994 non-null  object 
 3   number_of_words                54994 non-null  int64  
 4   number_of_charachters          54994 non-null  int64  
 5   charachters_by_words           54994 non-null  int64  
 6   number_of_unique_words         54994 non-null  int64  
 7   potenial_mask_words            54994 non-null  object 
 8   number_of_potenial_mask_words  54994 non-null  int64  
 9   potenial_mask_words_BY_words   54994 non-null  float64
 10  spacy_doc                      54994 non-null  object 
dtypes: float64(1), int64(5), object(5)
memory usage: 4.6+ MB


In [None]:
#Check the type of the first 'spacy_doc' record
type(sample_df.loc[0,'spacy_doc'])

spacy.tokens.doc.Doc

In [None]:
#Define the strings to mask
# NOTE: every entry must be followed by a comma. Adjacent string literals are
# silently concatenated by Python, which previously merged the nine-X routing
# pattern with 'XXXX' into a single longer entry and dropped both patterns.
mask_words_list =['XX/XX/XXXX','XX-XX-XXXX', #DATE mm/dd/yyyy mm-dd-yyyy
                  'XXXX XXXX XXXX XXXX XXXX','XXXX-XXXX-XXXX-XXXX',#CREDIT or PREPAID CARD NUMBER
                  'XXXX XXXX XXXX XXXX','XXXX XXXX XXXX','XXXX-XXXX-XXXX','XXXX-XXXX','XXXX XXXX',
                  'XXX-XX-XXXX','XXX-XXX','XX-XXXX',
                  'XXXXXXXXXXXXXXXXXX','XXXXXXXXXXXXXXXXX', 'XXXXXXXXXXXXXXXX', 'XXXXXXXXXXXXXXX', 'XXXXXXXXXXXXXX',# BANK ACCOUNT NUMBER
                  'XXXXXXXXXXXXX', 'XXXXXXXXXXXX', 'XXXXXXXXXXX',                                                   # (LONG UNBROKEN RUNS OF REDACTED DIGITS)
                  'XXXXXXXXXX','XXXXXXXXX',         #ROUTING NUMBER IS 9 DIGIT
                  'XXXX','XXX','XX']

In [None]:
# Continue on a separate DataFrame copy so the parsed frame keeps its original columns
sample_df2 = sample_df.copy(deep=True)
sample_df2.info()

In [None]:
import re
# Function to decide what each token becomes after cleaning:
# sensitive tokens are replaced with '<MASK>', stop words and punctuation are dropped
def change_details(word):
    """Return the cleaned replacement text for a single spaCy token.

    Rules, applied in order:
      1. Tokens that look like an e-mail address or URL become '<MASK>'.
      2. Tokens whose text contains any entry of ``mask_words_list``
         (substring match) become '<MASK>'.
      3. Stop words become the empty string.
      4. Punctuation tokens become the empty string, unless the token text
         contains a '.', which is kept so sentence boundaries survive.
      5. Anything else is returned unchanged.

    Parameters
    ----------
    word : token-like object exposing ``text``, ``like_email``, ``like_url``,
        ``is_stop`` and ``is_punct`` (a spaCy ``Token`` in this notebook).

    Returns
    -------
    str
        '<MASK>', '' or the original token text.
    """
    if word.like_email or word.like_url:
        return '<MASK>'
    if any(mask_word in word.text for mask_word in mask_words_list):
        return '<MASK>'
    if word.is_stop:
        return ''
    # A plain substring test replaces re.findall('\.', ...): the un-raw '\.'
    # literal is an invalid escape sequence, and no regex is needed here.
    if '.' not in word.text and word.is_punct:
        return ''
    return word.text


# Function where each token of a spaCy doc is passed through change_details()
def change_article(doc):
    """Rebuild a cleaned, single-line text from a spaCy doc.

    Every token is mapped through ``change_details`` and the results are
    joined with spaces. Whitespace is then normalised:

    * every run of whitespace (spaces, newlines, tabs) collapses to a single
      space in one pass, so removing a newline can no longer leave two
      adjacent spaces behind;
    * the space that the join puts before a '.' is removed;
    * leading and trailing whitespace, left by dropped tokens at either end,
      is stripped.

    Parameters
    ----------
    doc : iterable of tokens accepted by ``change_details``
        (a spaCy ``Doc`` in this notebook).

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives.
    """
    # Passing each token through change_details() function.
    new_tokens = map(change_details, doc)
    new_text = ' '.join(new_tokens)
    # Newlines are handled here together with spaces; stripping them after
    # collapsing spaces would rejoin the spaces on either side of the newline.
    new_text = re.sub(r'\s+', ' ', new_text)
    new_text = new_text.replace(' .', '.')
    return new_text.strip()

In [None]:
#Display the spaCy Doc of the first entry (the narrative before any masking is applied)
sample_df2.loc[0,'spacy_doc']

In [None]:
#Apply change_article to the first spacy_doc as a spot check of the cleaning rules
change_article(sample_df2.loc[0,'spacy_doc'])

' issues wellsfargo accounts 1. bank continue charge overdraft fees online transfers balance <MASK> charged <MASK> fees the last 4 weeks.  2. bank <MASK> double paid employees payroll process direct deposits voided payroll. retrieve funds employees double paid inert debited business account drew account <MASK> records called directed pay employees manually voided payroll. bank refuses accountability.  3. payroll issue bank started taking money rent account cover payroll overdraft created. removed money rent account bounced rent checks business cause rent account overdrawn <MASK>. bank taking accountability feels like want business accounts. bank business financial jeopardy. charged business accounts close <MASK> incorrect overdraft fees the last 8 weeks.'

In [None]:
#Run change_article over every spacy_doc in sample_df2 and store the cleaned text in 'Change_text'
sample_df2['Change_text'] = sample_df2['spacy_doc'].progress_apply(change_article)
print("\n\nText Transformation Completed")

100%|██████████| 54994/54994 [02:58<00:00, 307.38it/s]



Text Transformation Completed





In [None]:
#Tokenise 'Change_text' on whitespace
sample_df2['split_words_whitespaces'] = sample_df2['Change_text'].map(str.split)
#Number of whitespace-separated tokens per record
sample_df2['number_of_words'] = sample_df2['split_words_whitespaces'].map(len)
#Length of 'Change_text' in characters
sample_df2['number_of_charachters'] = sample_df2['Change_text'].map(len)
#Characters per word (floor division)
sample_df2['charachters_by_words'] = sample_df2['number_of_charachters'].floordiv(sample_df2['number_of_words'])
#Number of distinct tokens per record
sample_df2['number_of_unique_words'] = sample_df2['split_words_whitespaces'].map(lambda tokens: len(set(tokens)))
#Occurrences of '<MASK>' in 'Change_text'
sample_df2['number_of_<MASK>'] = sample_df2['Change_text'].map(lambda text: text.count('<MASK>'))
#Share of '<MASK>' occurrences relative to the number of words
sample_df2['<MASK>_BY_WORDS'] = sample_df2['number_of_<MASK>'].div(sample_df2['number_of_words'])

sample_df2.info()

In [None]:
#Keep only the records whose '<MASK>_BY_WORDS' ratio does not exceed 0.1
sample_df4 = sample_df2.loc[sample_df2['<MASK>_BY_WORDS'] <= 0.1]
sample_df4.info()

## Download the Sample Data

In [None]:
#Write the filtered sample to a CSV file in the current working directory (row index omitted)
sample_df4.to_csv(path_or_buf="SAMPLE_DOC_10_PER_MASK.csv", index=False, encoding='utf-8')
print("\n\nDownload Completed")



Download Completed
