# Data Confidentiality in Text Data

Anonymizing sensitive information in text data

In [4]:
import re

# Sample sentence that contains a personal name
text = "My name is Deep Shah."

# Two adjacent capitalised words ("First Last") are treated as a name
name_pattern = r'\b[A-Z][a-z]+ [A-Z][a-z]+\b'

# Compile the pattern, then swap every match for the '[NAME]' placeholder
name_regex = re.compile(name_pattern)
anonymized_text = name_regex.sub('[NAME]', text)

print(anonymized_text)

My name is [NAME].


## Noise Removal in Text Data

In [6]:
import re
import nltk
from nltk.corpus import stopwords
# Make sure the stop-word corpus is available locally
nltk.download('stopwords')

text = "It's a beautiful day!"

# Delete every character that is neither a word character nor whitespace
text_no_punc = re.sub(r'[^\w\s]', '', text)

# Tokenize on whitespace
tokens = text_no_punc.split()

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Keep only the tokens whose lowercase form is not a stop word
filtered_text = list(
    filter(lambda token: token.lower() not in stop_words, tokens)
)

print(filtered_text)

['beautiful', 'day']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deepshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Analyzing a Text Dataset Addressing Missing Data and Noise

In [7]:
import pandas as pd
import re

In [8]:
# Read the qualitative dataset; the relative path is resolved against the
# notebook's working directory
file_path = 'Qualitative data.csv'
data = pd.read_csv(file_path)

# Preview the first five rows
print(data.head(5))

   ID Social Media Activity Questionnaire Completions Product Feedback  \
0   1                Active                 Completed         Positive   
1   2              Inactive             Not Completed          Neutral   
2   3                Active                 Completed         Negative   
3   4                Active                 Completed         Positive   
4   5              Inactive             Not Completed         Positive   

          Notes From Previous Conversations  
0                Interested in new features  
1               Asked about pricing options  
2        Expressed concerns about usability  
3            Recommended product to friends  
4  Asked for additional product information  


### Identify Missing Values

In [10]:
# Count the missing entries in each column (isna is the same check as isnull)
print(data.isna().sum())

ID                                   0
Social Media Activity                0
Questionnaire Completions            0
Product Feedback                     0
Notes From Previous Conversations    0
dtype: int64


### Handle Missing Values

In [11]:
# Strategy 1: replace every missing value with the placeholder 'Missing'.
# The result is assigned back instead of using inplace=True, so the step is an
# explicit rebinding of `data` rather than a hidden mutation of the frame.
data = data.fillna('Missing')

In [12]:
# Strategy 2: drop every row that still contains a missing value.
# Note: if the fill-with-placeholder cell above has already run, no missing
# values remain and this leaves the frame unchanged.
# The result is assigned back instead of using inplace=True, so the step is an
# explicit rebinding of `data` rather than a hidden mutation of the frame.
data = data.dropna()

### Identify and Clean Noise

In [13]:
def clean_noise(text):
    """Strip noise characters from ``text``.

    ASCII letters, whitespace and code points in the range U+1F600-U+1F64F
    (the Unicode "Emoticons" block) are kept. Every other character -- digits,
    punctuation, symbols, accented letters and emoji outside that range -- is
    deleted.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; whitespace left behind by removed characters is
        not collapsed.
    """
    disallowed = re.compile(r'[^a-zA-Z\s\U0001F600-\U0001F64F]')
    return disallowed.sub('', text)

In [14]:
# Run the cleaner over every conversation note and keep the result in a new column
data['Cleaned_Text'] = (
    data['Notes From Previous Conversations']
    .apply(clean_noise)
)

# Preview the first five cleaned notes
print(data['Cleaned_Text'].head(5))

0                  Interested in new features
1                 Asked about pricing options
2          Expressed concerns about usability
3              Recommended product to friends
4    Asked for additional product information
Name: Cleaned_Text, dtype: object


## Advanced Text Preprocessing

### The Lemmatization Algorithm

In [15]:
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this cell depends on so it also runs on a fresh
# environment; quiet=True keeps the cell output limited to the printed sentence.
# NOTE(review): newer NLTK releases appear to load 'punkt_tab' instead of
# 'punkt' -- adjust the resource name if word_tokenize raises a LookupError.
nltk.download('punkt', quiet=True)    # tokenizer models used by word_tokenize
nltk.download('wordnet', quiet=True)  # lexical database used by WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

sentence = "The runners were sprinting during the final few miles of the competition"

# lemmatize() is called without a part-of-speech tag, so every token is treated
# as a noun: plural nouns are reduced ("runners" -> "runner") while verb forms
# such as "were" and "sprinting" are left as they are.
lemmatized = ' '.join(lemmatizer.lemmatize(w) for w in nltk.word_tokenize(sentence))

print(lemmatized)

The runner were sprinting during the final few mile of the competition


### Various Tokenization Techniques

In [18]:
import nltk
from nltk.util import ngrams

sentence = "The goalkeeper made a great save during the game"

# Split the sentence into word tokens
tokens = nltk.word_tokenize(sentence)

# Slide a window of two tokens over the sequence to collect the bigrams
biggrams = [pair for pair in ngrams(tokens, 2)]

print(biggrams)

[('The', 'goalkeeper'), ('goalkeeper', 'made'), ('made', 'a'), ('a', 'great'), ('great', 'save'), ('save', 'during'), ('during', 'the'), ('the', 'game')]


### Language Detection and Handling Specialized Tokenizers

In [21]:
# !pip install langdetect
from langdetect import DetectorFactory, detect
import nltk

from nltk.tokenize import word_tokenize

# langdetect is non-deterministic by default; fixing the seed makes the
# detected language reproducible from run to run.
DetectorFactory.seed = 0

sentence = "Il ne faut rien laisser au hasard."

# Detected language code for the sentence (compared against "fr" below)
lang = detect(sentence)

if lang == "fr":
    # Fetch the Punkt models, then tokenize with the French-specific model
    nltk.download('punkt')
    tokens = word_tokenize(sentence, language="french")
    print(f"Tokens: {tokens}")
else:
    # Make the non-French path visible: without this branch nothing is printed
    # and `tokens` silently keeps whatever value an earlier cell gave it.
    print(f"Detected language {lang!r}; no specialized tokenizer configured.")

Tokens: ['Il', 'ne', 'faut', 'rien', 'laisser', 'au', 'hasard', '.']


[nltk_data] Downloading package punkt to /Users/deepshah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Normalization and Tokenization Techniques

In [22]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[K     |████████████████████████████████| 431 kB 840 kB/s eta 0:00:01
Installing collected packages: emoji
Successfully installed emoji-2.12.1


In [23]:
import pandas as pd
import re
import emoji
import nltk
from nltk.tokenize import word_tokenize
# Fetch the Punkt tokenizer models needed by word_tokenize; as the last
# expression of the cell, the call's return value is shown as the cell output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/deepshah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
# Re-read the dataset from disk; rebinding `data` discards any columns that
# earlier cells added to the previous frame
file_path = 'Qualitative data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [26]:
def normalize_text(text):
    """Normalize free text: spell out emojis, lowercase, drop special characters.

    Steps, in order:
      1. Replace each emoji with its textual name via ``emoji.demojize``,
         padded with a space on each side so it separates from neighbouring
         words.
      2. Lowercase the result. This happens after step 1 so that emoji names
         are lowercased as well.
      3. Remove every character that is not a lowercase ASCII letter, a digit,
         an underscore or whitespace. Underscores are kept because demojized
         names join their words with them (e.g. ``grinning_face``); stripping
         them would fuse the name into one unreadable word.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. Runs of whitespace are not collapsed.
    """
    # Convert emojis to text first, so their names go through the same
    # lowercasing as the rest of the string
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Convert text to lowercase
    text = text.lower()

    # Remove special characters (keeping whitespace and the underscores that
    # hold multi-word emoji names together)
    text = re.sub(r'[^a-z0-9_\s]', '', text)

    return text

# Normalize every conversation note and keep the result in a new column
data['Normalized_Text'] = (
    data['Notes From Previous Conversations']
    .apply(normalize_text)
)

In [27]:
# Tokenize each normalized note with NLTK's word_tokenize.
# (A plain whitespace split, data['Normalized_Text'].str.split(), is the
# simpler alternative.)
data['Tokens'] = (
    data['Normalized_Text']
    .apply(word_tokenize)
)

In [28]:
# Show the normalized text next to its tokens for the first five rows
data[['Normalized_Text', 'Tokens']].head(5)

Unnamed: 0,Normalized_Text,Tokens
0,interested in new features,"[interested, in, new, features]"
1,asked about pricing options,"[asked, about, pricing, options]"
2,expressed concerns about usability,"[expressed, concerns, about, usability]"
3,recommended product to friends,"[recommended, product, to, friends]"
4,asked for additional product information,"[asked, for, additional, product, information]"
