# Data Confidentiality in Text Data

Anonymizing sensitive information in text data

In [4]:
import re

# Sample sentence that contains a full name
text = "My name is Deep Shah."

# Heuristic: two consecutive capitalized ASCII words are treated as "First Last"
name_pattern = r'\b[A-Z][a-z]+ [A-Z][a-z]+\b'

# Swap every match for the '[NAME]' placeholder
anonymized_text = re.compile(name_pattern).sub('[NAME]', text)

print(anonymized_text)

My name is [NAME].


## Noise Removal in Text Data

In [6]:
import re
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is available locally
nltk.download('stopwords')

text = "It's a beautiful day!"

# Drop every character that is neither a word character nor whitespace
# (this also removes the apostrophe, so "It's" becomes "Its")
text_no_punc = re.sub(r'[^\w\s]', '', text)

# Plain whitespace tokenization
tokens = text_no_punc.split()

# A set gives constant-time membership checks in the filter below
stop_words = set(stopwords.words('english'))

# Keep only tokens whose lowercase form is not an English stopword
filtered_text = [token for token in tokens if token.lower() not in stop_words]

print(filtered_text)

['beautiful', 'day']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deepshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Analyzing a Text Dataset: Addressing Missing Data and Noise

In [7]:
import pandas as pd
import re

In [8]:
# Read the qualitative dataset from a CSV path relative to the working directory
file_path = 'Qualitative data.csv'
data = pd.read_csv(file_path)

# Preview the first five rows as plain text
print(data.head(5))

   ID Social Media Activity Questionnaire Completions Product Feedback  \
0   1                Active                 Completed         Positive   
1   2              Inactive             Not Completed          Neutral   
2   3                Active                 Completed         Negative   
3   4                Active                 Completed         Positive   
4   5              Inactive             Not Completed         Positive   

          Notes From Previous Conversations  
0                Interested in new features  
1               Asked about pricing options  
2        Expressed concerns about usability  
3            Recommended product to friends  
4  Asked for additional product information  


### Identify Missing Values

In [10]:
# Count the missing entries in each column
missing_per_column = data.isna().sum()
print(missing_per_column)

ID                                   0
Social Media Activity                0
Questionnaire Completions            0
Product Feedback                     0
Notes From Previous Conversations    0
dtype: int64


### Handle Missing Values

In [11]:
# Option 1: replace every missing entry in the frame with the literal string 'Missing'
data.fillna(value='Missing', inplace=True)

In [12]:
# Option 2: discard any row that contains at least one missing value
data.dropna(axis=0, how='any', inplace=True)

### Identify and Clean Noise

In [13]:
# Anything outside this whitelist is treated as noise. The whitelist is:
# ASCII letters, whitespace, and the Unicode "Emoticons" block
# (U+1F600-U+1F64F). Emoji outside that block are NOT preserved.
_NOISE_PATTERN = re.compile(r'[^a-zA-Z\s\U0001F600-\U0001F64F]')


def clean_noise(text):
    """Remove noise characters from a piece of text.

    Digits, punctuation, special characters and non-ASCII letters are
    deleted. ASCII letters, whitespace and characters in the Unicode
    Emoticons block (U+1F600-U+1F64F) are kept. Whitespace is left as is,
    so removing characters can leave consecutive or trailing spaces.

    Parameters
    ----------
    text : str or None or float
        The text to clean. Missing values (``None`` or a float NaN, which is
        how pandas represents a missing cell in an object column) are
        accepted so the function can be applied to a column with gaps.

    Returns
    -------
    str
        The cleaned text; for a missing value the input is returned unchanged.

    Raises
    ------
    TypeError
        If ``text`` is neither a string nor a missing value.
    """
    # NaN is the only float that is not equal to itself; pass missing values
    # through untouched instead of letting the regex raise on them.
    if text is None or (isinstance(text, float) and text != text):
        return text
    return _NOISE_PATTERN.sub('', text)

In [14]:
# Run the noise cleaner over every conversation note and keep the result in a new column
notes = data['Notes From Previous Conversations']
data['Cleaned_Text'] = notes.apply(clean_noise)

# Preview the first five cleaned notes
print(data['Cleaned_Text'].head())

0                  Interested in new features
1                 Asked about pricing options
2          Expressed concerns about usability
3              Recommended product to friends
4    Asked for additional product information
Name: Cleaned_Text, dtype: object
