# In [None]:

# Step 1. Import Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob

# For reproducibility: fix the seed of NumPy's global random generator so that
# any step drawing from np.random yields the same values on every run.
# NOTE(review): none of the visible steps call np.random — confirm a later
# cell actually depends on this seed.
np.random.seed(42)

# ---
# Step 2. Load Data from CSV File

# Swap in the path to the real feedback export before running.
df = pd.read_csv(filepath_or_buffer="patient_feedback.csv")

# Preview the first five rows to confirm the file parsed as expected.
print("🔍 Raw Data Sample")
display(df.head(n=5))

# Step 3. Data Understanding (EDA - Quick Check)

print("\n📏 Dataset Shape:", df.shape)
print("\n🧾 Column Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()

print("\n❓ Missing Values Check:")
print(df.isnull().sum())

# Feedback length distribution
# Missing feedback counts as length 0: str(NaN) is "nan", which would
# otherwise be reported as a 3-character comment and skew the histogram.
df["feedback_length"] = df["feedback"].apply(
    lambda x: 0 if pd.isnull(x) else len(str(x))
)
sns.histplot(df["feedback_length"], bins=10, kde=True)
plt.title("Feedback Length Distribution")
plt.show()


# Step 4. Data Preparation (Preprocessing)

def clean_text(text: object) -> str:
    """Normalise one feedback entry to lowercase ASCII letters and single spaces.

    Args:
        text: Raw cell value. Missing values (``None``, ``NaN``, ``pd.NA``)
            are accepted, and non-string values are converted with ``str()``.

    Returns:
        The lowercased text with every character other than ``a``-``z`` and
        whitespace removed (digits, punctuation and non-ASCII letters are
        dropped), runs of whitespace collapsed to one space, and leading and
        trailing whitespace stripped. Missing input yields ``""``.
    """
    # Handle missing values
    if pd.isnull(text):
        return ""
    # A CSV column can be parsed as numbers; coerce so .lower() cannot fail.
    text = str(text).lower()
    # Remove special characters/numbers
    text = re.sub(r"[^a-z\s]", "", text)
    # Remove extra spaces
    return re.sub(r"\s+", " ", text).strip()

# Normalise every raw comment; clean_text turns missing entries into "".
df["clean_feedback"] = df["feedback"].apply(clean_text)


# Sentiment Scoring (TextBlob for demo)
# NOTE(review): clean_text strips apostrophes, so a contraction such as
# "wasn't" reaches TextBlob as "wasnt" — confirm this does not weaken its
# handling of negation.
def _polarity_of(cleaned):
    """Return TextBlob's polarity score for one cleaned comment."""
    return TextBlob(cleaned).sentiment.polarity


def _label_for(polarity):
    """Map a polarity score to "Positive", "Negative" or "Neutral"."""
    if polarity > 0:
        return "Positive"
    if polarity < 0:
        return "Negative"
    return "Neutral"


df["sentiment_polarity"] = df["clean_feedback"].apply(_polarity_of)
df["sentiment_label"] = df["sentiment_polarity"].apply(_label_for)

print("\n✅ After Cleaning & Sentiment Scoring:")
display(df[["patient_id", "feedback", "clean_feedback", "sentiment_label"]])


# Step 5. Data Quality Checks

# 1. Duplicates
# A row counts as a duplicate when it repeats an earlier row in every column.
duplicates = df.duplicated(keep="first").sum()
print(f"🔁 Duplicate Records: {duplicates}")

# 2. Distribution of Sentiments
# One bar per sentiment label, showing how many rows carry that label.
sns.countplot(data=df, x="sentiment_label")
plt.title("Sentiment Distribution")
plt.show()

# 3. Anonymization check (drop patient_id for privacy)
# NOTE(review): only the identifier column is removed; the free-text feedback
# columns are kept — confirm they cannot contain identifying details.
df_anonymized = df.drop(labels=["patient_id"], axis="columns")
print("\n🔒 Anonymized Data Preview:")
display(df_anonymized.head(n=5))

# Step 6. Save Clean Data (Ready for Modeling)

# Write the anonymized frame without the index so the CSV holds only the
# data columns.
df_anonymized.to_csv(path_or_buf="prepared_feedback.csv", index=False)
print("💾 Clean data exported as 'prepared_feedback.csv'")
