In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  
import sklearn as skl
import scipy
import statsmodels.api as sm

In [None]:
# Load the raw donor records; the path is relative to the working directory
# and no date parsing is requested, so date columns arrive as read by default.
donor_df = pd.read_csv(filepath_or_buffer='../data/blooddonerdataset.csv')

## Understand the Dataset (Data Familiarization)

### First Look at the Data

In [None]:
# Preview the first rows of the frame (head() shows 5 rows by default).
donor_df.head()

In [None]:
# Preview the last rows of the frame.
donor_df.tail()

In [None]:
# Draw five rows at random; no random_state is given, so the selection
# changes from run to run.
donor_df.sample(5)

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
donor_df.shape

In [None]:
# Labels of all columns.
donor_df.columns

In [None]:
# Row index of the frame.
donor_df.index

### Data Types & Schema

In [None]:
# Storage dtype of every column.
donor_df.dtypes

In [None]:
# Concise summary: index range, non-null count and dtype per column, memory usage.
donor_df.info()

### Understanding Values in Columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments on a mixed-dtype frame only the numeric columns are summarised.
donor_df.describe()

In [None]:
# Frequency of each distinct value, most frequent first; missing values are
# not counted by default. The same pattern is repeated below for the other
# columns of interest.
# Related helpers: .unique() lists the distinct values and .nunique() returns
# how many distinct values there are.
donor_df['blood_group'].value_counts()

In [None]:
donor_df['location_name'].value_counts()

In [None]:
donor_df['donor_status'].value_counts()

In [None]:
# NOTE(review): age is presumably numeric, so this lists one row per distinct age — confirm.
donor_df['age'].value_counts()

In [None]:
donor_df['donor_type'].value_counts()

In [None]:
donor_df['rh_factor'].value_counts()

In [None]:
# NOTE(review): hemoglobin_level is presumably continuous, so most values may occur only once — confirm.
donor_df['hemoglobin_level'].value_counts()

In [None]:
donor_df['health_clearance'].value_counts()

In [None]:
donor_df['temporary_deferral_reason'].value_counts()

In [None]:
donor_df['nearest_hospital'].value_counts()

In [None]:
donor_df['preferred_donation_center'].value_counts()

In [None]:
donor_df['availability_time'].value_counts()

In [None]:
donor_df['preferred_contact_method'].value_counts()

In [None]:
donor_df['verified_status'].value_counts()

In [None]:
donor_df['verified_by'].value_counts()

In [None]:
donor_df['donation_count'].value_counts()

In [None]:
donor_df['donation_frequency_per_year'].value_counts()

In [None]:
donor_df['eligibility_status'].value_counts()

In [None]:
donor_df['eligibility_reason'].value_counts()

In [None]:
donor_df['record_status'].value_counts()

In [None]:
donor_df['response_time_minutes'].value_counts()

### Missing Values

In [None]:
# Number of missing values in each column.
donor_df.isnull().sum()

In [None]:
# Percentage of missing values in each column: the mean of the boolean
# missing-value mask, scaled to 0-100.
donor_df.isna().mean() *100

### Duplicates

In [None]:
# Number of rows that repeat an earlier row across all columns
# (the first occurrence of each row is not counted as a duplicate).
donor_df.duplicated().sum()

### Outliers & Data Range

In [None]:
# Smallest age present in the data.
donor_df["age"].min()

In [None]:
# Largest age present in the data.
donor_df["age"].max()

In [None]:
# 1st and 99th percentiles of age; values outside this band are candidates
# for outlier review. The same percentiles are computed for the other
# numeric columns below.
donor_df["age"].quantile([0.01,0.99])

In [None]:
donor_df["hemoglobin_level"].quantile([0.01,0.99])

In [None]:
donor_df["weight_kg"].quantile([0.01,0.99])

In [None]:
donor_df["donation_count"].quantile([0.01,0.99])

In [None]:
donor_df["donation_frequency_per_year"].quantile([0.01,0.99])

In [None]:
donor_df["response_time_minutes"].quantile([0.01,0.99])

### Relationships Between Columns 

In [None]:
# Pairwise correlation matrix restricted to the numeric columns.
donor_df.corr(numeric_only=True)

In [None]:
# Mean weight, hemoglobin, donation count and yearly donation frequency
# for each distinct age.
donor_df.groupby('age')[['weight_kg','hemoglobin_level','donation_count','donation_frequency_per_year']].mean()

### Validate Data Quality Rules

In [None]:
# True only if every age is >= 0. A missing value compares as False, so any
# NaN in the column also makes this check return False.
(donor_df['age'] >=0 ).all()

In [None]:
# Same non-negativity rule for body weight.
(donor_df['weight_kg'] >=0 ).all()

In [None]:
# Same non-negativity rule for hemoglobin level.
(donor_df['hemoglobin_level'] >=0 ).all()

In [None]:
# Rule: the next eligible date must not precede the last donation date.
# The file is loaded without date parsing and neither column has been
# converted at this point, so both are parsed here (without modifying
# donor_df) to compare real dates rather than raw text.
# Values that cannot be parsed become NaT; the rule is evaluated only on rows
# where both dates are present, so missing dates are not reported as violations.
next_eligible = pd.to_datetime(donor_df['next_eligible_date'], errors='coerce')
last_donation = pd.to_datetime(donor_df['last_donation_date'], errors='coerce')
both_present = next_eligible.notna() & last_donation.notna()
(next_eligible[both_present] >= last_donation[both_present]).all()

In [None]:
# Number of distinct (non-missing) values per column, largest first.
donor_df.nunique().sort_values(ascending=False)

### Frequency & Distribution Shape

In [None]:
# Share of rows in each blood group (relative frequencies instead of counts).
donor_df['blood_group'].value_counts(normalize=True)

In [None]:
# Boolean flag per column: True where the column holds a single distinct
# value, i.e. it is constant.
donor_df.nunique()==1

### Analyze Text Columns

In [None]:
# Summary statistics of the character length of the values in the name column.
donor_df['name'].str.len().describe()

### Row-Wise Analysis

In [None]:
# Summary statistics of the number of missing fields per row (sum along axis=1).
donor_df.isnull().sum(axis=1).describe()

### Feature Interaction Checks

In [None]:
# Contingency table: number of rows for every (first argument, second
# argument) value pair. The cells below repeat this for other column pairs.
pd.crosstab(donor_df['age'],donor_df["blood_group"])

In [None]:
pd.crosstab(donor_df['location_name'],donor_df["blood_group"])

In [None]:
pd.crosstab(donor_df['location_name'],donor_df["donor_status"])

In [None]:
pd.crosstab(donor_df['blood_group'],donor_df["rh_factor"])

In [None]:
pd.crosstab(donor_df['blood_group'],donor_df["health_clearance"])

In [None]:
# NOTE(review): hemoglobin_level is presumably continuous, which would give
# one output column per distinct reading — confirm this table is useful.
pd.crosstab(donor_df['blood_group'],donor_df["hemoglobin_level"])

In [None]:
pd.crosstab(donor_df['blood_group'],donor_df["temporary_deferral_reason"])

In [None]:
pd.crosstab(donor_df['blood_group'],donor_df["nearest_hospital"])

In [None]:
pd.crosstab(donor_df['blood_group'],donor_df["preferred_donation_center"])

In [None]:
pd.crosstab(donor_df['location_name'],donor_df["preferred_donation_center"])

In [None]:
pd.crosstab(donor_df['blood_group'],donor_df["data_source"])

In [None]:
pd.crosstab(donor_df['blood_group'],donor_df["verified_status"])

### Identify Data Leakage

In [None]:
# Correlation of every numeric column with age, strongest positive first.
# NOTE(review): age is the reference column here — confirm it is the intended
# target for this leakage check.
donor_df.corr(numeric_only=True)['age'].sort_values(ascending=False)

### Detect Inconsistent Labels

In [None]:
# Value counts after lower-casing, so labels that differ only in letter case
# are counted together.
donor_df['location_name'].str.lower().value_counts()

### Check Sampling Bias

In [None]:
# Number of rows in each blood group.
donor_df.groupby('blood_group').size()

In [None]:
# Mean of one column within each group of another; the cells below repeat
# this pattern for several (group, value) column pairs.
donor_df.groupby('donor_status')['age'].mean()

In [None]:
donor_df.groupby('nearest_hospital')['donation_count'].mean()

In [None]:
donor_df.groupby('nearest_hospital')['donation_frequency_per_year'].mean()

In [None]:
donor_df.groupby('preferred_donation_center')['donation_count'].mean()

In [None]:
donor_df.groupby('preferred_donation_center')['donation_frequency_per_year'].mean()

In [None]:
donor_df.groupby('rh_factor')['hemoglobin_level'].mean()

In [None]:
donor_df.groupby('blood_group')['hemoglobin_level'].mean()

In [None]:
donor_df.groupby('blood_group')['response_time_minutes'].mean()

In [None]:
donor_df.groupby('nearest_hospital')['response_time_minutes'].mean()

In [None]:
donor_df.groupby('preferred_donation_center')['response_time_minutes'].mean()

In [None]:
donor_df.groupby('preferred_donation_center')['days_since_last_donation'].mean()

### Date & Time Analysis

In [None]:
# Convert last_donation_date to datetimes; values that cannot be parsed are
# turned into NaT rather than raising.
parsed_last_donation = pd.to_datetime(donor_df['last_donation_date'], errors='coerce')
donor_df['last_donation_date'] = parsed_last_donation
# Count of entries that are missing after the conversion.
parsed_last_donation.isna().sum()

In [None]:
# Earliest last-donation date in the data.
donor_df['last_donation_date'].min()

In [None]:
# Latest last-donation date in the data.
donor_df['last_donation_date'].max()

In [None]:
# Order the rows by last donation date, then look at the gap between each
# row's date and the previous row's date.
donor_df = donor_df.sort_values(by='last_donation_date')
date_gaps = donor_df['last_donation_date'].diff()
# The five most frequent gap sizes.
date_gaps.value_counts().head()

### Basic Plot

### Missing Value Plot

In [None]:
# Bar chart of the number of missing values in each column.
missing_per_column = donor_df.isnull().sum()
ax = missing_per_column.plot(kind='bar', figsize=(10,4))
ax.set_title("Missing values per Column")
ax.set_ylabel("Count")
ax.set_xlabel("Columns")
plt.show()

### Uniqueness & Duplicates

In [None]:
# Number of distinct values per column, largest first (same check as in the
# data-quality section, re-run at this point of the notebook).
donor_df.nunique().sort_values(ascending=False)

In [None]:
# Number of fully duplicated rows (same check as in the duplicates section).
donor_df.duplicated().sum()

### Data Consistency Checks

In [None]:
# Keep only rows whose donation_count is zero or positive; rows where the
# count is missing fail the comparison and are removed as well.
has_valid_count = donor_df["donation_count"].ge(0)
donor_df = donor_df.loc[has_valid_count]

In [None]:
# Keep only rows whose next eligible date is on or after the last donation date.
# The file is loaded without date parsing and next_eligible_date is never
# converted anywhere in this notebook, so both columns are parsed here to
# build the mask; comparing the raw column against the parsed
# last_donation_date is not a valid date comparison.
# Only the mask uses the parsed values — the stored columns are left as they are.
# Missing or unparseable dates become NaT, which compares False, so those rows
# are removed.
next_eligible = pd.to_datetime(donor_df["next_eligible_date"], errors="coerce")
last_donation = pd.to_datetime(donor_df["last_donation_date"], errors="coerce")
donor_df = donor_df[next_eligible >= last_donation]

In [None]:
# Same non-negative donation_count constraint as the first check in this
# section; on data that already passed that check it removes nothing further.
donor_df = donor_df[donor_df["donation_count"].ge(0)]

### Range & Constraint Validation

In [None]:
# Keep donors aged 18 to 65, both bounds included; rows with a missing age
# fail both comparisons and are removed.
age = donor_df["age"]
donor_df = donor_df[(age >= 18) & (age <= 65)]

In [None]:
# Keep donors weighing at least 50 kg; rows with a missing weight are removed.
meets_min_weight = donor_df["weight_kg"].ge(50)
donor_df = donor_df[meets_min_weight]

In [None]:
# Keep rows whose hemoglobin level lies in 13-17 (both bounds included) and
# also rows where the reading is missing. between() evaluates to False for
# NaN, so filtering on it alone silently discards every missing reading and
# leaves the hb_missing flag created in the next step False for all rows.
hemoglobin = donor_df["hemoglobin_level"]
donor_df = donor_df[hemoglobin.between(13, 17) | hemoglobin.isna()]

In [None]:
# Add a boolean column marking rows that have no hemoglobin reading,
# then show the new column.
hb_is_missing = donor_df["hemoglobin_level"].isna()
donor_df["hb_missing"] = hb_is_missing
donor_df["hb_missing"]

### Record Uniqueness Validation

In [None]:
# True when every donor_id value occurs exactly once.
donor_df["donor_id"].is_unique

In [None]:
# True when every email value occurs exactly once.
donor_df["email"].is_unique

In [None]:
# Uniqueness of contact_number.
# NOTE(review): if this runs as a notebook cell with default display settings,
# only the final expression (the frame below) is rendered, so this result is
# not shown — confirm.
donor_df["contact_number"].is_unique
donor_df

### Sorting & Index Reset

In [None]:
# Order the records chronologically by registration date and renumber the
# index from 0. registration_date is never converted to datetimes in this
# notebook, so a plain sort would order it as text; the key parses the values
# for ordering only and leaves the stored column untouched.
# Values that cannot be parsed become NaT and are placed last.
donor_df = donor_df.sort_values(
    "registration_date",
    key=lambda col: pd.to_datetime(col, errors="coerce"),
    ignore_index=True,
)
donor_df

### Metadata & Data Dictionary Creation

In [None]:
# Data dictionary: one human-readable description per column, stored in the
# frame's attrs mapping under 'column_descriptions'.
column_descriptions = dict(
    name="Full name of the blood donor or receiver.",
    donor_id="Unique identification number assigned to each donor or receiver.",
    blood_group="ABO blood group of the donor or receiver (A, B, AB, or O).",
    rh_factor="Rhesus (Rh) factor of the blood group (Positive + or Negative -).",
    location_name="Temporary or current address/location of the donor or receiver.",
    latitude="Latitude coordinate of the donor/receiver location.",
    longitude="Longitude coordinate of the donor/receiver location.",
    email="Email address used for communication and notifications.",
    contact_number="Phone number of the donor or receiver.",
    age="Age of the donor or receiver in years.",
    weight_kg="Body weight of the donor in kilograms.",
    hemoglobin_level="Hemoglobin level of the donor (g/dL), used to assess donation eligibility.",
    donor_status="Indicates whether the person is an active donor, inactive donor, or receiver.",
    donor_type="Type of donor (e.g., voluntary, replacement, emergency).",
    last_donation_date="Date when the donor last donated blood.",
    next_eligible_date="Date when the donor becomes eligible for the next blood donation.",
    health_clearance="Medical approval status indicating whether the donor is fit to donate.",
    temporary_deferral_reason="Reason for temporary deferral (e.g., illness, low hemoglobin).",
    nearest_hospital="Name of the closest hospital.",
    preferred_donation_center="Preferred blood donation center selected by the donor.",
    availability_time="Time period when the donor is available for donation.",
    preferred_contact_method="Preferred mode of contact (Call, SMS, Email).",
    data_source="Source of the data (e.g., Mobile App, Hospital Record, Manual Entry).",
    verified_status="Indicates whether donor information has been verified (Yes/No).",
    verified_by="Authority or person who verified the donor information.",
    registration_date="Date when the donor or receiver registered in the system.",
    donation_count="Total number of times the donor has donated blood.",
    donation_frequency_per_year="Average number of donations made per year.",
    days_since_last_donation="Number of days passed since the last blood donation.",
    eligibility_status="Current eligibility of the donor (Eligible / Not Eligible / Temporarily Deferred).",
    eligibility_reason="Reason explaining the eligibility or ineligibility status.",
    distance_to_center_km="Distance (in kilometers) from donor location to the donation center.",
    response_time_minutes="Estimated time (in minutes) taken by donor to respond to a request.",
    record_status="Current status of the record (Active, Inactive, Archived, Deleted).",
)
donor_df.attrs['column_descriptions'] = column_descriptions
donor_df

### Validation Summary Table

In [None]:
# Per-column summary of the current frame: remaining missing values and dtype.
remaining_missing = donor_df.isna().sum()
column_dtypes = donor_df.dtypes
cleaning_report = pd.DataFrame(
    {"missing_affter": remaining_missing, "dtype": column_dtypes}
)
cleaning_report

### Reproducibility Checks

In [None]:
# Rebind donor_df to an independent deep copy of itself, so it no longer
# shares data with the frames it was filtered from.
donor_df = donor_df.copy(deep=True)

### Save Cleaned Version of Data

In [None]:
# Write the cleaned frame to data_clean_v1.csv in the current working
# directory, without the row index.
donor_df.to_csv(path_or_buf="data_clean_v1.csv", index=False)