In [None]:
import pandas as pd
import numpy as np
import datetime
from time import strftime
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [None]:
# Load the raw dataset from Data.csv (path is relative to the notebook's working directory)
data = pd.read_csv('Data.csv')

In [None]:
data

In [None]:
data.shape

In [None]:
data.info()

In [None]:
# Reduce both timestamp columns to their calendar date, stored as datetime64[ns]
# (the time-of-day component is discarded).
for day_col in ('ScheduledDay', 'AppointmentDay'):
    parsed_days = pd.to_datetime(data[day_col])
    data[day_col] = parsed_days.dt.date.astype('datetime64[ns]')

In [None]:
data.head()

Create weekday columns to identify appointment patterns throughout the week.
For both the scheduled day and the appointment day, store the day of the week in a new column.

In [None]:
data['sch_weekday'] = data['ScheduledDay'].dt.dayofweek  # day of the week with Monday=0 ; Sunday=6

In [None]:
# Day of the week of the appointment itself (Monday=0 ... Sunday=6)
data['app_weekday'] = data['AppointmentDay'].dt.dayofweek

In [None]:
data['sch_weekday'].value_counts()

In [None]:
data['app_weekday'].value_counts()

In [None]:
data.columns

In [None]:
# Correct misspelled / inconsistently styled column names
column_fixes = {
    'Hipertension': 'Hypertension',
    'Handcap': 'Handicap',
    'SMS_received': 'SMSReceived',
    'No-show': 'NoShow',
}
data = data.rename(columns=column_fixes)

In [None]:
data.columns

In [None]:
data.info()

In [None]:
# Drop columns that are not used in the rest of this analysis.
# Re-binding `data` (instead of inplace=True) together with errors='ignore'
# keeps the cell safe to re-run: a second execution is a no-op rather than a
# KeyError for columns that are already gone.
data = data.drop(columns=['PatientId', 'AppointmentID', 'Neighbourhood'], errors='ignore')

In [None]:
data

In [None]:
data.describe()

In [None]:
# Horizontal bar chart of the number of rows in each NoShow class
counts = data['NoShow'].value_counts()
ax = counts.plot(kind='barh', figsize=(5, 3))
ax.set_xlabel("Count", labelpad=14)
ax.set_ylabel("Target Variables", labelpad=14)
ax.set_title("Count Of Target Variables per Category", y=1.02)

# Add the count as a text label next to each bar
label_offset = 500  # horizontal gap (in count units) between bar end and label
for i, v in enumerate(counts):
    ax.text(v + label_offset, i, str(v), va='center')

ax.grid(axis='x', linestyle='--', linewidth=0.7, alpha=0.7)
plt.show()

In [None]:
# Share of appointments per attendance status, as a percentage rounded to 2 decimals
no_show_counts = data['NoShow'].value_counts()
(100 * no_show_counts / len(data)).round(2)

In [None]:
data['NoShow'].value_counts()

In [None]:
# Visual check for missing values: percentage of nulls in every column

# Percentage of null entries per column, reshaped into a two-column frame
null_share = data.isnull().sum() * 100 / data.shape[0]
missing = null_share.rename_axis('Index').reset_index(name='MissingPercentage')

# Point plot: one point per column, y = percentage of missing values
plt.figure(figsize=(14, 7))
ax = sns.pointplot(data=missing, x='Index', y='MissingPercentage')
plt.title("Percentage(%) of Missing Values")
plt.xlabel("Index")
plt.ylabel("PERCENTAGE(%)")
plt.xticks(rotation=60, fontsize=10)
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

# Each point shows how much data is missing in one column, which makes it
# quick to spot the columns that need cleaning.

In [None]:
# Missing Data - Initial Intuition

# Data Cleaning
Create a copy of base data for manipulation

In [None]:
new_data = data.copy()

In [None]:
new_data.info()

As we don't have any null values, data cleaning is not required.

In [None]:
# Largest age present in the data (the actual value is shown in the cell output)
print(data['Age'].max())

In [None]:
# Group ages into 20-year buckets.
# Bins start at 0 so that Age == 0 is assigned a group instead of falling
# outside the first bin and becoming NaN. The top edge is derived from the
# data so the largest age is always covered.
age_bin_width = 20
age_upper_edge = (int(data['Age'].max()) // age_bin_width + 1) * age_bin_width
age_bin_edges = list(range(0, age_upper_edge + 1, age_bin_width))

# Bins are left-closed ([0, 20), [20, 40), ...), so each label shows the
# inclusive range it covers and adjacent labels do not overlap.
labels = ["{0} - {1}".format(lo, lo + age_bin_width - 1) for lo in age_bin_edges[:-1]]

# Ages below 0 still fall outside every bin and are left as NaN.
data['Age_group'] = pd.cut(data['Age'], bins=age_bin_edges, right=False, labels=labels)

In [None]:
# The raw Age column is superseded by Age_group.
# Re-binding with errors='ignore' keeps the cell re-runnable (no KeyError
# when Age has already been dropped).
data = data.drop(columns=['Age'], errors='ignore')

# Data Exploration

In [None]:
list(data.columns)

In [None]:
# For every predictor column: print its value counts, then draw a count plot
# split by the target column (NoShow).
for predictor in data.columns.drop('NoShow'):
    print('-' * 10, predictor, '-' * 10)

    # Value counts of the current predictor
    print(data[predictor].value_counts())

    # A fresh figure per predictor (plt.figure(i) could silently reuse an
    # existing figure with the same number), titled so it stands alone.
    fig, ax = plt.subplots()
    sns.countplot(data=data, x=predictor, hue='NoShow', ax=ax)
    ax.set_title(f'Distribution of {predictor} by NoShow')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    # Show immediately so each plot appears right below its own printout
    plt.show()

In [None]:
# Encode the target as integers: 1 where NoShow == 'Yes', 0 otherwise.
# Guarded so that re-running the cell does not compare the already-encoded
# integers against 'Yes' and silently overwrite the whole column with 0.
if not pd.api.types.is_numeric_dtype(data['NoShow']):
    data['NoShow'] = np.where(data['NoShow'] == 'Yes', 1, 0)

In [None]:
data.NoShow.value_counts()

Convert all Categorical Variables into Dummy Variables

In [None]:
# One-hot encode the categorical (object / category) columns of `data`
data_dummies = pd.get_dummies(data)
# Alternative: pd.get_dummies(data, drop_first=True) drops one level per feature
# to avoid the dummy variable trap (multicollinearity)

data_dummies.head()
# the new indicator columns are binary (0/1, or True/False depending on the pandas version)

- Many machine learning models (like logistic regression, decision trees, etc.) require numeric input only.
- This makes the data ready for model training.

Compute the correlation of every predictor with 'NoShow'.

In [None]:
# Correlation of every numeric / indicator column with the target.
# Only numeric and boolean columns are passed to corr(): get_dummies leaves the
# datetime columns (ScheduledDay, AppointmentDay) in the frame, and whether
# DataFrame.corr() drops or rejects non-numeric columns depends on the pandas
# version. Selecting the dtypes explicitly gives the same result everywhere.
numeric_dummies = data_dummies.select_dtypes(include=['number', 'bool'])
noshow_corr = numeric_dummies.corr()['NoShow'].sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(20, 8))
noshow_corr.plot(kind='bar', ax=ax)
ax.set_title("Correlation of Features with NoShow")
ax.set_ylabel("Correlation")
fig.tight_layout()
plt.show()

In [None]:
# plt.figure(figsize=(20, 8))
# data_dummies.corr()['NoShow'].sort_values(ascending=False).plot(kind='bar')

# plt.title("Correlation of Features with NoShow", fontsize=16)
# plt.ylabel("Correlation", fontsize=12)
# plt.xticks(rotation=45)
# plt.grid(True, linestyle='--', alpha=0.5)
# plt.tight_layout()

# plt.show()


In [None]:
# Heat map of the full correlation matrix.
# Restrict to numeric / boolean columns so corr() does not depend on how the
# installed pandas version treats the remaining datetime columns.
numeric_dummies = data_dummies.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(12, 12))
# Diverging colormap centred on 0: correlations are continuous values in
# [-1, 1], which a qualitative palette such as "Paired" cannot convey.
sns.heatmap(numeric_dummies.corr(), cmap="coolwarm", vmin=-1, vmax=1, center=0)
plt.title("Correlation Matrix of Encoded Features")
plt.show()

# Bivariate Analysis

In [None]:
# Split the rows by the encoded target value: target0 -> NoShow == 0, target1 -> NoShow == 1
new_df1_target0 = data[data["NoShow"].eq(0)]
new_df1_target1 = data[data["NoShow"].eq(1)]

In [None]:
# Custom plotting helper: bar chart of value frequencies
def uniplot(df, col, title, hue=None):
    """Show a log-scaled count plot of ``df[col]``, optionally split by ``hue``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column(s) to plot.
    col : str
        Column whose value frequencies go on the x-axis; categories are
        ordered by descending frequency.
    title : str
        Title placed above the plot.
    hue : str, optional
        Name of a column in ``df`` used to split each category into
        colour-coded bars.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Notes
    -----
    Sets the global seaborn style/context and several matplotlib rcParams,
    which stay in effect for later plots.
    """
    sns.set_style('whitegrid')
    sns.set_context('talk')  # bigger fonts and styling for presentations

    plt.rcParams["axes.labelsize"] = 15
    plt.rcParams['axes.titlesize'] = 18
    plt.rcParams['axes.titlepad'] = 25

    # Figure width grows with the number of x categories and of hue levels.
    # Count the levels of the hue *column*; wrapping the column name itself in
    # a Series always yielded 1, so the width never reacted to the hue.
    n_hue_levels = df[hue].nunique() if hue is not None else 0
    width = len(df[col].unique()) + 7 + 4 * n_hue_levels

    fig, ax = plt.subplots(figsize=(width, 8))
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_yscale('log')  # logarithmic y-axis so small categories stay visible

    # Count plot drawn on the axes created above
    sns.countplot(data=df, x=col, order=df[col].value_counts().index,
                  hue=hue, palette='bright', ax=ax)

    ax.set_title(title)
    plt.show()

In [None]:
# new_df1_target1 holds the NoShow == 1 rows; the plotted column is Hypertension, split by Gender
uniplot(new_df1_target1, col='Hypertension', title='Distribution of Hypertension by Gender for NoShow Patients', hue='Gender')

In [None]:
# new_df1_target0 holds the NoShow == 0 rows (patients who attended), so the title must not say "NoShow"
uniplot(new_df1_target0, col='Age_group', title='Distribution of Age Group by Gender for Patients Who Showed Up', hue='Gender')

# Findings
1. Female patients have taken more appointments than male patients.
2. The ratio of NoShow to Show is almost the same across age groups, with a show rate of about 80% for each age group, except for Age 0 and Age 1.
3. Each Neighbourhood has a show rate of almost 80%.
4. There are 99,666 patients without Scholarship, and around 80% of them came for the visit; out of the 10,861 patients with Scholarship, around 75% came for the visit.
5. There are around 88,726 patients without Hypertension, and around 78% of them came for the visit; out of the 21,801 patients with Hypertension, around 85% came for the visit.
6. There are around 102,584 patients without Diabetes, and around 80% of them came for the visit; out of the 7,943 patients with Diabetes, around 83% came for the visit.
7. There are around 75,045 patients who did not receive an SMS, and around 84% of them came for the visit; out of the 35,482 patients who received an SMS, around 72% came for the visit.
8. There are no appointments on Sunday, and on Saturday there are far fewer appointments than on the other weekdays.
