In [1]:
# Import necessary libraries
import pandas as pd

In [2]:
# Relative path to the CSV that backs this notebook; every later cell reads
# from the DataFrame `df` created here.
file_path = '../data/patient_no_show_dataset_result.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Preview the first five rows as a quick sanity check of the loaded data.
print(df.head(n=5))

   Patient ID  Age      gender    Ethnicity Socioeconomic Status  \
0           1   56      Female  Ethnicity C                  Low   
1           2   69        Male  Ethnicity C                  Low   
2           3   46  Non-Binary  Ethnicity D               Medium   
3           4   32      Female  Ethnicity A                  Low   
4           5   60        Male  Ethnicity A                  Low   

   Distance to Facility  Previous No-shows Appointment No-show prediction  
0                  3.13                  1                  No        Yes  
1                 10.81                  2                 Yes         No  
2                 15.94                  0                 Yes         No  
3                 18.91                  0                  No        Yes  
4                 15.19                  3                  No         No  


In [4]:
# Summarise the DataFrame: index range, per-column non-null counts, dtypes
# and memory usage.
# DataFrame.info() writes this report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Patient ID            1000 non-null   int64  
 1   Age                   1000 non-null   int64  
 2   gender                1000 non-null   object 
 3   Ethnicity             1000 non-null   object 
 4   Socioeconomic Status  1000 non-null   object 
 5   Distance to Facility  1000 non-null   float64
 6   Previous No-shows     1000 non-null   int64  
 7   Appointment No-show   1000 non-null   object 
 8   prediction            1000 non-null   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 70.4+ KB
None


In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; the quartiles are spelled out but match the defaults.
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

        Patient ID          Age  Distance to Facility  Previous No-shows
count  1000.000000  1000.000000           1000.000000        1000.000000
mean    500.500000    50.380000             10.575860           0.975000
std     288.819436    18.378666              5.451377           1.012621
min       1.000000    18.000000              1.030000           0.000000
25%     250.750000    35.000000              5.835000           0.000000
50%     500.500000    50.000000             10.730000           1.000000
75%     750.250000    66.000000             15.375000           2.000000
max    1000.000000    80.000000             19.950000           5.000000


In [6]:
# Report the size of the DataFrame as a (row count, column count) tuple.
print((len(df.index), len(df.columns)))

(1000, 9)


In [7]:
# Show the column labels of the DataFrame.
column_labels = df.columns
print(column_labels)

Index(['Patient ID', 'Age', 'gender', 'Ethnicity', 'Socioeconomic Status',
       'Distance to Facility', 'Previous No-shows', 'Appointment No-show',
       'prediction'],
      dtype='object')


In [8]:
# Show the dtype pandas inferred for every column.
column_dtypes = df.dtypes
print(column_dtypes)

Patient ID                int64
Age                       int64
gender                   object
Ethnicity                object
Socioeconomic Status     object
Distance to Facility    float64
Previous No-shows         int64
Appointment No-show      object
prediction               object
dtype: object


In [9]:
# Distribution of a categorical column: number of rows per gender value.
gender_column = df['gender']
print(gender_column.value_counts())

gender
Female        454
Male          450
Non-Binary     96
Name: count, dtype: int64


In [10]:
# Report how many distinct (non-null) values each column holds.
# A single frame-level nunique() call yields one count per column, which is
# then walked as (column label, count) pairs.
for column, num_distinct_types in df.nunique().items():
    print(f"Number of distinct types in {column}: {num_distinct_types}")

Number of distinct types in Patient ID: 1000
Number of distinct types in Age: 63
Number of distinct types in gender: 3
Number of distinct types in Ethnicity: 4
Number of distinct types in Socioeconomic Status: 3
Number of distinct types in Distance to Facility: 765
Number of distinct types in Previous No-shows: 6
Number of distinct types in Appointment No-show: 2
Number of distinct types in prediction: 2


In [11]:
# Print the frequency of each distinct value, one column at a time.
# Iterating over a DataFrame yields its column labels.
for column_name in df:
    print(df[column_name].value_counts())

Patient ID
1000    1
1       1
2       1
3       1
4       1
       ..
12      1
13      1
14      1
15      1
16      1
Name: count, Length: 1000, dtype: int64
Age
79    29
75    27
77    25
52    25
45    24
      ..
55     9
27     9
51     8
73     8
60     7
Name: count, Length: 63, dtype: int64
gender
Female        454
Male          450
Non-Binary     96
Name: count, dtype: int64
Ethnicity
Ethnicity A    391
Ethnicity B    315
Ethnicity C    201
Ethnicity D     93
Name: count, dtype: int64
Socioeconomic Status
High      345
Low       328
Medium    327
Name: count, dtype: int64
Distance to Facility
8.88     4
18.91    4
1.86     4
10.75    4
18.86    3
        ..
5.17     1
1.39     1
7.72     1
6.75     1
4.19     1
Name: count, Length: 765, dtype: int64
Previous No-shows
0    392
1    348
2    176
3     64
4     17
5      3
Name: count, dtype: int64
Appointment No-show
Yes    587
No     413
Name: count, dtype: int64
prediction
No     546
Yes    454
Name: count, dtype: int64


In [12]:
# Flag, per column, whether at least one entry is missing (True means the
# column contains a null value).
df.isna().any()

Patient ID              False
Age                     False
gender                  False
Ethnicity               False
Socioeconomic Status    False
Distance to Facility    False
Previous No-shows       False
Appointment No-show     False
prediction              False
dtype: bool