In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('noshowappointments.csv')

# Display initial information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
print("Initial Data Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# Indicator/count columns that must be stored as integers (names as they are
# after the header normalisation step below).
int_columns = [
    'age',
    'scholarship',
    'hipertension',
    'diabetes',
    'alcoholism',
    'handcap',
    'sms_received',
]

# Clean the data as one chain of non-mutating steps, so re-running the cell
# always starts from the freshly loaded frame and yields the same result.
# Missing values: the report above shows none for this dataset, so nothing is
# imputed. If that changes, a forward fill (`df.ffill()`) is one general option.
df = (
    df
    # Remove duplicate rows
    .drop_duplicates()
    # Standardize text values and convert date columns to datetime
    .assign(
        Gender=lambda frame: frame['Gender'].str.upper(),
        ScheduledDay=lambda frame: pd.to_datetime(frame['ScheduledDay']),
        AppointmentDay=lambda frame: pd.to_datetime(frame['AppointmentDay']),
        # 'No-show' is not a valid Python identifier, so pass it via a dict
        **{'No-show': lambda frame: frame['No-show'].str.upper()},
    )
    # Rename column headers to be lowercase with underscores
    .rename(columns=lambda col: col.strip().lower().replace('-', '_'))
    # Fix data types
    .astype({col: int for col in int_columns})
    # Handle inconsistent data: remove rows with negative age
    .loc[lambda frame: frame['age'] >= 0]
    # Explicit copy so later column assignments on `df` act on an independent
    # frame rather than a filtered selection (avoids SettingWithCopyWarning).
    .copy()
)

# Save the cleaned dataset
df.to_csv('cleaned_noshowappointments.csv', index=False)

print("\nData cleaning complete. Cleaned dataset saved as 'cleaned_noshowappointments.csv'.")

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB
None

Missing Values:
PatientId         0
AppointmentID     0
Gender   