In [1]:
from appointment_bot.features.build_features import load_raw, preprocess

# Load the raw dataset and run the project's preprocessing on it; every later
# cell in this notebook works on the resulting `df`.
# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'appointment_bot'", so the package is
# not importable from the kernel that produced this notebook. Presumably it
# needs to be installed into the kernel's environment (or the notebook run from
# the project root) before anything below can execute - confirm.
# NOTE(review): later cells read the columns 'Age', 'Gender', 'SMS_received',
# 'No-show', 'lead_time_days' and 'AppointmentDay' (through `.dt`) from `df`;
# presumably `preprocess` provides all of them - verify against its source.
df_raw = load_raw()        # reads data/raw/noshow.csv
df = preprocess(df_raw)    # applies all cleaning and feature steps


ModuleNotFoundError: No module named 'appointment_bot'

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def _render(title):
    """Put `title` on the current axes and display the figure."""
    plt.title(title)
    plt.show()


# --- Univariate views -------------------------------------------------------

# Age: histogram
df['Age'].hist()
_render('Distribution of Age')

# Value counts as bar charts, one figure per column
count_plots = (
    ('Gender', 'Gender Distribution'),
    ('SMS_received', 'SMS Received Distribution'),
    ('No-show', 'No-Show Distribution'),
)
for column, title in count_plots:
    df[column].value_counts().plot.bar()
    _render(title)

# --- Lead time against the target -------------------------------------------

sns.boxplot(data=df, x='No-show', y='lead_time_days')
_render('Lead Time by No-Show Status')

# --- Mean of 'No-show' per weekday ------------------------------------------
# NOTE(review): the mean is a no-show *rate* only if 'No-show' is numeric 0/1;
# presumably preprocess encodes it that way - confirm.

appointment_ts = df['AppointmentDay']
df['appointment_day_of_week'] = appointment_ts.dt.day_name()

no_show_by_day = df.groupby('appointment_day_of_week')['No-show'].mean()
# Bars are ordered by value, not by calendar order.
no_show_by_day.sort_values().plot.bar()
_render('No-Show Rates by Day of the Week')

# --- Mean of 'No-show' per hour ---------------------------------------------
# NOTE(review): only informative if AppointmentDay carries a time component;
# otherwise every row lands in the same hour - confirm against the data.

df['appointment_hour'] = appointment_ts.dt.hour

no_show_by_hour = df.groupby('appointment_hour')['No-show'].mean()
no_show_by_hour.sort_index().plot()
_render('No-Show Rates by Hour of the Day')

In [None]:
# Feature engineering: calendar flags, time-of-day and age buckets, and their
# one-hot encodings appended to `df`.

# This cell calls pd.cut / pd.get_dummies / pd.concat, but pandas is not
# imported in any of the visible cells above, so import it here.
import pandas as pd


def _time_of_day(hour):
    """Bucket an hour of day: [6, 12) morning, [12, 18) afternoon,
    [18, 24) evening, anything else 'night'."""
    if 6 <= hour < 12:
        return 'morning'
    if 12 <= hour < 18:
        return 'afternoon'
    if 18 <= hour < 24:
        return 'evening'
    return 'night'


# Derive the calendar columns here as well (same expressions as the plotting
# cell) so this cell does not depend on the plotting cell having been run.
appointment_ts = df['AppointmentDay']
df['appointment_day_of_week'] = appointment_ts.dt.day_name()
df['appointment_hour'] = appointment_ts.dt.hour

# dayofweek is 0 = Monday ... 6 = Sunday, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = appointment_ts.dt.dayofweek.isin([5, 6]).astype(int)
df['time_of_day'] = df['appointment_hour'].apply(_time_of_day)

# Age buckets: [0, 18] young, (18, 65] adult, (65, inf) senior.
# include_lowest=True keeps Age == 0 in 'young', and the open upper edge keeps
# ages above 100 in 'senior'; with bins [0, 18, 65, 100] and the pd.cut
# defaults both fell outside every bin and became NaN (all-zero dummy rows).
# Negative ages, if any survive preprocessing, are still left as NaN.
bins = [0, 18, 65, float('inf')]
labels = ['young', 'adult', 'senior']
df['age_group'] = pd.cut(df['Age'], bins=bins, labels=labels, include_lowest=True)

# One-hot encode categorical variables
day_dummies = pd.get_dummies(df['appointment_day_of_week'], prefix='day')
time_dummies = pd.get_dummies(df['time_of_day'], prefix='time')
age_group_dummies = pd.get_dummies(df['age_group'], prefix='age_group')

# Drop dummy columns left over from a previous run of this cell before
# concatenating, so re-running the cell does not create duplicate columns.
dummy_frames = [day_dummies, time_dummies, age_group_dummies]
stale_columns = [col for frame in dummy_frames for col in frame.columns]
df = pd.concat(
    [df.drop(columns=stale_columns, errors='ignore'), *dummy_frames],
    axis=1,
)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Select features and target.
# NOTE(review): both estimators need numeric inputs, and the evaluation cell
# uses metrics with their default positive label; presumably preprocess encodes
# 'Gender' and 'No-show' numerically - confirm.
features = (
    ['Gender', 'Age', 'SMS_received', 'lead_time_days', 'is_weekend']
    + list(day_dummies.columns)
    + list(time_dummies.columns)
    + list(age_group_dummies.columns)
)
X = df[features]
y = df['No-show']

# Hold out 20% as a test set. stratify=y keeps the class proportions of `y`
# the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Cross-validate on the training split only, so the held-out rows play no part
# in model assessment and the later test-set metrics stay independent.

# Logistic Regression
# max_iter is raised from the default of 100 because the features are not
# scaled and the solver can otherwise stop before converging.
logreg = LogisticRegression(max_iter=1000)
logreg_scores = cross_val_score(logreg, X_train, y_train, cv=5, scoring='accuracy')
print(f"Logistic Regression Accuracy: {logreg_scores.mean():.2f} ± {logreg_scores.std():.2f}")

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Random Forest Accuracy: {rf_scores.mean():.2f} ± {rf_scores.std():.2f}")

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Fit both estimators on the training split, then predict the held-out rows.
logreg.fit(X_train, y_train)
rf.fit(X_train, y_train)

y_pred_logreg = logreg.predict(X_test)
y_pred_rf = rf.predict(X_test)

# (display name, predictions) pairs, in reporting order
model_predictions = (
    ("Logistic Regression", y_pred_logreg),
    ("Random Forest", y_pred_rf),
)

# Confusion matrices for both models first...
for model_name, y_hat in model_predictions:
    print(f"{model_name} Confusion Matrix:\n", confusion_matrix(y_test, y_hat))

# ...then precision / recall / F1 per model, each with its default settings.
metric_functions = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)
for model_name, y_hat in model_predictions:
    for metric_name, metric_fn in metric_functions:
        print(f"{model_name} {metric_name}:", metric_fn(y_test, y_hat))