In [None]:
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
# import pandas as pd


: 

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory
kaggle_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for kaggle_file_path in kaggle_file_paths:
    print(kaggle_file_path)


In [None]:
import kagglehub

# Download the latest version of the dataset and keep the returned location
path = kagglehub.dataset_download(
    "akshayas02/menstrual-cycle-data-with-factors-dataset"
)
print("Path to dataset files:", path)


In [None]:
# Load the dataset CSV from the downloaded dataset folder
file_name = "menstrual_cycle_dataset_with_factors.csv"
csv_path = os.path.join(path, file_name)
df = pd.read_csv(csv_path)


In [None]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

In [None]:
# Structural overview: column dtypes and non-null counts (info() prints directly)
df.info()

# A notebook cell only renders its LAST bare expression, so the summary
# statistics must be displayed explicitly or they are silently discarded.
display(df.describe(include="all"))

# Number of missing values per column (rendered as the cell output)
df.isna().sum()

In [None]:

# Convert both date columns to datetime so date arithmetic is possible
for date_col in ("Cycle Start Date", "Next Cycle Start Date"):
    df[date_col] = pd.to_datetime(df[date_col])



In [None]:
# Preview the first five rows after the date conversion
df.head(n=5)

In [None]:
# Target variable: whole days between this cycle's start and the next one's
cycle_gap = df["Next Cycle Start Date"] - df["Cycle Start Date"]
df["days_until_next_period"] = cycle_gap.dt.days

In [None]:
# Preview the first five rows, now including the target column
df.head(n=5)

In [None]:
# Remove "Next Cycle Start Date" now that the target has been derived from it.
# Re-assigning (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: re-running it no longer raises KeyError for the missing column.
df = df.drop(columns=["Next Cycle Start Date"], errors="ignore")
df.head()

In [None]:
# Remove impossible targets (zero or negative day counts).
# .copy() makes the result an independent frame; later cells assign to its
# columns, which would otherwise trigger SettingWithCopyWarning on a slice.
df = df[df["days_until_next_period"] > 0].copy()

In [None]:
# True if at least one cell anywhere in the frame is missing
df.isna().to_numpy().any()

In [None]:
# Heatmap of missing values: each cell of the frame is marked missing / present
import matplotlib.pyplot as plt

missing_mask = df.isna()
heatmap_ax = sns.heatmap(missing_mask, cbar=False)
heatmap_ax.set_title("Missing values heatmap")
plt.show()

In [None]:
# Look at the distribution of cycle lengths to spot extreme values
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(6, 4))
hist_ax = sns.histplot(df["Cycle Length"], kde=True)  # histogram with a KDE overlay
hist_ax.set_title("Distribution of Cycle Length")
hist_ax.set_xlabel("Cycle Length (days)")
plt.show()


In [None]:
# Numeric feature columns; this list (and its order) is reused by later cells
num_cols = ["Age", "BMI", "Sleep Hours", "Cycle Length", "Period Length", "Stress Level"]

# One histogram (with KDE overlay) per numeric column
for col in num_cols:
    _, col_ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, ax=col_ax)
    col_ax.set_title(f"Distribution of {col}")
    col_ax.set_xlabel(col)
    plt.show()


In [None]:
# Boxplot of cycle length to make outliers visible
plt.figure(figsize=(6, 3))
box_ax = sns.boxplot(x=df["Cycle Length"])
box_ax.set_title("Cycle Length Boxplot")
plt.show()


In [None]:
# Clamp each numeric column to a fixed range. Per the original author's note the
# current data already lies inside these bounds; the clipping is there to
# future-proof the code against out-of-range values.
clip_bounds = {
    "Age": (12, 45),
    "BMI": (15, 45),
    "Sleep Hours": (5, 9),
    "Cycle Length": (20, 50),
    "Period Length": (2, 7),
}
for clip_col, (lower_bound, upper_bound) in clip_bounds.items():
    df[clip_col] = df[clip_col].clip(lower_bound, upper_bound)


In [None]:
# Standardise the text labels by lower-casing every categorical column
for label_col in ("Exercise Frequency", "Diet", "Symptoms"):
    df[label_col] = df[label_col].str.lower()


In [None]:
# Preview the first five rows after label standardisation
df.head(n=5)

In [None]:
# Categorical feature columns; reused by later cells
cat_cols = [
    "Exercise Frequency",
    "Diet",
    "Symptoms",
]
# The fill step below is currently disabled:
# df[cat_cols] = df[cat_cols].fillna("Unknown")


In [None]:
# Preview the first five rows of the current frame
df.head(n=5)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns are forwarded unchanged; categorical columns are one-hot
# encoded, with handle_unknown="ignore" set for categories not seen during fit.
numeric_step = ("num", "passthrough", num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Preview the first five rows of the current frame
df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix (numeric + categorical columns) and regression target
X = df[[*num_cols, *cat_cols]]
y = df["days_until_next_period"]

# 80% train / 20% test; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the encoder on the training rows only, then apply it to the test rows
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)


In [None]:
# Names of the columns produced by the fitted preprocessor
feature_names = preprocessor.get_feature_names_out()
print(feature_names)


In [None]:
def _to_dense(matrix):
    """Return ``matrix`` as a dense NumPy array.

    Objects exposing ``toarray`` (e.g. SciPy sparse matrices) are densified
    through it; anything else is passed to ``np.asarray``.
    """
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


# Convert the encoded feature matrices to dense NumPy arrays
X_train_encoded = _to_dense(X_train_encoded)
X_test_encoded = _to_dense(X_test_encoded)

# np.asarray accepts a pandas Series as well as an ndarray, so re-running this
# cell is safe (the previous `.values` access failed once y_* were already arrays).
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)


In [None]:
import tensorflow as tf

# Seed Python's `random`, NumPy and TensorFlow in one call; tf.random.set_seed
# alone leaves the other two generators unseeded, so runs could still differ.
tf.keras.utils.set_random_seed(42)

# Small fully connected regressor: two hidden ReLU layers and one linear output
nn_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train_encoded.shape[1],)),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1)  # regression output = predicted days
])

# Optimise mean squared error; track mean absolute error (in days) as a metric
nn_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="mse",
    metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")]
)

# Stop once validation MAE has not improved for 10 epochs and roll back to the
# best weights seen so far
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_mae",
    patience=10,
    restore_best_weights=True
)

# 20% of the training data is held out internally for validation
history = nn_model.fit(
    X_train_encoded, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)


In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the network on the held-out test set
pred_days = nn_model.predict(X_test_encoded).ravel()
mae_nn = mean_absolute_error(y_true=y_test, y_pred=pred_days)

print("Neural Network MAE (days):", round(mae_nn, 3))


In [None]:
# Baseline: predict that the next period arrives after exactly "Cycle Length" days
baseline_pred = X_test["Cycle Length"].to_numpy(dtype=float)
mae_baseline = mean_absolute_error(y_test, baseline_pred)
nn_gain = mae_baseline - mae_nn

print("Baseline MAE (days):", round(mae_baseline, 3))
print("NN improvement (days):", round(nn_gain, 3))


In [None]:
import matplotlib.pyplot as plt

# Training vs. validation MAE per epoch
_, curve_ax = plt.subplots(figsize=(6, 4))
for metric_key, curve_label in (("mae", "Train MAE"), ("val_mae", "Validation MAE")):
    curve_ax.plot(history.history[metric_key], label=curve_label)
curve_ax.set_xlabel("Epoch")
curve_ax.set_ylabel("MAE (days)")
curve_ax.set_title("Neural Network Training Curve")
curve_ax.legend()
plt.show()


In [None]:
# Side-by-side summary of the two unrounded MAE values
for mae_label, mae_value in (
    ("Baseline MAE:", mae_baseline),
    ("Neural Network MAE:", mae_nn),
):
    print(mae_label, mae_value)


In [None]:
# Share of test predictions that land within a given number of days of the truth
y_pred = nn_model.predict(X_test_encoded).flatten()
abs_error = np.abs(y_test - y_pred)

tolerance = 1  # days
accuracy_1day = np.mean(abs_error <= tolerance)
print(f"Accuracy within ±{tolerance} day:", accuracy_1day)

tolerance = 2  # days
accuracy_2day = np.mean(abs_error <= tolerance)
print(f"Accuracy within ±{tolerance} days:", accuracy_2day)

# Same ±2-day figure, formatted as a percentage
print(f"Accuracy (±2 days): {accuracy_2day * 100:.2f}%")



In [None]:
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

mae_scores = []

# Cross-validation loop
for train_index, val_index in kf.split(X):
    X_train_cv, X_val_cv = X.iloc[train_index], X.iloc[val_index]
    y_train_cv = y.iloc[train_index].to_numpy()
    y_val_cv = y.iloc[val_index].to_numpy()

    # Preprocessing: fit an unfitted CLONE per fold. Re-fitting the shared
    # `preprocessor` (and overwriting `X_train_encoded`) would leave later cells
    # and `nn_model` inference with an encoder the model was not trained on.
    fold_preprocessor = clone(preprocessor)
    X_train_cv_encoded = fold_preprocessor.fit_transform(X_train_cv)
    X_val_cv_encoded = fold_preprocessor.transform(X_val_cv)

    # Densify exactly as the main training pipeline does
    if hasattr(X_train_cv_encoded, "toarray"):
        X_train_cv_encoded = X_train_cv_encoded.toarray()
    if hasattr(X_val_cv_encoded, "toarray"):
        X_val_cv_encoded = X_val_cv_encoded.toarray()
    X_train_cv_encoded = np.asarray(X_train_cv_encoded)
    X_val_cv_encoded = np.asarray(X_val_cv_encoded)

    # Build model (same architecture as nn_model)
    nn_model_cv = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train_cv_encoded.shape[1],)),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1)
    ])
    nn_model_cv.compile(
        optimizer=tf.keras.optimizers.Adam(0.001),
        loss="mse",
        metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")]
    )

    # Train with early stopping
    fold_early_stop = tf.keras.callbacks.EarlyStopping(
        monitor="val_mae",
        patience=10,
        restore_best_weights=True
    )

    # Early stopping watches an internal 20% slice of the TRAINING fold, so the
    # held-out fold is never used for model selection and its score stays unbiased.
    nn_model_cv.fit(
        X_train_cv_encoded, y_train_cv,
        validation_split=0.2,
        epochs=100,
        batch_size=64,
        callbacks=[fold_early_stop],
        verbose=0
    )

    # Evaluate on the untouched held-out fold; evaluate() returns [loss, mae]
    _, val_mae = nn_model_cv.evaluate(X_val_cv_encoded, y_val_cv, verbose=0)
    mae_scores.append(val_mae)


# Cross-validation results
print(f"{n_splits}-Fold Cross-Validation MAE: {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")


In [None]:
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.metrics import classification_report


# Predict days (regression output)
y_pred_days = nn_model.predict(X_test_encoded).flatten()

# Round predictions to nearest day
y_pred_days_rounded = np.rint(y_pred_days).astype(int)

# Ensure true values are integers
y_test_days = np.asarray(y_test).astype(int)

# Day range shown in the matrix: 25-50 by default, widened when the true values
# fall outside it. With a fixed 25-50 label list, any true value outside the
# range was silently left out of both the matrix and the report.
day_min, day_max = 25, 50
if y_test_days.size:
    day_min = min(day_min, int(y_test_days.min()))
    day_max = max(day_max, int(y_test_days.max()))

# Clip predictions to the displayed range so every prediction maps to a label
y_pred_days_rounded = np.clip(y_pred_days_rounded, day_min, day_max)

# Define day labels (one class per whole day in the range)
day_labels = list(range(day_min, day_max + 1))


# Confusion Matrix
cm = confusion_matrix(
    y_test_days,
    y_pred_days_rounded,
    labels=day_labels
)

print("Confusion Matrix shape:", cm.shape)

# Per-day precision / recall / F1; zero_division=0 sets undefined scores to 0
report_dict = classification_report(
    y_test_days,
    y_pred_days_rounded,
    labels=day_labels,
    output_dict=True,
    zero_division=0
)

report_df = pd.DataFrame(report_dict).T
display(report_df)


# Plot Confusion Matrix
plt.figure(figsize=(16,14))
sns.heatmap(
    cm,
    cmap="Blues",
    xticklabels=day_labels,
    yticklabels=day_labels,
    annot=False   # set True only if dataset is small
)

plt.xlabel("Predicted Day")
plt.ylabel("Actual Day")
plt.title(f"Confusion Matrix for Days ({day_min}–{day_max})")
plt.tight_layout()
plt.show()




In [None]:
def predict_next_period_nn(user_input: dict):
    """Predict the number of days until, and the date of, the next cycle start.

    Parameters
    ----------
    user_input : dict
        Must contain the numeric fields "Age", "BMI", "Stress Level",
        "Sleep Hours", "Cycle Length", "Period Length", the categorical fields
        "Exercise Frequency", "Diet", "Symptoms", and "Cycle Start Date"
        (e.g. "YYYY-MM-DD"), which is used to turn the predicted day count
        into a calendar date.

    Returns
    -------
    dict
        "predicted_days_until_next_period": prediction rounded to one decimal
        (never below 1.0), and "predicted_next_cycle_start_date": ISO date
        string obtained by adding the prediction, rounded to the nearest whole
        day, to the cycle start date.

    Raises
    ------
    KeyError
        If any required field is missing; the message lists all of them.

    Notes
    -----
    Relies on the notebook-level ``preprocessor`` and ``nn_model`` objects.
    """
    numeric_fields = (
        "Age", "BMI", "Stress Level", "Sleep Hours", "Cycle Length", "Period Length",
    )
    categorical_fields = ("Exercise Frequency", "Diet", "Symptoms")
    required_fields = numeric_fields + categorical_fields + ("Cycle Start Date",)

    # Report every missing field at once instead of failing on the first lookup
    missing = [field for field in required_fields if field not in user_input]
    if missing:
        raise KeyError(f"user_input is missing required field(s): {missing}")

    row = {field: user_input[field] for field in numeric_fields}
    # Normalise categorical inputs to match training (lower-cased labels)
    for field in categorical_fields:
        row[field] = str(user_input[field]).lower().strip()
    X_one = pd.DataFrame([row])

    X_one_enc = preprocessor.transform(X_one)
    X_one_enc = X_one_enc.toarray() if hasattr(X_one_enc, "toarray") else np.array(X_one_enc)

    pred_days = float(nn_model.predict(X_one_enc, verbose=0).flatten()[0])
    pred_days = max(1.0, pred_days)  # a next period is at least one day away

    cycle_start = pd.to_datetime(user_input["Cycle Start Date"])
    # Round to the nearest whole day before building the date. Adding the
    # fractional value and taking .date() truncated it (28.9 days -> day +28),
    # which disagreed with the day count reported alongside it.
    predicted_date = cycle_start + pd.Timedelta(days=int(round(pred_days)))

    return {
        "predicted_days_until_next_period": round(pred_days, 1),
        "predicted_next_cycle_start_date": str(predicted_date.date())
    }

# Example request: one user's numeric and categorical fields plus a cycle start date
sample_user = {
    # numeric fields
    "Age": 23,
    "BMI": 25,
    "Stress Level": 5,
    "Sleep Hours": 5,
    "Cycle Length": 31,
    "Period Length": 3,
    # categorical fields
    "Exercise Frequency": "moderate",
    "Diet": "balanced",
    "Symptoms": "bloating, mood swings",
    # date the predicted day count is added to
    "Cycle Start Date": "2025-12-04",
}

sample_prediction = predict_next_period_nn(sample_user)
print(sample_prediction)