In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Notebook-wide presentation defaults: no cap on displayed DataFrame columns,
# a 1000-character display width, and the 'ggplot' matplotlib style.
for option_name, option_value in (('display.max_columns', None), ('display.width', 1000)):
    pd.set_option(option_name, option_value)
plt.style.use('ggplot')

In [None]:
# Emit a one-line status message for the setup stage.
setup_status = "Libraries imported and environment configured."
print(setup_status)

In [None]:
# Where the data lives: which worksheet to read and from which workbook file.
sheet_index = 1  # zero-based position, so this selects the workbook's second sheet
pcos_dataset_filename = 'PCOS_data_without_infertility.xlsx'

In [None]:
# Read the configured worksheet into `df`.
#
# On failure a short hint is printed and the original exception is re-raised.
# Re-raising (instead of calling exit()) matters in a notebook: exit() asks the
# kernel to shut down, which throws away all state and hides the traceback,
# whereas an exception simply stops execution at this cell.
try:
    df = pd.read_excel(pcos_dataset_filename, sheet_name=sheet_index)
except FileNotFoundError:
    print(f"Error: '{pcos_dataset_filename}' not found. Please ensure the Excel file is in the same directory.")
    raise  # halt here; later cells cannot run without df
except ValueError as e:
    # Handles ValueError from read_excel; the message below attributes it to the sheet.
    print(f"Error loading sheet {sheet_index + 1} from '{pcos_dataset_filename}': {e}")
    raise  # halt here; later cells cannot run without df
else:
    # Only report success once the read has actually completed.
    print(f"Dataset '{pcos_dataset_filename}' (Sheet {sheet_index + 1}) loaded successfully.")

In [None]:
# First look at the loaded frame: print its leading rows under a banner.
print("\n--- First 5 rows of the dataset ---")
first_rows = df.head()
print(first_rows)

In [None]:
# Structural overview of the frame, printed under a banner.
info_banner = "\n--- Dataset Information (Columns, Non-Null Counts, Data Types) ---"
print(info_banner)
df.info()  # prints its own report; nothing to capture or display

In [None]:
# Summary statistics from describe(), printed beneath a banner line.
print(
    "\n--- Basic Statistical Summary of Numerical Columns ---",
    df.describe(),
    sep="\n",
)

In [None]:
# Check the distribution of the target variable.
# IMPORTANT: Adjust 'PCOS' if your target column has a different exact name (case-sensitive)
target_column_name = 'PCOS'

if target_column_name not in df.columns:
    # List the real column names so the correct one can be copied in above.
    print(f"Target column '{target_column_name}' not found. Available columns: {df.columns.tolist()}")
    # Raise instead of calling exit(): in a notebook exit() asks the kernel to
    # shut down (losing df and all other state), while an exception just stops
    # execution at this cell with a traceback.
    raise KeyError(target_column_name)

# Absolute class counts.
print(df[target_column_name].value_counts())

# Share of rows whose target value equals 1; .get(1, 0) yields 0 when that
# value never occurs.
positive_share = df[target_column_name].value_counts(normalize=True).get(1, 0)
print(f"Percentage of PCOS cases (1) in the dataset: {positive_share * 100:.2f}%")

In [None]:
# Inspection ahead of cleaning/preprocessing: the dtype of every column.
column_types = df.dtypes
print("\n--- Columns and Data Types ---", column_types, sep="\n")

In [None]:
# Count null entries column by column and print the tally under a banner.
print("\n--- Number of Missing Values per Column ---")
null_mask = df.isnull()
missing_per_column = null_mask.sum()
print(missing_per_column)

In [None]:
# Collect the names of all object-dtype columns into `object_cols` and print them.
print("\n--- Columns with Object (Non-Numeric) Data Types ---")
object_typed_frame = df.select_dtypes(include=['object'])
object_cols = object_typed_frame.columns
print(object_cols)

In [None]:
# For each column listed in `object_cols`, print its name followed by the
# distinct values it contains.
print("\n--- Unique Values in Categorical Columns ---")
for col in object_cols:
    distinct_values = df[col].unique()
    print(f"\nColumn: {col}", distinct_values, sep="\n")