In [None]:
# https://www.kaggle.com/datasets/shahriarkabir/procurement-kpi-analysis-dataset


# ============================
# 📦 Procurement KPI Dataset EDA
# ============================

# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# 2. Load the data
df = pd.read_csv('Procurement_KPI.csv')
print(f"Dataset shape: {df.shape}")
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is visible mid-cell and in scripts.
print(df.head())

# -----------------------------------------
# 🧠 Reasoning:
# - We check if the data loads correctly.
# - We see initial rows and size.
# -----------------------------------------

# 3. Basic Info and Data Types
df.info()

# -----------------------------------------
# 🧠 Reasoning:
# - We identify data types (numerical, categorical, datetime).
# - Important for later cleaning and modeling decisions.
# -----------------------------------------

# 4. Check for Missing Values
missing_values = df.isnull().sum().sort_values(ascending=False)
print(missing_values)

# Share of missing values per column (in %). The strategy below is expressed
# in percentages, so raw counts alone are not enough to apply it.
# `max(len(df), 1)` avoids dividing by zero on an empty frame.
missing_pct = (missing_values / max(len(df), 1) * 100).round(2)
print(missing_pct[missing_pct > 0])

# -----------------------------------------
# 🧠 Reasoning:
# - Missing values can bias the model.
# - Strategy: If missing <5%, impute; if >30%, consider dropping.
# -----------------------------------------

# 5. Check for Duplicates
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Drop duplicates if any (keeps the first occurrence of each repeated row)
df.drop_duplicates(inplace=True)

# -----------------------------------------
# 🧠 Reasoning:
# - Duplicate data can cause models to memorize and overfit.
# - Safe to drop unless specified otherwise.
# -----------------------------------------

# 6. Univariate Analysis
# a) Categorical Columns (Example: 'Category')
if 'Category' in df.columns:
    df['Category'].value_counts().plot(kind='bar', title='Category Distribution')
    plt.show()

# b) Numerical Columns
# Printed explicitly: a bare `df.describe()` in the middle of a cell is
# evaluated and then thrown away.
print(df.describe())

# Plot histograms for numerical features
num_cols = df.select_dtypes(include=[np.number]).columns
# `DataFrame.hist` raises when there is nothing numeric to plot, so skip the
# figure entirely in that case.
if len(num_cols) > 0:
    df[num_cols].hist(figsize=(15, 10), bins=30)
    plt.show()

# -----------------------------------------
# 🧠 Reasoning:
# - See distribution of numerical and categorical features.
# - Detect skewness, rare categories, extreme values.
# -----------------------------------------

# 7. Bivariate Analysis (Feature vs Target)

# Example: Cost Savings vs Contract Value
if 'Cost Savings' in df.columns and 'Contract Value' in df.columns:
    sns.scatterplot(x='Contract Value', y='Cost Savings', data=df)
    plt.title('Cost Savings vs Contract Value')
    plt.show()

# Grouped barplot example for Region (if available)
# Both columns are required: the plot aggregates 'Cost Savings' per 'Region',
# so checking for 'Region' alone would raise KeyError when the target is absent.
if 'Region' in df.columns and 'Cost Savings' in df.columns:
    region_means = df.groupby('Region')['Cost Savings'].mean().sort_values()
    region_means.plot(kind='bar', title='Average Cost Savings by Region')
    plt.show()

# -----------------------------------------
# 🧠 Reasoning:
# - Understand relationships with target variable.
# - Detect important features visually.
# -----------------------------------------

# 8. Multivariate Analysis - Correlation

# Correlation is only defined for numeric columns. Since pandas 2.0,
# `df.corr()` raises on string/object columns instead of silently dropping
# them, so restrict the frame explicitly (this works on every pandas version).
numeric_df = df.select_dtypes(include=[np.number])

plt.figure(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# -----------------------------------------
# 🧠 Reasoning:
# - Find highly correlated features.
# - Remove multicollinearity if |correlation| > 0.8 between two input features
#   (strong negative correlations are just as redundant as positive ones).
# -----------------------------------------

# 9. Feature Engineering

# Example: Create a new feature Savings Percentage
# Note: the value is stored as a fraction (0.05 means 5%), not multiplied by 100.
if 'Cost Savings' in df.columns and 'Contract Value' in df.columns:
    # A zero contract value would produce +/-inf, which later breaks scaling
    # and most estimators; treat it as missing so the ratio becomes NaN instead.
    contract_value = df['Contract Value'].replace(0, np.nan)
    df['Savings Percentage'] = df['Cost Savings'] / contract_value

# Example: Group rare suppliers/categories if needed
if 'Supplier Name' in df.columns:
    supplier_counts = df['Supplier Name'].value_counts()
    # Suppliers with fewer than 5 rows are folded into a single 'Other' label.
    rare_suppliers = supplier_counts[supplier_counts < 5].index
    df['Supplier Name'] = df['Supplier Name'].replace(rare_suppliers, 'Other')

# -----------------------------------------
# 🧠 Reasoning:
# - Sometimes new features are better predictors than raw ones.
# - Handling rare labels avoids data fragmentation.
# -----------------------------------------

# 10. Outlier Detection and Treatment

# Everything in this step reads 'Cost Savings', so the whole step (not just
# the boxplot) is skipped when the column is absent instead of raising KeyError.
if 'Cost Savings' in df.columns:
    # Boxplot example
    sns.boxplot(x=df['Cost Savings'])
    plt.title('Boxplot of Cost Savings')
    plt.show()

    # IQR method to remove extreme outliers
    Q1 = df['Cost Savings'].quantile(0.25)
    Q3 = df['Cost Savings'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Optional: Filter out extreme values
    # Rows where 'Cost Savings' is NaN fail both comparisons and are dropped too.
    df = df[(df['Cost Savings'] >= lower_bound) & (df['Cost Savings'] <= upper_bound)]

# -----------------------------------------
# 🧠 Reasoning:
# - Extreme outliers affect regression models heavily.
# - Removing/capping them improves model robustness.
# -----------------------------------------

# 11. Final Data Cleaning

# a) Encode categorical variables
# One-hot encode every object column; drop_first removes one level per
# feature to avoid perfectly collinear dummy columns.
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# b) Scale numerical variables (Optional for tree models; Required for KNN, SVM, etc.)
scaler = StandardScaler()
scaled_cols = ['Contract Value', 'PO Cycle Time']

# Fit once on all available columns. Re-fitting inside a per-column loop would
# leave `scaler` holding only the statistics of the last column, making it
# useless for transforming new data or inverting the scaling later.
present_cols = [col for col in scaled_cols if col in df.columns]
if present_cols:
    df[present_cols] = scaler.fit_transform(df[present_cols])

# NOTE(review): the scaler is fit on the full dataset here, i.e. before any
# train/test split; for an unbiased evaluation fit it on the training split only.

# -----------------------------------------
# 🧠 Reasoning:
# - Encoding is needed to convert categorical to numerical.
# - Scaling required if distance matters (e.g., KNN, SVM).
# -----------------------------------------

# 12. Model Selection Recommendation

# If simple relationships (linear) ➔ Linear Regression
# If complex, non-linear relationships ➔ Random Forest or XGBoost

# -----------------------------------------
# 🧠 Reasoning:
# - Linear models are simple and interpretable.
# - Tree-based models handle messy, non-linear real-world data better.
# - XGBoost/LightGBM are optimized for structured data competitions.
# -----------------------------------------

print("✅ EDA Completed. Ready for Modeling!")


In [None]:
# https://www.kaggle.com/datasets/zahidmughal2343/global-cancer-patients-2015-2024


# ============================
# 📦 Global Cancer Patients Dataset EDA
# ============================

# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# 2. Load the data
df = pd.read_csv('Global_Cancer_Patients.csv')
print(f"Dataset shape: {df.shape}")
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is visible mid-cell and in scripts.
print(df.head())

# -----------------------------------------
# 🧠 Reasoning:
# - Check if the data loads correctly.
# - Understand initial size and get a feeling for the columns.
# -----------------------------------------

# 3. Basic Info and Data Types
df.info()

# -----------------------------------------
# 🧠 Reasoning:
# - Identify numerical, categorical, datetime columns.
# - Important for choosing correct cleaning methods.
# -----------------------------------------

# 4. Check for Missing Values
missing_values = df.isnull().sum().sort_values(ascending=False)
print(missing_values)

# Share of missing values per column (in %). The strategy below is expressed
# in percentages, so raw counts alone are not enough to apply it.
# `max(len(df), 1)` avoids dividing by zero on an empty frame.
missing_pct = (missing_values / max(len(df), 1) * 100).round(2)
print(missing_pct[missing_pct > 0])

# -----------------------------------------
# 🧠 Reasoning:
# - Missing values can cause errors and bias.
# - Strategy: Impute if <5% missing; drop or engineer if >30% missing.
# -----------------------------------------

# 5. Check for Duplicates
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Drop duplicates if any (keeps the first occurrence of each repeated row)
df.drop_duplicates(inplace=True)

# -----------------------------------------
# 🧠 Reasoning:
# - Duplicated patient records can distort model learning.
# -----------------------------------------

# 6. Univariate Analysis
# a) Categorical Features
# `cat_cols` is reused by the later feature-engineering and encoding steps.
cat_cols = df.select_dtypes(include='object').columns

for col in cat_cols:
    # Count once and reuse the result for both the printout and the plot.
    counts = df[col].value_counts()
    print(counts)
    counts.plot(kind='bar', title=f'{col} Distribution')
    plt.show()

# b) Numerical Features
# Printed explicitly: a bare `df.describe()` in the middle of a cell is
# evaluated and then thrown away.
print(df.describe())

# Histograms
num_cols = df.select_dtypes(include=[np.number]).columns
# `DataFrame.hist` raises when there is nothing numeric to plot, so skip the
# figure entirely in that case.
if len(num_cols) > 0:
    df[num_cols].hist(figsize=(15, 10), bins=30)
    plt.show()

# -----------------------------------------
# 🧠 Reasoning:
# - Analyze the spread, outliers, and distribution of individual features.
# - Important to detect imbalances and skewness.
# -----------------------------------------

# 7. Bivariate Analysis (Feature vs Target)

# "Survival Status" is used as the candidate outcome column (if it exists).
# Each plot is drawn only when every column it needs is present in the frame.

# Age spread within each survival outcome.
if {'Survival Status', 'Age'}.issubset(df.columns):
    sns.boxplot(data=df, x='Survival Status', y='Age')
    plt.title('Age distribution by Survival Status')
    plt.show()

# Outcome counts split by gender.
if {'Survival Status', 'Gender'}.issubset(df.columns):
    sns.countplot(data=df, x='Gender', hue='Survival Status')
    plt.title('Gender vs Survival Status')
    plt.show()

# -----------------------------------------
# 🧠 Reasoning:
# - Check whether age or gender is associated with survival.
# - Shortlist potential predictors.
# -----------------------------------------

# 8. Multivariate Analysis - Correlation Matrix

# Correlation is only defined for numeric columns. Since pandas 2.0,
# `df.corr()` raises on string/object columns instead of silently dropping
# them, so restrict the frame explicitly (this works on every pandas version).
numeric_df = df.select_dtypes(include=[np.number])

plt.figure(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# -----------------------------------------
# 🧠 Reasoning:
# - Identify multicollinearity (high absolute correlation between input features).
# - Important to avoid redundant information.
# -----------------------------------------

# 9. Feature Engineering

# Example ideas:
# - Create Age bins (e.g., Child, Adult, Senior).
# - Combine rare cancer types into 'Other' group.
# - Calculate Treatment Delay (if treatment dates exist).

if 'Age' in df.columns:
    # pd.cut intervals are right-closed, so without include_lowest an age of
    # exactly 0 falls outside (0, 18] and becomes NaN. The open-ended last bin
    # keeps patients older than 100 in 'Senior' instead of turning them into NaN.
    # Negative ages still map to NaN.
    df['Age_Group'] = pd.cut(
        df['Age'],
        bins=[0, 18, 45, 65, np.inf],
        labels=['Child', 'Adult', 'Middle-Aged', 'Senior'],
        include_lowest=True,
    )

# Group rare categories
for col in cat_cols:
    # Count once per column; labels seen fewer than 5 times are merged into 'Other'.
    counts = df[col].value_counts()
    rare_labels = counts[counts < 5].index
    df[col] = df[col].replace(rare_labels, 'Other')

# -----------------------------------------
# 🧠 Reasoning:
# - Engineering meaningful groups can improve model performance.
# - Avoid rare category fragmentation.
# -----------------------------------------

# 10. Outlier Detection and Treatment

# Everything in this step reads 'Age', so the whole step (not just the
# boxplot) is skipped when the column is absent instead of raising KeyError.
if 'Age' in df.columns:
    # Example: Check outliers in Age
    sns.boxplot(x=df['Age'])
    plt.title('Boxplot of Age')
    plt.show()

    # Remove extreme age values if needed (1.5 * IQR rule)
    Q1 = df['Age'].quantile(0.25)
    Q3 = df['Age'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Rows where 'Age' is NaN fail both comparisons and are dropped too.
    df = df[(df['Age'] >= lower_bound) & (df['Age'] <= upper_bound)]

# -----------------------------------------
# 🧠 Reasoning:
# - Remove unrealistic ages (negative ages, 150+ years old patients, etc.).
# -----------------------------------------

# 11. Final Data Cleaning

# a) Encode categorical variables
# Re-detect the columns to encode at this point: `cat_cols` was computed
# before feature engineering, so it misses the categorical 'Age_Group' column
# created by pd.cut, which would otherwise stay un-encoded and break estimators.
encode_cols = df.select_dtypes(include=['object', 'category']).columns
df = pd.get_dummies(df, columns=encode_cols, drop_first=True)

# b) Scale numerical variables (Optional for Tree models; Required for KNN, SVM)
scale_cols = df.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
# StandardScaler rejects input with zero features, so only scale when there
# is at least one numeric column.
if len(scale_cols) > 0:
    df[scale_cols] = scaler.fit_transform(df[scale_cols])

# NOTE(review): every numeric column is standardised here, which includes a
# numeric target column if one is present; exclude the target from
# `scale_cols` before modeling. The scaler is also fit before any train/test
# split; fit it on the training split only for an unbiased evaluation.

# -----------------------------------------
# 🧠 Reasoning:
# - Machine learning algorithms need clean numerical inputs.
# - Scaling ensures features are comparable.
# -----------------------------------------

# 12. Model Selection Recommendation

# If the target is survival (binary classification):
# - Simple data ➔ Logistic Regression
# - Complex patterns ➔ Random Forest, XGBoost

# If regression (e.g., predict Survival time in months):
# - Simple ➔ Linear Regression
# - Complex ➔ Random Forest Regressor, Gradient Boosting

# -----------------------------------------
# 🧠 Reasoning:
# - Choose model depending on problem type (classification or regression).
# - Start simple, then move to ensemble methods if performance is low.
# -----------------------------------------

print("✅ EDA Completed. Ready for Modeling!")


In [None]:
# ============================
# 📦 Machine Learning Modeling Template
# ============================

# 1. Import Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 2. Separate Features and Target

# Define your target variable (replace 'TARGET_COLUMN' with your actual target)
target = 'TARGET_COLUMN'

# Fail early with an actionable message when the placeholder was not replaced
# (same exception type pandas would raise from `drop`, just easier to read).
if target not in df.columns:
    raise KeyError(
        f"Target column '{target}' not found in df. "
        f"Set `target` to one of: {list(df.columns)}"
    )

X = df.drop(target, axis=1)
y = df[target]

# -----------------------------------------
# 🧠 Reasoning:
# - Features (X) are all columns except target.
# - Target (y) is what we want to predict.
# -----------------------------------------

# 3. Train-Test Split

# Decide the task type once so the split and the model choice cannot disagree.
# A non-numeric target (strings/categories) is always classification, however
# many classes it has; a numeric target counts as classification only when it
# has at most 10 distinct values.
is_classification = (not pd.api.types.is_numeric_dtype(y)) or y.nunique() <= 10

# Stratified splitting needs at least two samples of every class; fall back
# to a plain random split otherwise instead of crashing.
can_stratify = is_classification and y.value_counts().min() >= 2
if is_classification and not can_stratify:
    print("Warning: a class has fewer than 2 samples; splitting without stratification.")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y if can_stratify else None
)

# -----------------------------------------
# 🧠 Reasoning:
# - 80/20 split is a standard starting point.
# - Use stratify for classification to maintain class balance.
# -----------------------------------------

# 4. Model Selection Based on Problem Type

# Check if it’s classification or regression
problem_type = 'classification' if is_classification else 'regression'

print(f"Detected Problem Type: {problem_type}")

# -----------------------------------------
# 🧠 Reasoning:
# - If target is non-numeric or has few unique values, it's likely classification.
# - Otherwise, regression.
# -----------------------------------------

# 5. Model Training

# Pick the Random Forest variant that matches the detected problem type.
# Fitting and predicting are the same for both, so they are done once below.
if problem_type == 'classification':
    model = RandomForestClassifier(random_state=42)
else:
    model = RandomForestRegressor(random_state=42)

model.fit(X_train, y_train)

# Predictions on the held-out split
y_pred = model.predict(X_test)

# Evaluation Metrics
if problem_type != 'classification':
    # Regression: error magnitudes plus explained variance.
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"R2 Score: {r2:.4f}")
else:
    # Classification: weighted averages, so multi-class targets are supported.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Optional: ROC AUC if binary classification
    # (uses the predicted probability of the second class in model.classes_)
    if y.nunique() == 2:
        y_proba = model.predict_proba(X_test)[:, 1]
        roc = roc_auc_score(y_test, y_proba)
        print(f"ROC-AUC Score: {roc:.4f}")

# -----------------------------------------
# 🧠 Reasoning:
# - Metrics differ by task: Accuracy, Precision, Recall, F1, ROC-AUC for classification; MAE, RMSE, R2 for regression.
# - Random Forests are robust baselines: they capture non-linearity and expose feature importances.
# -----------------------------------------

# 6. Feature Importance (Optional)

# Only estimators that expose `feature_importances_` can be ranked this way.
if hasattr(model, 'feature_importances_'):
    feature_names = X.columns
    importances = model.feature_importances_

    # Pair each importance with its column name, most important first.
    feat_imp = pd.Series(data=importances, index=feature_names)
    feat_imp = feat_imp.sort_values(ascending=False)

    # Plot Feature Importance
    feat_imp.plot.bar(figsize=(12, 6), title='Feature Importances')
    plt.show()

# -----------------------------------------
# 🧠 Reasoning:
# - Knowing which features matter most helps explain model decisions.
# - Especially important in interviews and production deployments.
# -----------------------------------------

# ✅ Modeling phase completed!


In [None]:
# 📄 One-Page EDA + Modeling Cheat Sheet

# 1. Understand the Problem
- What is the business goal?
- Identify Target Variable.
- Is it Classification (categories) or Regression (continuous numbers)?
🧠 Reasoning: Solve the right problem.

# 2. Load the Data
```python
import pandas as pd
df = pd.read_csv('file.csv')
df.head(), df.shape
```
🧠 Reasoning: Confirm data loads and understand initial size.

# 3. Data Types Check
```python
df.info()
```
🧠 Reasoning: Know which columns are numerical, categorical, or datetime.

# 4. Missing Values
```python
df.isnull().sum()
```
- <5% missing → Impute (mean/median/mode)
- >30% missing → Drop or Engineer.
🧠 Reasoning: Handle missingness early to avoid bias.

# 5. Duplicates
```python
df.duplicated().sum()
df.drop_duplicates(inplace=True)
```
🧠 Reasoning: Prevent the model from overfitting to repeated samples.

# 6. Univariate Analysis
- Numerical: `.describe()`, Histograms
- Categorical: `.value_counts()`, Bar Plots
```python
import matplotlib.pyplot as plt
df.select_dtypes(include=[int, float]).hist(figsize=(15, 10), bins=30)
plt.show()
```
🧠 Reasoning: Understand feature distributions and spot outliers.

# 7. Bivariate Analysis (Feature vs Target)
- Numeric Target ➔ Scatter plots, Correlations
- Categorical Target ➔ Boxplots, GroupBy Mean
```python
import seaborn as sns
sns.scatterplot(x='Feature', y='Target', data=df)
```
🧠 Reasoning: Find strong predictors.

# 8. Multivariate Analysis
```python
# Restrict to numeric columns: since pandas 2.0, df.corr() raises on string/object columns.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
```
🧠 Reasoning: Detect multicollinearity (|r| > 0.8 — strong negative correlations count too).

# 9. Feature Engineering
- Create ratios, bins, new features.
- Group rare labels into "Other".
```python
# Bins are right-closed: include_lowest keeps Age == 0, and the open-ended last bin keeps Age > 100 (both would otherwise become NaN).
df['Age_Group'] = pd.cut(df['Age'], bins=[0, 18, 45, 65, float('inf')], labels=['Child', 'Adult', 'Middle-Aged', 'Senior'], include_lowest=True)
```
🧠 Reasoning: Strong features improve models.

# 10. Outlier Handling
- Boxplots
- IQR Method
```python
Q1 = df['col'].quantile(0.25)
Q3 = df['col'].quantile(0.75)
IQR = Q3 - Q1
df = df[(df['col'] >= Q1-1.5*IQR) & (df['col'] <= Q3+1.5*IQR)]
```
🧠 Reasoning: Remove extreme values that skew models.

# 11. Final Data Prep
- Encode categorical variables
- Scale numerical variables if needed (for KNN, SVM)
```python
from sklearn.preprocessing import StandardScaler
df = pd.get_dummies(df, drop_first=True)
scaler = StandardScaler()
num_cols = df.select_dtypes(include=[float, int]).columns
df[num_cols] = scaler.fit_transform(df[num_cols])
```
🧠 Reasoning: Models require clean numeric input.

# 12. Train-Test Split
```python
from sklearn.model_selection import train_test_split
X = df.drop('TARGET', axis=1)
y = df['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y if y.nunique()<=10 else None, random_state=42)
```
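🧠 Reasoning: Holdout data gives an unbiased evaluation only if it stays unseen — fit scalers, imputers, and encoders on the training split and merely apply them to the test split.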

# 13. Choose Model
| Problem Type | Suggested Model |
|:-------------|:----------------|
| Classification | Random Forest, XGBoost, Logistic Regression |
| Regression | Random Forest Regressor, XGBoost Regressor, Linear Regression |

🧠 Reasoning: Tree-based models handle messy, non-linear real-world data; linear models suit simple, roughly linear relationships.

# 14. Model Training & Evaluation
```python
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Detect Problem Type
problem_type = 'classification' if y.nunique()<=10 else 'regression'

# Train Model
if problem_type == 'classification':
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
    print("Precision:", precision_score(y_test, y_pred, average='weighted'))
    print("Recall:", recall_score(y_test, y_pred, average='weighted'))
    if y.nunique() == 2:
        print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))
else:
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R2:", r2_score(y_test, y_pred))
```
🧠 Reasoning: Use metrics suitable for problem type — accuracy, F1 for classification; MAE, RMSE, R2 for regression.

# 15. Feature Importance (optional)
```python
importances = model.feature_importances_
feature_names = X.columns
feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
feat_imp.plot(kind='bar', figsize=(12,6), title='Feature Importances')
plt.show()
```
🧠 Reasoning: Identify key drivers behind predictions.

---

# 📋 Visual Workflow
```
Understand → Load → Clean → Explore → Engineer → Prepare → Model → Evaluate
```

# 🛡️ Interview Speaking Tip
"When explaining steps, always mention *why* you did something — not just *what* you did. Showing decision-making ability is more important than just technical execution."

✅
