In [15]:
# import manipulation lybraries
import pandas as pd
import numpy as np 
# import visualization lybraries
import seaborn as sns
import matplotlib.pyplot as plt

#import machine learning lybraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

import pickle
from flaml import AutoML

In [16]:
import pandas as pd

# data ingestion

def data_ingestion():
    df = pd.read_csv(r'C:\Mental_Health_Prediction_Model\data\raw\survey.csv')
    return df

In [17]:
df = data_ingestion()

In [18]:
# check null values
df.isnull().sum()

Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                         515
self_employed                  18
family_history                  0
treatment                       0
work_interfere                264
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1095
dtype: int64

In [19]:
from collections import OrderedDict
# data exploration

def data_exploration(df):

    # select numerical columns 

    numerical_col = df.select_dtypes(exclude='object').columns
    stats = []
    for i in numerical_col:
        Q1 = df[i].quantile(0.25)
        Q3 = df[i].quantile(0.75)
        IQR = Q3 - Q1
        LW = Q1 - 1.5 * IQR
        UW = Q3 + 1.5 * IQR
        outlier_flag = "Has Outliers" if df[(df[i] < LW) | (df[i] > UW)].shape[0] > 0 else "No Outliers"

        numerical_stats = OrderedDict({
            "Feature": i,
            "Minimum": df[i].min(),
            "Maximum": df[i].max(),
            "Mean": df[i].mean(),
            "Median": df[i].median(),
            "Mode": df[i].mode().iloc[0] if not df[i].mode().empty else np.nan,
            "25%": Q1,
            "75%": Q3,
            "IQR": IQR,
            "Standard Deviation": df[i].std(),
            "Skewness": df[i].skew(),
            "Kurtosis": df[i].kurt(),
            "Outlier Comment": outlier_flag
        })
        stats.append(numerical_stats)
        numerical_stats_report = pd.DataFrame(stats)
    return  numerical_stats_report   



In [20]:
# def data preprocessing 

def data_preprocessing(df):
    le = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = le.fit_transform(df[col])

    return df
 


In [21]:
def model_building(df):

    X = df.drop("treatment", axis=1)
    y = df["treatment"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    automl = AutoML()

    settings = {
        "time_budget": 60,  # search time in seconds
        "metric": "accuracy",
        "task": "classification",
        "log_file_name": "flaml.log",
        "seed": 42
    }

    automl.fit(X_train=X_train, y_train=y_train, **settings)

    print("\nBest Model Selected by FLAML:", automl.best_estimator)
    print("Best Hyperparameters:", automl.best_config)

    return automl, scaler, X_test, y_test



In [22]:
def model_evaluation(automl, X_test, y_test):

    y_pred = automl.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return automl

In [23]:
def save_model(automl, scaler):

    pickle.dump(automl, open("best_model.pkl", "wb"))
    pickle.dump(scaler, open("scaler.pkl", "wb"))

    print("\nBest FLAML Model Saved Successfully")

In [24]:
def main():

    filepath = "survey.csv"

    df = data_ingestion(filepath)

    # Perform data exploration, but don't reassign df
    exploration_report = data_exploration(df.copy()) # Use a copy for exploration
    print("\nData Exploration Report:\n", exploration_report)

    # Drop unnecessary columns explicitly in the main function's df
    df.drop(columns=['Country','state','Timestamp','comments'], axis=1, inplace=True)

    df = data_preprocessing(df)

    automl, scaler, X_test, y_test = model_building(df)

    automl = model_evaluation(automl, X_test, y_test)

    save_model(automl, scaler)
if __name__ == "__main__":
    main()

TypeError: data_ingestion() takes 0 positional arguments but 1 was given