In [64]:
from pathlib import Path

import kagglehub
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [65]:
# Use matplotlib's "default" style sheet for every figure drawn below.
plt.style.use("default")

In [66]:
def yes_no_to_bin(df: pd.DataFrame, colnames: list[str]):
    """
    Converts yes/no answer columns to 1/0 integers, in place.

    A cell becomes 1 when its text equals "YES" after surrounding whitespace
    is stripped and case is ignored (so "YES ", " yes" and "Yes" all count);
    every other value, including missing ones, becomes 0.

    Parameters
    ----------
    df : DataFrame whose columns are overwritten.
    colnames : names of the columns to convert.

    Returns
    -------
    None; `df` is modified in place.
    """
    for col in colnames:
        # Normalise first: an exact `== "YES"` comparison would silently
        # encode values such as "YES " (trailing space) as 0.
        normalized = df[col].astype(str).str.strip().str.upper()
        df[col] = (normalized == "YES").astype(int)

def n_from_10_to_n(df: pd.DataFrame, colnames: list[str]):
    """
    Turns rating strings such as "3 From 10" into the integer 3, in place.

    For every column named in `colnames`, the text " From 10" is removed
    from each value and the remainder is cast to int. Returns None.
    """
    marker = " From 10"
    for name in colnames:
        digits_only = df[name].str.replace(marker, "")
        df[name] = digits_only.astype(int)

def to_dummies(df: pd.DataFrame, colnames: list[str]) -> pd.DataFrame:
    """
    Replaces each column in `colnames` with 0/1 indicator columns.

    Indicator columns are named "<column>_<value>" and are appended after
    the remaining original columns, in the order given by `colnames`.

    Parameters
    ----------
    df : source frame; it is not modified.
    colnames : names of the columns to one-hot encode.

    Returns
    -------
    A new DataFrame without the columns in `colnames` and with their
    integer indicator columns added. A new object is returned even when
    `colnames` is empty, so callers can mutate the result safely.
    """
    indicator_frames = [
        pd.get_dummies(df[col], prefix=col).astype(int) for col in colnames
    ]
    # One concat instead of one per column: avoids re-copying the growing
    # frame on every iteration.
    expanded = pd.concat([df, *indicator_frames], axis=1)
    return expanded.drop(columns=colnames)

def seldom_often_to_1_4(df: pd.DataFrame, colnames: list[str]):
    """
    Encodes frequency labels as ordinal numbers, in place.

    "Seldom" -> 1, "Sometimes" -> 2, "Usually" -> 3, "Most-Often" -> 4.
    Any value that is not one of these four labels becomes NaN.
    Returns None.
    """
    level_codes = {
        "Seldom": 1,
        "Sometimes": 2,
        "Usually": 3,
        "Most-Often": 4,
    }
    for name in colnames:
        encoded = df[name].map(level_codes)
        df[name] = encoded

In [68]:
# Download the Kaggle dataset; `path` is the local directory holding its files.
path = kagglehub.dataset_download("mdsultanulislamovi/mental-disorders-dataset")
print(path)
dataset = pd.read_csv(Path(path) / "mental_disorders_dataset.csv")

# "N From 10" rating strings -> integer N.
n_from_10_to_n(dataset, [
    "Sexual Activity",
    "Concentration",
    "Optimisim",
])

# "YES" answers -> 1, everything else -> 0.
yes_no_to_bin(dataset, [
    "Mood Swing",
    "Suicidal thoughts",
    "Anorxia",
    "Authority Respect",
    "Try-Explanation",
    "Aggressive Response",
    "Ignore & Move-On",
    "Nervous Break-down",
    "Admit Mistakes",
    "Overthinking",
])

# Frequency labels ("Seldom" ... "Most-Often") -> ordinal 1..4.
ordinal_columns = [
    "Sadness",
    "Euphoric",
    "Exhausted",
    "Sleep dissorder",
]
seldom_often_to_1_4(dataset, ordinal_columns)
# The mapping yields NaN for any label outside the four known levels;
# fail loudly here instead of carrying silent gaps into the split files.
assert dataset[ordinal_columns].notna().all().all(), "unexpected frequency label"

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
train_dataset, test_dataset = train_test_split(dataset, test_size=0.25, random_state=42)

# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing (e.g. on a fresh checkout).
output_dir = Path("dataset")
output_dir.mkdir(parents=True, exist_ok=True)
train_dataset.to_csv(output_dir / "train.csv", index=False)
test_dataset.to_csv(output_dir / "test.csv", index=False)

# Feature matrix: everything except the target and the patient identifier.
X = dataset.drop(columns=["Expert Diagnose", "Patient Number"])

dataset.info()

/home/dom/.cache/kagglehub/datasets/mdsultanulislamovi/mental-disorders-dataset/versions/1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Patient Number       120 non-null    object
 1   Sadness              120 non-null    int64 
 2   Euphoric             120 non-null    int64 
 3   Exhausted            120 non-null    int64 
 4   Sleep dissorder      120 non-null    int64 
 5   Mood Swing           120 non-null    int64 
 6   Suicidal thoughts    120 non-null    int64 
 7   Anorxia              120 non-null    int64 
 8   Authority Respect    120 non-null    int64 
 9   Try-Explanation      120 non-null    int64 
 10  Aggressive Response  120 non-null    int64 
 11  Ignore & Move-On     120 non-null    int64 
 12  Nervous Break-down   120 non-null    int64 
 13  Admit Mistakes       120 non-null    int64 
 14  Overthinking   

In [None]:
# Preview the first rows of the frame after the conversions above.
dataset.head()

In [None]:
# Bar chart of how many rows carry each diagnosis, in a fixed display order.
order = ["Normal", "Depression", "Bipolar Type-1", "Bipolar Type-2"]
# reindex (unlike .loc) keeps a zero-height bar for a class with no rows
# instead of raising KeyError.
diagnosis_counts = dataset["Expert Diagnose"].value_counts().reindex(order, fill_value=0)
ax = diagnosis_counts.plot.bar()
ax.set_title("Expert Diagnose distribution")
ax.set_xlabel("Expert Diagnose")
ax.set_ylabel("Number of rows")
plt.show()

In [None]:
# Distribution of the four frequency-scale columns (levels 1-4), one panel each.
fig, axs = plt.subplots(2, 2, figsize=(7, 7))
axs = axs.flatten()

order = [1, 2, 3, 4]
frequency_columns = ["Sadness", "Euphoric", "Exhausted", "Sleep dissorder"]

for ax, column in zip(axs, frequency_columns):
    # reindex keeps all four levels on the x-axis (count 0 when a level never
    # occurs), so the panels stay comparable and a missing level cannot raise.
    level_counts = dataset[column].value_counts().reindex(order, fill_value=0)
    level_counts.plot.bar(ax=ax)
    # Without a title the four panels are indistinguishable.
    ax.set_title(column)
    ax.set_xlabel("level (1 = Seldom, 4 = Most-Often)")
    ax.set_ylabel("count")

plt.tight_layout()
plt.show()

In [None]:
_, axs = plt.subplots(1, 3, figsize=(12, 4))
axs = axs.flatten()

# One bar chart per integer rating column, left to right.
rating_columns = ("Sexual Activity", "Concentration", "Optimisim")
for panel, rating_column in zip(axs, rating_columns):
    counts = dataset[rating_column].value_counts()
    panel.bar(counts.index, counts.values)
    panel.set_title(rating_column)

plt.show()

In [None]:
def calculate_vif(df: pd.DataFrame, add_intercept: bool = True):
    """
    Prints the variance inflation factor (VIF) of every column in `df`.

    VIF_i = 1 / (1 - R_i^2), where R_i^2 comes from regressing column i on
    all other columns of the design matrix.

    Parameters
    ----------
    df : numeric feature frame; it is not modified.
    add_intercept : when True (default) a constant column is added to the
        design matrix before the VIFs are computed. statsmodels'
        `variance_inflation_factor` fits the auxiliary regressions on exactly
        the matrix it is given; without a constant the R^2 is uncentered and
        the VIF of any feature with a non-zero mean is inflated. Pass False
        to get the raw no-intercept values.

    Returns
    -------
    None; the feature/VIF table is printed.
    """
    design = df.copy()
    offset = 0
    if add_intercept:
        # Put the constant first so feature i sits at position i + 1.
        design.insert(0, "__intercept__", 1.0)
        offset = 1

    exog = design.values
    vif_data = pd.DataFrame({
        "feature": df.columns,
        # The intercept column itself is skipped: only real features are reported.
        "VIF": [
            variance_inflation_factor(exog, position + offset)
            for position in range(df.shape[1])
        ],
    })
    print(vif_data)

In [None]:
# VIF for every feature in X (baseline, nothing removed).
calculate_vif(X)

In [None]:
# VIF again, this time with "Sexual Activity" left out of the feature set.
features_without_sexual_activity = X.drop(columns=["Sexual Activity"])
calculate_vif(features_without_sexual_activity)

In [None]:
# VIF once more, with both "Sexual Activity" and "Exhausted" left out.
excluded_features = ["Sexual Activity", "Exhausted"]
features_reduced = X.drop(columns=excluded_features)
calculate_vif(features_reduced)

In [None]:
# Pairwise feature correlations, first as a heat map and then as a mask of
# the pairs whose absolute correlation reaches the threshold.
CORR_THRESHOLD = 0.4
corr = X.corr()  # computed once and reused for both figures
tick_positions = range(len(corr.columns))

plt.matshow(corr)
plt.colorbar()
# Label both axes with feature names; otherwise cells cannot be matched to columns.
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, corr.columns)
plt.title("Feature correlation")

plt.matshow(corr.abs() >= CORR_THRESHOLD)
plt.xticks(tick_positions, corr.columns, rotation=90)
plt.yticks(tick_positions, corr.columns)
plt.title(f"|correlation| >= {CORR_THRESHOLD}")
plt.show()

In [None]:
# Grid of pairwise scatter plots over all columns of X.
pd.plotting.scatter_matrix(X, figsize=(12, 12))