In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [35]:
file_path = "dataset/schizophrenia_dataset.csv"  
df = pd.read_csv(file_path)

**1. Apa saja Fitur dataset dan tipe data masing-masing**

In [36]:
print("Informasi Dataset:")
print(df.info())

Informasi Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Patient_ID              10000 non-null  int64
 1   Age                     10000 non-null  int64
 2   Gender                  10000 non-null  int64
 3   Education_Level         10000 non-null  int64
 4   Marital_Status          10000 non-null  int64
 5   Occupation              10000 non-null  int64
 6   Income_Level            10000 non-null  int64
 7   Living_Area             10000 non-null  int64
 8   Diagnosis               10000 non-null  int64
 9   Disease_Duration        10000 non-null  int64
 10  Hospitalizations        10000 non-null  int64
 11  Family_History          10000 non-null  int64
 12  Substance_Use           10000 non-null  int64
 13  Suicide_Attempt         10000 non-null  int64
 14  Positive_Symptom_Score  10000 non-null  int64
 15  N

In [37]:
print("\nPreview Dataset:")
print(df.head())


Preview Dataset:
   Patient_ID  Age  Gender  Education_Level  Marital_Status  Occupation  \
0           1   72       1                4               2           0   
1           2   49       1                5               2           2   
2           3   53       1                5               3           2   
3           4   67       1                3               2           0   
4           5   54       0                1               2           0   

   Income_Level  Living_Area  Diagnosis  Disease_Duration  Hospitalizations  \
0             2            1          0                 0                 0   
1             1            0          1                35                 1   
2             1            0          1                32                 0   
3             2            0          0                 0                 0   
4             2            1          0                 0                 0   

   Family_History  Substance_Use  Suicide_Attempt  Posit

**2. Menentukan Fitur target (jika ada) dan fitur variabel bebas**

In [38]:
#cek kolom yg mengandung kata diagnosis atau label
possible_targets = [col for col in df.columns if 'diagnosis' in col.lower() or 'label' in col.lower()] 
# jika kolom mengandung kata diagnosis atau label, maka dijadikan target atau -> masuk dalam list possible_targets
feature_target = possible_targets[0] if possible_targets else None #jika ada maka diambil indeks 0, jika tidak ada maka None
# jika kolom tidak mengandung diagnosis atau label maka masuk ke dalam list feature_variables
feature_variables = [col for col in df.columns if col != feature_target]

print("\nFitur Target:", feature_target)
print("Fitur Variabel Bebas:", feature_variables)


Fitur Target: Diagnosis
Fitur Variabel Bebas: ['Patient_ID', 'Age', 'Gender', 'Education_Level', 'Marital_Status', 'Occupation', 'Income_Level', 'Living_Area', 'Disease_Duration', 'Hospitalizations', 'Family_History', 'Substance_Use', 'Suicide_Attempt', 'Positive_Symptom_Score', 'Negative_Symptom_Score', 'GAF_Score', 'Social_Support', 'Stress_Factors', 'Medication_Adherence']


**3. Rentang nilai setiap fitur**

In [39]:
print("\nRentang Nilai Fitur:")
print(df.describe())


Rentang Nilai Fitur:
        Patient_ID           Age        Gender  Education_Level  \
count  10000.00000  10000.000000  10000.000000     10000.000000   
mean    5000.50000     48.867700      0.502600         3.042600   
std     2886.89568     18.215054      0.500018         1.423021   
min        1.00000     18.000000      0.000000         1.000000   
25%     2500.75000     33.000000      0.000000         2.000000   
50%     5000.50000     49.000000      1.000000         3.000000   
75%     7500.25000     65.000000      1.000000         4.000000   
max    10000.00000     80.000000      1.000000         5.000000   

       Marital_Status    Occupation  Income_Level   Living_Area     Diagnosis  \
count    10000.000000  10000.000000  10000.000000  10000.000000  10000.000000   
mean         1.518200      1.510600      0.996900      0.499400      0.288700   
std          1.117315      1.114097      0.817531      0.500025      0.453181   
min          0.000000      0.000000      0.000000 

**4. Ada tidaknya data null**

In [40]:
# cek data null
print("\nJumlah Data Null per Fitur:")
print(df.isnull().sum()) #menghitung jumlah data null per fitur


Jumlah Data Null per Fitur:
Patient_ID                0
Age                       0
Gender                    0
Education_Level           0
Marital_Status            0
Occupation                0
Income_Level              0
Living_Area               0
Diagnosis                 0
Disease_Duration          0
Hospitalizations          0
Family_History            0
Substance_Use             0
Suicide_Attempt           0
Positive_Symptom_Score    0
Negative_Symptom_Score    0
GAF_Score                 0
Social_Support            0
Stress_Factors            0
Medication_Adherence      0
dtype: int64


**5. Ada tidaknya data yang tidak konsisten (duplikat)**

In [41]:
# cek data duplikat
print("\nJumlah Data Duplikat:", df.duplicated().sum())


Jumlah Data Duplikat: 0


**6. Perlu tidaknya transformasi data**

In [42]:
# cek jika data bertipe objek, maka dimasukan ke dalam list categorical_features
categorical_features = [col for col in df.columns if df[col].dtype == 'object']
# cek jika data bertipe int64 atau float64, maka dimasukan ke dalam list numerical_features
numerical_features = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]

print("\nFitur Kategorikal:", categorical_features)
print("Fitur Numerik:", numerical_features)


Fitur Kategorikal: []
Fitur Numerik: ['Patient_ID', 'Age', 'Gender', 'Education_Level', 'Marital_Status', 'Occupation', 'Income_Level', 'Living_Area', 'Diagnosis', 'Disease_Duration', 'Hospitalizations', 'Family_History', 'Substance_Use', 'Suicide_Attempt', 'Positive_Symptom_Score', 'Negative_Symptom_Score', 'GAF_Score', 'Social_Support', 'Stress_Factors', 'Medication_Adherence']


**7. Perlu tidaknya normalisasi data**

In [48]:
# variabel menampung normalisasi data
scaler = StandardScaler()
# menyalin data
df_normalized = df.copy()
# menentukan skala normalisasi dan ubah data dalam rentang 0-1
df_normalized[numerical_features] = scaler.fit_transform(df[numerical_features])

print("\nData setelah Normalisasi (contoh 5 baris pertama):")
print(df_normalized.head())


Data setelah Normalisasi (contoh 5 baris pertama):
   Patient_ID       Age    Gender  Education_Level  Marital_Status  \
0   -1.731878  1.270019  0.994813         0.672828        0.431234   
1   -1.731531  0.007264  0.994813         1.375593        0.431234   
2   -1.731185  0.226873  0.994813         1.375593        1.326281   
3   -1.730838  0.995507  0.994813        -0.029938        0.431234   
4   -1.730492  0.281776 -1.005214        -1.435469        0.431234   

   Occupation  Income_Level  Living_Area  Diagnosis  Disease_Duration  \
0   -1.355964      1.227048     1.001201  -0.637084         -0.529447   
1    0.439302      0.003792    -0.998801   1.569651          2.633320   
2    0.439302      0.003792    -0.998801   1.569651          2.362225   
3   -1.355964      1.227048    -0.998801  -0.637084         -0.529447   
4   -1.355964      1.227048     1.001201  -0.637084         -0.529447   

   Hospitalizations  Family_History  Substance_Use  Suicide_Attempt  \
0         -0.5100

In [51]:
print(numerical_features.describe())

        Patient_ID           Age        Gender  Education_Level  \
count  10000.00000  10000.000000  10000.000000     10000.000000   
mean    5000.50000     48.867700      0.502600         3.042600   
std     2886.89568     18.215054      0.500018         1.423021   
min        1.00000     18.000000      0.000000         1.000000   
25%     2500.75000     33.000000      0.000000         2.000000   
50%     5000.50000     49.000000      1.000000         3.000000   
75%     7500.25000     65.000000      1.000000         4.000000   
max    10000.00000     80.000000      1.000000         5.000000   

       Marital_Status    Occupation  Income_Level   Living_Area     Diagnosis  \
count    10000.000000  10000.000000  10000.000000  10000.000000  10000.000000   
mean         1.518200      1.510600      0.996900      0.499400      0.288700   
std          1.117315      1.114097      0.817531      0.500025      0.453181   
min          0.000000      0.000000      0.000000      0.000000      0.0

**8. Apakah ada outlier (pencilan)**

In [44]:
# hitung kuartil 1 dari data numerik
Q1 = df[numerical_features].quantile(0.25)
# hitung kuartil 3 dari data numerik
Q3 = df[numerical_features].quantile(0.75)
# hitung Interquartile Range (IQR)
IQR = Q3 - Q1
# hitung jumlah outlier per fitur 
outliers = ((df[numerical_features] < (Q1 - 1.5 * IQR)) | #jika nilai lbh rendah dari batas bawah(outlier rendah)
            (df[numerical_features] > (Q3 + 1.5 * IQR))) #jika nilai lbh besar dari batas atas(outlier tinggi)

print("\nJumlah Outlier per Fitur:")
print(outliers.sum()) # total outlier per fitur


Jumlah Outlier per Fitur:
Patient_ID                   0
Age                          0
Gender                       0
Education_Level              0
Marital_Status               0
Occupation                   0
Income_Level                 0
Living_Area                  0
Diagnosis                    0
Disease_Duration          1963
Hospitalizations          2100
Family_History               0
Substance_Use             2177
Suicide_Attempt            884
Positive_Symptom_Score       0
Negative_Symptom_Score       0
GAF_Score                    0
Social_Support               0
Stress_Factors               0
Medication_Adherence         0
dtype: int64


**9. Jika menggunakan data tidak terstruktur jelaskan teknik ekstraksi fitur yang digunakan.**

tidak ada teknik ekstraksi fitur karna tidak ada data teks yang digunakan