In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv("Student Mental health.csv")
df.head()

Unnamed: 0,Timestamp,Choose your gender,Age,What is your course?,Your current year of Study,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?
0,8/7/2020 12:02,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,8/7/2020 12:04,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,8/7/2020 12:05,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,8/7/2020 12:06,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,8/7/2020 12:13,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No


In [4]:
# Cek informasi umum dataset
df.info()

# Lihat 5 data pertama
df.head()

# Statistik deskriptif untuk kolom numerik
df.describe()

# Jumlah missing value tiap kolom
print("\nMissing Value per Kolom:")
print(df.isnull().sum())

# Jumlah data duplikat
print("\nJumlah data duplikat:", df.duplicated().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 11 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Timestamp                                     101 non-null    object 
 1   Choose your gender                            101 non-null    object 
 2   Age                                           100 non-null    float64
 3   What is your course?                          101 non-null    object 
 4   Your current year of Study                    101 non-null    object 
 5   What is your CGPA?                            101 non-null    object 
 6   Marital status                                101 non-null    object 
 7   Do you have Depression?                       101 non-null    object 
 8   Do you have Anxiety?                          101 non-null    object 
 9   Do you have Panic attack?                     101 non-null    obj

In [None]:
# ===========================================================
# Langkah 1: Rename kolom biar sesuai laporan dan ringkas
# ===========================================================
df.rename(columns={
    'Choose your gender': 'Gender',
    'What is your course?': 'Course',
    'Your current year of Study': 'Year of Study',
    'What is your CGPA?': 'CGPA',
    'Do you have Depression?': 'Depression',
    'Do you have Anxiety?': 'Anxiety',
    'Do you have Panic attack?': 'Panic attack',
    'Did you seek any specialist for a treatment?': 'Seek help'
}, inplace=True)

# ===========================================================
# Langkah 2: Bersihin teks angka & ubah tipe data
# ===========================================================
# Bersihin koma jadi titik (3,5 ‚Üí 3.5)
df['Age'] = df['Age'].astype(str).str.replace(',', '.')
df['CGPA'] = df['CGPA'].astype(str).str.replace(',', '.')

# Ubah ke numerik, invalid jadi NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['CGPA'] = pd.to_numeric(df['CGPA'], errors='coerce')

# ===========================================================
# Langkah 3: Hapus duplikat
# ===========================================================
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(f"üßπ Jumlah data duplikat yang dihapus: {before - after}")

# ===========================================================
# Langkah 4: Isi missing value (versi aman)
# ===========================================================
num_cols = ['Age', 'CGPA']
cat_cols = ['Gender', 'Course', 'Year of Study', 'Marital status',
            'Anxiety', 'Panic attack', 'Seek help']

for col in num_cols:
    median_val = df[col].median()
    if pd.notna(median_val):  # hanya isi kalau median valid
        df[col] = df[col].fillna(median_val)
        print(f"‚û°Ô∏è Nilai kosong di kolom {col} diisi median: {median_val}")
    else:
        print(f"‚ö†Ô∏è Kolom {col} median tidak valid (semua NaN), dilewati")

for col in cat_cols:
    mode_series = df[col].mode()
    if not mode_series.empty:  # hanya isi kalau ada modus
        mode_val = mode_series[0]
        df[col] = df[col].fillna(mode_val)
        print(f"‚û°Ô∏è Nilai kosong di kolom {col} diisi modus: {mode_val}")
    else:
        print(f"‚ö†Ô∏è Kolom {col} tidak punya modus valid, dilewati")

# ===========================================================
# Langkah 5: Deteksi & hapus outlier (IQR)
# ===========================================================
import numpy as np

for col in num_cols:
    if df[col].notna().sum() == 0:
        print(f"‚ö†Ô∏è Kolom {col} kosong semua, dilewati saat cek outlier")
        continue
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    outliers = df[(df[col] < lower) | (df[col] > upper)]
    print(f"‚ö†Ô∏è Kolom {col}: {len(outliers)} outlier ditemukan")

    # Hapus outlier ekstrem
    df = df[(df[col] >= lower) & (df[col] <= upper)]

# ===========================================================
# Langkah 6: Cek hasil akhir
# ===========================================================
print("\n‚úÖ Jumlah data akhir:", df.shape[0])
print("\nüîç Missing value terakhir:")
print(df.isnull().sum())


‚úÖ Kolom setelah rename:
['Timestamp', 'Gender', 'Age', 'Course', 'Year of Study', 'CGPA', 'Marital status', 'Depression', 'Anxiety', 'Panic attack', 'Seek help']

üìä Tipe data setelah konversi:
Timestamp          object
Gender             object
Age               float64
Course             object
Year of Study      object
CGPA              float64
Marital status     object
Depression         object
Anxiety            object
Panic attack       object
Seek help          object
dtype: object

üßπ Jumlah data duplikat yang dihapus: 0
‚û°Ô∏è Nilai kosong di kolom Age diisi median: nan
‚û°Ô∏è Nilai kosong di kolom CGPA diisi median: nan


KeyError: 0

In [6]:
print(df.columns.tolist())


['Timestamp', 'Choose your gender', 'Age', 'What is your course?', 'Your current year of Study', 'What is your CGPA?', 'Marital status', 'Do you have Depression?', 'Do you have Anxiety?', 'Do you have Panic attack?', 'Did you seek any specialist for a treatment?']
