In [8]:
# ============================================================
# Preprocessing Data - Titanic Dataset
# File: preprocessing.ipynb
# ============================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

#1️ Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

#2️ Load dataset Titanic dari folder di Google Drive
df = pd.read_csv('/content/drive/MyDrive/DataMining-Tugas3/Tugas3/Titanic-Dataset.csv')

print("=== 5 Data Teratas ===")
print(df.head())

#3️ Cek struktur dan missing values
print("\n=== Info Dataset ===")
print(df.info())

print("\n=== Missing Values ===")
print(df.isnull().sum())

#4️ Pilih kolom penting saja (ubah jika nama kolom berbeda di file)
selected_columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
df = df[selected_columns]

#5️ Tangani missing values
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

#6️ Encoding kolom kategorikal
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])            # male=1, female=0
df['Embarked'] = le.fit_transform(df['Embarked'])  # C,Q,S → 0,1,2

#7️ Normalisasi kolom numerik
scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

#8️ Pisahkan fitur (X) dan target (y)
X = df.drop('Survived', axis=1)
y = df['Survived']

#9️ Split data train dan test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#10 Tampilkan hasil akhir
print("\n=== Data Setelah Preprocessing ===")
print(df.head())

print("\n=== Data Train Shape ===", X_train.shape)
print("=== Data Test Shape ===", X_test.shape)

# ============================================================
# Upload "preprocessing.ipynb" dari Google Colab ke GitHub
# ============================================================

import os
from getpass import getpass

# 1️⃣ Masukkan token GitHub
token = getpass('Masukkan GitHub Token (PAT): ')

# 2️⃣ Bersihkan folder lama agar tidak dobel
!rm -rf /content/Preprosessing_TitanicDataset

# 3️⃣ Konfigurasi Git
!git config --global user.email "aldiosebastian9@gmail.com"
!git config --global user.name "Aldiosebastiaan"

# 4️⃣ Clone repo
os.environ['GITHUB_TOKEN'] = token
os.environ['GITHUB_USER'] = "Aldiosebastiaan"
os.environ['GITHUB_REPO'] = "Preprosessing_TitanicDataset"

!git clone https://{os.environ['GITHUB_TOKEN']}@github.com/{os.environ['GITHUB_USER']}/{os.environ['GITHUB_REPO']}.git
%cd /content/Preprosessing_TitanicDataset

# 5️⃣ Cek isi folder /content untuk memastikan file ada
!ls /content

# 6️⃣ Copy file notebook (pastikan memang ada di /content)
!cp "/content/preprocessing.ipynb" .

# 7️⃣ Commit dan push pertama kali
!git add .
!git commit -m "Upload preprocessing.ipynb dari Google Colab"
!git branch -M main
!git push -u origin main





The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
=== 5 Data Teratas ===
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0 