In [1]:
import pandas as pd

# Load the Titanic CSV file you just uploaded
df = pd.read_csv('train.csv')

# Show the first 5 rows
print(df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [2]:
# Show summary of missing values in each column
print(df.isnull().sum())


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [3]:
# Fill Age missing values with the median
df["Age"].fillna(df["Age"].median(), inplace=True)

# Drop Cabin column
df.drop("Cabin", axis=1, inplace=True)

# Fill Embarked missing values with the mode
df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)

# Confirm no more missing values
print(df.isnull().sum())


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)


In [4]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
le = LabelEncoder()

# Encode 'Sex' column
df["Sex"] = le.fit_transform(df["Sex"])

# Encode 'Embarked' column
df["Embarked"] = le.fit_transform(df["Embarked"])

# Display first 5 rows to check
print(df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    1  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1      0   
2                             Heikkinen, Miss. Laina    0  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1      0   
4                           Allen, Mr. William Henry    1  35.0      0      0   

             Ticket     Fare  Embarked  
0         A/5 21171   7.2500         2  
1          PC 17599  71.2833         0  
2  STON/O2. 3101282   7.9250         2  
3            113803  53.1000         2  
4            373450   8.0500         2  


In [5]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Scale 'Age' and 'Fare' columns
df[["Age", "Fare"]] = scaler.fit_transform(df[["Age", "Fare"]])

# Display first 5 rows to check scaling
print(df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex       Age  SibSp  \
0                            Braund, Mr. Owen Harris    1 -0.565736      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  0.663861      1   
2                             Heikkinen, Miss. Laina    0 -0.258337      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  0.433312      1   
4                           Allen, Mr. William Henry    1  0.433312      0   

   Parch            Ticket      Fare  Embarked  
0      0         A/5 21171 -0.502445         2  
1      0          PC 17599  0.786845         0  
2      0  STON/O2. 3101282 -0.488854         2  
3      0            113803  0.420730         2  
4      0            373450 -0.486337         2  


In [6]:
# Add IsMinor column (1 if Age < 18, else 0)
df["IsMinor"] = (df["Age"] < 0).astype(int)

# Display first 5 rows to verify
print(df[["Age", "IsMinor"]].head())


        Age  IsMinor
0 -0.565736        1
1  0.663861        0
2 -0.258337        1
3  0.433312        0
4  0.433312        0


In [7]:
# Save the cleaned and transformed dataset to a CSV file
df.to_csv("processed_titanic.csv", index=False)

print("Processed dataset saved successfully!")


Processed dataset saved successfully!


In [8]:
readme_text = """
# Task 1 – ETL Pipeline (Titanic Dataset)

## Description
This project performs an ETL (Extract, Transform, Load) pipeline on the Titanic dataset from Kaggle.

## Steps Performed
- **Extract:** Loaded the dataset from local Kaggle file
- **Transform:**
  - Handled missing values (Age, Cabin, Embarked)
  - Encoded categorical variables (Sex, Embarked)
  - Scaled numerical features (Age, Fare)
  - Added a derived feature (`IsMinor`)
- **Load:** Saved the processed dataset to `processed_titanic.csv`

## Tools Used
- Python
- Pandas
- Scikit-learn

## Output
The file `processed_titanic.csv` contains the final clean dataset ready for modeling.
"""

# Write README.md file
with open("README.md", "w") as file:
    file.write(readme_text)

print("README.md file created successfully!")


README.md file created successfully!
