In [1]:
import pandas as pd

# Read the cleaned Titanic data from the CSV (path is relative to the working directory).
df = pd.read_csv("Titanic_Cleaned.csv")

# Confirm the load and summarise the frame: its shape, then its column names.
print(
    "✓ Cleaned data loaded",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

✓ Cleaned data loaded
Shape: (1307, 8)
Columns: ['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']


In [2]:
# Report the dtype pandas inferred for every column, under a heading line.
print("Data types:", df.dtypes, sep="\n")

Data types:
pclass        int64
survived      int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
dtype: object


In [3]:
# Inspect the two text columns before encoding them: list the distinct labels
# in each and how many there are.

# 'sex'
print(
    "Unique values in 'sex':",
    df['sex'].unique(),
    f"Count: {df['sex'].nunique()}",
    sep="\n",
)

# 'embarked' (the heading starts with a newline to separate it from the block above)
print(
    "\nUnique values in 'embarked':",
    df['embarked'].unique(),
    f"Count: {df['embarked'].nunique()}",
    sep="\n",
)

Unique values in 'sex':
['female' 'male']
Count: 2

Unique values in 'embarked':
['S' 'C' 'Q']
Count: 3


In [4]:
# Preview the raw 'sex' labels next to 'age' and 'survived' before any encoding.
print(
    "Before encoding:",
    df.loc[:, ['sex', 'age', 'survived']].head(5),
    sep="\n",
)

Before encoding:
      sex    age  survived
0  female  29.00         1
1    male   0.92         1
2  female   2.00         0
3    male  30.00         0
4  female  25.00         0


In [5]:
# One-hot encode 'sex' with pd.get_dummies(). drop_first=True drops the indicator
# for the first category, so the two labels collapse into a single 'sex_male' column.
#
# The result is assigned back to `df`, which removes the 'sex' column. The guard
# keeps the cell re-runnable: on a second run 'sex' is already gone and an
# unguarded call would raise KeyError. If neither 'sex' nor an existing 'sex_*'
# dummy column is present, the input really is wrong, so fail loudly.
if 'sex' in df.columns:
    df = pd.get_dummies(df, columns=['sex'], drop_first=True)
elif not df.columns.str.startswith('sex_').any():
    raise KeyError("'sex' column not found and no 'sex_*' dummy columns present")

print("✓ 'sex' converted to dummy variables!")
print(f"New columns: {df.columns.tolist()}")

✓ 'sex' converted to dummy variables!
New columns: ['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'sex_male']


In [6]:
# Preview the first ten rows of the new 'sex_male' indicator next to 'age' and 'survived'.
print(
    "After encoding:",
    df.loc[:, ['sex_male', 'age', 'survived']].iloc[:10],
    sep="\n",
)

After encoding:
   sex_male    age  survived
0     False  29.00         1
1      True   0.92         1
2     False   2.00         0
3      True  30.00         0
4     False  25.00         0
5      True  48.00         1
6     False  63.00         1
7      True  39.00         0
8     False  53.00         1
9      True  71.00         0


In [7]:
# Preview the raw 'embarked' labels next to 'fare' and 'survived' before encoding.
print(
    "Before encoding:",
    df.loc[:, ['embarked', 'fare', 'survived']].head(5),
    sep="\n",
)

Before encoding:
  embarked      fare  survived
0        S  211.3375         1
1        S  151.5500         1
2        S  151.5500         0
3        S  151.5500         0
4        S  151.5500         0


In [8]:
# One-hot encode 'embarked' with pd.get_dummies(). drop_first=True drops the
# indicator for the first category, so k port labels become k-1 'embarked_*' columns.
#
# The result is assigned back to `df`, which removes the 'embarked' column. The
# guard keeps the cell re-runnable: on a second run 'embarked' is already gone and
# an unguarded call would raise KeyError. If neither 'embarked' nor an existing
# 'embarked_*' dummy column is present, the input really is wrong, so fail loudly.
if 'embarked' in df.columns:
    df = pd.get_dummies(df, columns=['embarked'], drop_first=True)
elif not df.columns.str.startswith('embarked_').any():
    raise KeyError("'embarked' column not found and no 'embarked_*' dummy columns present")

print("✓ 'embarked' converted to dummy variables!")
print(f"New columns: {df.columns.tolist()}")

✓ 'embarked' converted to dummy variables!
New columns: ['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_male', 'embarked_Q', 'embarked_S']


In [9]:
# Preview the first ten rows of the 'embarked_*' indicators next to 'fare' and 'survived'.
print(
    "After encoding:",
    df.loc[:, ['embarked_Q', 'embarked_S', 'fare', 'survived']].iloc[:10],
    sep="\n",
)

After encoding:
   embarked_Q  embarked_S      fare  survived
0       False        True  211.3375         1
1       False        True  151.5500         1
2       False        True  151.5500         0
3       False        True  151.5500         0
4       False        True  151.5500         0
5       False        True   26.5500         1
6       False        True   77.9583         1
7       False        True    0.0000         0
8       False        True   51.4792         1
9       False       False   49.5042         0


In [10]:
# Sex breakdown: value counts of the 'sex_male' indicator, then explicit totals.
# Rows where the indicator equals 0 (False) are counted as females.
print(
    "Sex distribution:",
    df['sex_male'].value_counts(),
    f"\nMales: {df['sex_male'].sum()}",
    f"Females: {(df['sex_male'] == 0).sum()}",
    sep="\n",
)

# Embarkation breakdown. Only Q and S have indicator columns, so a row where
# both indicators equal 0 is counted as port C.
print(
    "Embarked distribution:",
    f"Embarked at Q: {df['embarked_Q'].sum()}",
    f"Embarked at S: {df['embarked_S'].sum()}",
    f"Embarked at C: {((df['embarked_Q'] == 0) & (df['embarked_S'] == 0)).sum()}",
    sep="\n",
)

Sex distribution:
sex_male
True     843
False    464
Name: count, dtype: int64

Males: 843
Females: 464
Embarked distribution:
Embarked at Q: 123
Embarked at S: 914
Embarked at C: 270


In [11]:
# Summarise the encoded frame: index range, per-column non-null counts, dtypes
# and memory usage, followed by the final list of column names.
print("Final dataset after encoding:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(f"\nColumns: {df.columns.tolist()}")

Final dataset after encoding:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1307 entries, 0 to 1306
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pclass      1307 non-null   int64  
 1   survived    1307 non-null   int64  
 2   age         1307 non-null   float64
 3   sibsp       1307 non-null   int64  
 4   parch       1307 non-null   int64  
 5   fare        1307 non-null   float64
 6   sex_male    1307 non-null   bool   
 7   embarked_Q  1307 non-null   bool   
 8   embarked_S  1307 non-null   bool   
dtypes: bool(3), float64(2), int64(4)
memory usage: 65.2 KB
None

Columns: ['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_male', 'embarked_Q', 'embarked_S']


In [12]:
# Feature matrix X: every column of df except the 'survived' target.
X = df.drop(columns='survived')

# Confirm what ended up in X.
print(
    "✓ X (features) created!",
    f"X shape: {X.shape}",
    f"X columns: {X.columns.tolist()}",
    sep="\n",
)

✓ X (features) created!
X shape: (1307, 8)
X columns: ['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_male', 'embarked_Q', 'embarked_S']


In [13]:
# Target vector y: the 'survived' column on its own (single-label selection
# yields a Series rather than a DataFrame).
y = df['survived']

# Confirm the target's shape and container type.
print(
    "✓ y (target) created!",
    f"y shape: {y.shape}",
    f"y type: {type(y)}",
    sep="\n",
)

✓ y (target) created!
y shape: (1307,)
y type: <class 'pandas.core.series.Series'>


In [14]:
# Sanity-check the split: the first five feature rows, then the first ten
# target values as a plain list.
print(
    "First 5 rows of X (features):",
    X.iloc[:5],
    "\nFirst 10 values of y (target):",
    y.iloc[:10].tolist(),
    sep="\n",
)

First 5 rows of X (features):
   pclass    age  sibsp  parch      fare  sex_male  embarked_Q  embarked_S
0       1  29.00      0      0  211.3375     False       False        True
1       1   0.92      1      2  151.5500      True       False        True
2       1   2.00      1      2  151.5500     False       False        True
3       1  30.00      1      2  151.5500      True       False        True
4       1  25.00      1      2  151.5500     False       False        True

First 10 values of y (target):
[1, 1, 0, 0, 0, 1, 1, 0, 1, 0]


In [15]:
# Write the features and the target to separate CSV files, both without the
# row index, then confirm.
for table, filename in (
    (X, "Titanic_X_features.csv"),
    (y, "Titanic_y_target.csv"),
):
    table.to_csv(filename, index=False)

print("✓ X and y saved!")

✓ X and y saved!
