In [8]:
import pandas as pd
from sklearn.impute import SimpleImputer  # Ensure SimpleImputer is imported

In [1]:
# Re-import necessary library after environment reset
import pandas as pd

# Location of the Titanic training CSV.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on a machine where this exact file exists.
file_path = '/Users/manuel/Desktop/DI-Bootcamp/Week4_PreProcessing_Data/D4/ExerciseXP/Titanic/train.csv'

# Read the raw passenger table and preview its first five rows as plain text.
titanic_df = pd.read_csv(file_path)
print(titanic_df.head(n=5))

# Remember the raw row count so it can be compared with the count after
# duplicate removal.
rows_before = titanic_df.shape[0]




   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


(891, 891)

In [5]:
# Boolean Series: True for every row that repeats an earlier row across all columns.
duplicates = titanic_df.duplicated()

# New DataFrame keeping only the first occurrence of each distinct row.
titanic_df_cleaned = titanic_df.drop_duplicates()

# Row count once duplicates are gone, echoed to stdout.
rows_after = titanic_df_cleaned.shape[0]
print(rows_after)

# Cell result: (rows before, rows after). Equal values mean nothing was dropped.
(rows_before, rows_after)

891


(891, 891)

In [11]:
# Exercise 2: Handling Missing Values

# Step 1: count the missing entries in every column.
missing_values = titanic_df.isna().sum()

# Step 2: three alternative treatments. Each one produces its own DataFrame,
# so the raw `titanic_df` is left untouched.

# Strategy 1 (drop): keep only the passengers whose 'Cabin' is recorded.
titanic_dropna = titanic_df.dropna(subset=['Cabin'])

# Strategy 2 (statistic fill): replace missing 'Age' values with the column median.
age_median = titanic_df['Age'].median()
titanic_fillna_age = titanic_df.assign(Age=titanic_df['Age'].fillna(age_median))

# Strategy 3 (SimpleImputer): fill missing 'Embarked' values with the most
# frequent value of that column.
imputer = SimpleImputer(strategy='most_frequent')
titanic_imputed_embarked = titanic_df.copy()
embarked_filled = imputer.fit_transform(titanic_imputed_embarked[['Embarked']])
# The imputer is given a one-column frame; flatten its result so it can be
# stored back as a single column.
titanic_imputed_embarked['Embarked'] = embarked_filled.ravel()

# Cell result: per-column missing counts, the shape left after the drop, and
# the number of values still missing in the filled 'Age' and 'Embarked' columns.
(
    missing_values,
    titanic_dropna.shape,
    titanic_fillna_age['Age'].isna().sum(),
    titanic_imputed_embarked['Embarked'].isna().sum(),
)

(PassengerId      0
 Survived         0
 Pclass           0
 Name             0
 Sex              0
 Age            177
 SibSp            0
 Parch            0
 Ticket           0
 Fare             0
 Cabin          687
 Embarked         2
 dtype: int64,
 (204, 12),
 np.int64(0),
 np.int64(0))

In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load a fresh copy of the Titanic dataset for this exercise.
file_path = '/Users/manuel/Desktop/DI-Bootcamp/Week4_PreProcessing_Data/D4/ExerciseXP/Titanic/train.csv'
titanic_df = pd.read_csv(file_path)

# Exercise 3: Feature Engineering


def _title_from_name(name):
    """Return the text between the first ', ' and the following '.' in a name.

    For example, 'Braund, Mr. Owen Harris' yields 'Mr'.
    """
    after_surname = name.split(', ')[1]
    return after_surname.split('.')[0]


# Step 1: derive new features.

# FamilySize = siblings/spouses aboard + parents/children aboard + the passenger.
titanic_df['FamilySize'] = titanic_df['SibSp'].add(titanic_df['Parch']).add(1)

# Title = honorific embedded in the 'Name' column.
titanic_df['Title'] = titanic_df['Name'].apply(_title_from_name)

# Step 2: turn categorical columns into numbers.

# 'Sex' becomes an integer code (the printed head shows female -> 0, male -> 1).
label_encoder = LabelEncoder()
sex_codes = label_encoder.fit_transform(titanic_df['Sex'])
titanic_df['Sex_encoded'] = sex_codes

# 'Embarked' becomes one indicator column per value; the original column is replaced.
titanic_df = pd.get_dummies(titanic_df, columns=['Embarked'], prefix='Embarked')

# Step 3: standardize the numeric features 'Age' and 'Fare' into new columns.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(titanic_df[['Age', 'Fare']])
titanic_df[['Age_scaled', 'Fare_scaled']] = scaled_values

# Show the first rows of the enriched DataFrame.
print(titanic_df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin  FamilySize Title  Sex_encoded  \
0      0         A/5 21171   7.2500   NaN           2    Mr            1   
1      0          PC 17599  71.2833   C85           2   Mrs            0   
2      0  STON/O2. 3101282   7.9250   NaN           1  Miss            0   
3      0        

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load a fresh copy of the Titanic dataset so the capping below always starts
# from the raw values, which keeps this cell safe to re-run.
file_path = '/Users/manuel/Desktop/DI-Bootcamp/Week4_PreProcessing_Data/D4/ExerciseXP/Titanic/train.csv'
titanic_df = pd.read_csv(file_path)

# Exercise 4: Outlier Detection and Handling


def iqr_summary(values, whisker=1.5):
    """Compute the quartiles and IQR outlier fences of a numeric Series.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to summarise.
    whisker : float, default 1.5
        Multiple of the IQR added above Q3 and subtracted below Q1 to form
        the fences.

    Returns
    -------
    tuple
        ``(q1, q3, iqr, lower_bound, upper_bound)``.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - whisker * iqr, q3 + whisker * iqr


def outlier_rows(frame, column, lower, upper):
    """Return the rows of ``frame`` whose ``column`` lies strictly outside [lower, upper]."""
    # Two explicit comparisons rather than ~between(): NaN compares False on
    # both sides, so rows with a missing value are not reported as outliers.
    outside = (frame[column] < lower) | (frame[column] > upper)
    return frame[outside]


# Step 1: detect outliers with the IQR rule for 'Fare' and 'Age'.
Q1_fare, Q3_fare, IQR_fare, fare_lower_bound, fare_upper_bound = iqr_summary(titanic_df['Fare'])
Q1_age, Q3_age, IQR_age, age_lower_bound, age_upper_bound = iqr_summary(titanic_df['Age'])

fare_outliers = outlier_rows(titanic_df, 'Fare', fare_lower_bound, fare_upper_bound)
age_outliers = outlier_rows(titanic_df, 'Age', age_lower_bound, age_upper_bound)

# Step 2: handle the outliers by capping them at the fences.
# Series.clip replaces the nested np.where calls; missing values stay missing.
titanic_df['Fare'] = titanic_df['Fare'].clip(lower=fare_lower_bound, upper=fare_upper_bound)
titanic_df['Age'] = titanic_df['Age'].clip(lower=age_lower_bound, upper=age_upper_bound)

# Step 3: assess the impact. After capping, no value should remain outside the fences.
fare_after_outliers = outlier_rows(titanic_df, 'Fare', fare_lower_bound, fare_upper_bound)
age_after_outliers = outlier_rows(titanic_df, 'Age', age_lower_bound, age_upper_bound)

# Cell result: (outlier counts before capping), (outlier counts after capping).
(len(fare_outliers), len(age_outliers)), (len(fare_after_outliers), len(age_after_outliers))


((116, 11), (0, 0))

In [15]:
# TODO(review): flagged "CHECK" by the author; confirm the scaling results in this cell.
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load a fresh copy of the Titanic dataset for this exercise.
file_path = '/Users/manuel/Desktop/DI-Bootcamp/Week4_PreProcessing_Data/D4/ExerciseXP/Titanic/train.csv'
titanic_df = pd.read_csv(file_path)

# Exercise 5: Data Standardization and Normalization

# Step 1: the numeric columns to rescale, and the names of the derived columns.
numerical_cols = ['Age', 'Fare']
standardized_cols = ['Age_standardized', 'Fare_standardized']
normalized_cols = ['Age_normalized', 'Fare_normalized']

# Step 2: standardization with StandardScaler, stored in new columns.
scaler_standard = StandardScaler()
standardized_values = scaler_standard.fit_transform(titanic_df[numerical_cols])
titanic_df[standardized_cols] = standardized_values

# Step 3: normalization to the bounded range [0, 1] with MinMaxScaler,
# stored in new columns.
scaler_minmax = MinMaxScaler()
normalized_values = scaler_minmax.fit_transform(titanic_df[numerical_cols])
titanic_df[normalized_cols] = normalized_values

# Show the original values next to both rescaled versions.
print(titanic_df[numerical_cols + standardized_cols + normalized_cols].head())


    Age     Fare  Age_standardized  Fare_standardized  Age_normalized  \
0  22.0   7.2500         -0.530377          -0.502445        0.271174   
1  38.0  71.2833          0.571831           0.786845        0.472229   
2  26.0   7.9250         -0.254825          -0.488854        0.321438   
3  35.0  53.1000          0.365167           0.420730        0.434531   
4  35.0   8.0500          0.365167          -0.486337        0.434531   

   Fare_normalized  
0         0.014151  
1         0.139136  
2         0.015469  
3         0.103644  
4         0.015713  


In [16]:
# Imported here so this cell does not depend on an earlier cell having
# brought LabelEncoder into the kernel namespace.
from sklearn.preprocessing import LabelEncoder

# Categorical columns handled in this cell (kept for reference; not read below).
categorical_columns = ['Sex', 'Embarked']

# Step 2: use one-hot encoding for nominal variables.
# 'Embarked' is a nominal variable. The guard makes the cell safe to re-run:
# after the first run 'Embarked' has been replaced by its indicator columns,
# and asking get_dummies for it again would raise a KeyError.
if 'Embarked' in titanic_df.columns:
    titanic_df = pd.get_dummies(titanic_df, columns=['Embarked'], prefix='Embarked')

# Step 3: integer-encode the binary 'Sex' column.
# 'Sex' is nominal, not ordinal. With only two categories a single integer
# code is equivalent to one indicator column, so no ordering is implied.
label_encoder = LabelEncoder()
titanic_df['Sex_encoded'] = label_encoder.fit_transform(titanic_df['Sex'])

# Display the modified DataFrame with encoded features.
print(titanic_df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin  Age_standardized  \
0      0         A/5 21171   7.2500   NaN         -0.530377   
1      0          PC 17599  71.2833   C85          0.571831   
2      0  STON/O2. 3101282   7.9250   NaN         -0.254825   
3      0            113803  53.1000  C123          0.365167   
4    

In [17]:
import pandas as pd

# Load a fresh copy of the Titanic dataset for this exercise.
file_path = '/Users/manuel/Desktop/DI-Bootcamp/Week4_PreProcessing_Data/D4/ExerciseXP/Titanic/train.csv'
titanic_df = pd.read_csv(file_path)

# Exercise 7: Data Transformation for Age Feature

# Step 1: bin edges and one label per interval between consecutive edges.
# The printed head shows an age of exactly 35 labelled 'Young Adult', i.e. a
# value equal to an edge falls into the lower bin.
age_bins = [0, 12, 18, 35, 60, 100]
age_labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']

# Categorize 'Age' into the labelled bins.
age_group = pd.cut(titanic_df['Age'], bins=age_bins, labels=age_labels)
titanic_df['AgeGroup'] = age_group

# Step 2: one indicator column per age group, appended to the main dataset.
age_dummies = pd.get_dummies(titanic_df['AgeGroup'], prefix='AgeGroup')
titanic_df = pd.concat([titanic_df, age_dummies], axis=1)

# Show the age, its group, and the indicator columns side by side.
columns_to_show = ['Age', 'AgeGroup', *age_dummies.columns]
print(titanic_df[columns_to_show].head())


    Age     AgeGroup  AgeGroup_Child  AgeGroup_Teenager  AgeGroup_Young Adult  \
0  22.0  Young Adult           False              False                  True   
1  38.0        Adult           False              False                 False   
2  26.0  Young Adult           False              False                  True   
3  35.0  Young Adult           False              False                  True   
4  35.0  Young Adult           False              False                  True   

   AgeGroup_Adult  AgeGroup_Senior  
0           False            False  
1            True            False  
2           False            False  
3           False            False  
4           False            False  
