  # Assignment
  Use the Titanic dataset from Kaggle

  1️⃣ Explore the dataset

  Summarize numeric and categorical columns

  Identify missing values and unusual data types

  Visualize distributions for numeric features

  2️⃣ Handle missing values

  Decide how to treat missing values for each column

  Justify your approach (mean, median, mode, drop, or another method)

  Compare the dataset before and after imputation

  3️⃣ Detect and handle outliers

  Identify outliers in at least two numeric columns using any method(s) you choose

  Decide whether to remove, cap, or transform outliers, and justify your choice

  Visualize the results

  4️⃣ Encode categorical variables

  Decide which columns are nominal vs. ordinal

  Apply appropriate encoding (One-Hot, Label, or another)

  Explain why you chose this method

  5️⃣ Scale numeric features

  Apply scaling to at least two numeric columns

  Show before-and-after distributions and discuss how scaling affects the data

  6️⃣ Feature engineering

  Create at least one new feature derived from existing data

  Explain why this feature could be informative for modeling

  7️⃣ Feature selection / importance

  Analyze correlations or other relationships between features and the target

  Identify redundant or less informative features

  Explain your reasoning

  8️⃣ Train-Test Split & Data Leakage Awareness

  Split the data into training and test sets

  Explain how you avoided data leakage

  9️⃣ Pipeline

  Create a pipeline that combines at least three preprocessing steps

  Ensure it can be applied to new/unseen data without modifications

# Exploring dataset

In [None]:
import pandas as pd
import numpy as np

# Load the Titanic training data from the local CSV file.
df = pd.read_csv('train.csv')
# Bare expression: the notebook renders the DataFrame as the cell output.
df

# Numerical and categorical values

In [None]:
# Classify every column as numerical or categorical based on its dtype.
# select_dtypes('number') recognises every numeric width (int32, float32,
# nullable integers, ...) instead of only the literal 'int64'/'float64'.
# NOTE: this is purely dtype-based; integer-coded categories are still
# reported as numerical.
numeric_column_names = set(df.select_dtypes(include='number').columns)
for col in df.columns:
    kind = 'numerical' if col in numeric_column_names else 'categorical'
    # f-string formatting also works when a column label is not a string.
    print(f'{col} is {kind}')

In [None]:
# Dropping rows with missing values

In [None]:
# Keep only the rows in which no column is missing (original df is untouched).
df_dropped = df.dropna(axis=0, how='any')
df_dropped

# Numerical distributions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram (with a KDE overlay) per numeric feature, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

histogram_panels = [('Age', 'skyblue'), ('Fare', 'salmon'), ('SibSp', 'lightgreen')]
for axis, (feature, shade) in zip(axes, histogram_panels):
    sns.histplot(data=df, x=feature, kde=True, ax=axis, color=shade)

plt.show()

# Missing values

In [None]:
# Print the name of every column that holds at least one NaN/null value.

has_missing = df.isnull().any()
for col in has_missing[has_missing].index:
    print(col)

In [None]:
# Age is imputed with the mean: the author judged that there are no extreme
# ages that would distort it (so the median was not needed), and filling
# keeps every row instead of dropping passengers with an unknown age.
age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean)


In [None]:
# Count how often each cabin value occurs, to see whether cabins repeat
# (value_counts ignores missing values by default).
df['Cabin'].value_counts()

In [None]:
# Cabin values repeat, so one cabin can be assigned to several passengers.
# Count how many passengers have no cabin recorded.
df['Cabin'].isnull().sum()

In [None]:
# Replace every missing cabin with the explicit placeholder 'Unknown' so that
# the rows are kept and "no cabin recorded" becomes its own category.
df['Cabin'] = df['Cabin'].fillna('Unknown')

# Show the resulting category counts, now including 'Unknown'.
df['Cabin'].value_counts()

For the Embarked column we keep every passenger: instead of dropping the rows with missing values, we label the missing port of embarkation as 'Unknown', the same way we handled Cabin.

In [None]:
# Before: print the counts explicitly (a notebook only renders the last
# expression of a cell) and include NaN so the gap is visible.
print(df['Embarked'].value_counts(dropna=False))
# Missing ports of embarkation become the explicit category 'Unknown'.
df['Embarked'] = df['Embarked'].fillna('Unknown')
# After: rendered as the cell output.
df['Embarked'].value_counts()

# Outlier

Let`s search for outliers

In [None]:
# Show all column labels, then list the ones stored as int64 or float64.
print(df.columns)
for col in df.columns:
    if df[col].dtype in ('int64', 'float64'):
        print(col)

In [None]:
# Boxplot of SibSp, drawn in the left slot of a 1x2 grid.
plt.figure(figsize=(10, 4))
sibsp_ax = plt.subplot(1, 2, 1)
sns.boxplot(y=df['SibSp'], color="skyblue", ax=sibsp_ax)
sibsp_ax.set_title("SibSp Boxplot")

Something strange here: a few passengers have an unusually high number of siblings/spouses aboard.

In [None]:
# Boxplot of Age, drawn in the left slot of a 1x2 grid.
plt.figure(figsize=(10, 4))
age_ax = plt.subplot(1, 2, 1)
sns.boxplot(y=df['Age'], color="skyblue", ax=age_ax)
age_ax.set_title("Age Boxplot")

In [None]:
# Boxplot of Parch, drawn in the left slot of a 1x2 grid.
plt.figure(figsize=(10, 4))
parch_ax = plt.subplot(1, 2, 1)
sns.boxplot(y=df['Parch'], color="skyblue", ax=parch_ax)
parch_ax.set_title("Parch Boxplot")

In [None]:
# Boxplot of Fare, drawn in the left slot of a 1x2 grid.
plt.figure(figsize=(10, 4))
fare_ax = plt.subplot(1, 2, 1)
sns.boxplot(y=df['Fare'], color="skyblue", ax=fare_ax)
fare_ax.set_title("Fare Boxplot")

Someone paid a lot: a few fares are far above the rest.

In [None]:
# Histograms (with a KDE overlay) of the four numeric features in one row.
fig, axes = plt.subplots(1, 4, figsize=(20, 5))

histogram_panels = [
    ('Age', 'skyblue'),
    ('Fare', 'salmon'),
    ('SibSp', 'lightgreen'),
    ('Parch', 'red'),
]
for axis, (feature, shade) in zip(axes, histogram_panels):
    sns.histplot(data=df, x=feature, kde=True, ax=axis, color=shade)

plt.tight_layout()
plt.show()

We can see that we have some outliers for:

Fare , one value 500 and some between 200 and 300

Parch value 4

SibSp value 3

Let's also use the IQR rule to find the outliers, which is the same rule the boxplots use.

In [None]:
# IQR-based outlier detection for SibSp.

sibsp_values = df['SibSp']
Q1, Q3 = sibsp_values.quantile(0.25), sibsp_values.quantile(0.75)
IQR = Q3 - Q1
print(Q3 ,'-',Q1 ,'=',IQR)
# Values further than 1.5 * IQR outside the quartiles are flagged as outliers.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outside_bounds = (sibsp_values < lower_bound) | (sibsp_values > upper_bound)
outliers = df[outside_bounds]
print("\nDetected outliers in 'SibSp':")
print(outliers['SibSp'],'\n')

print(upper_bound, lower_bound)

In [None]:
# Keep only rows whose SibSp lies within the bounds computed in the previous
# cell (both ends inclusive), then re-draw the boxplot on the filtered data.
df_clean = df[df['SibSp'].between(lower_bound, upper_bound)]
print("\nData after removing outliers:")
plt.figure(figsize=(10, 4))
clean_ax = plt.subplot(1, 2, 1)
sns.boxplot(y=df_clean['SibSp'], color="skyblue", ax=clean_ax)
clean_ax.set_title("SibSp Boxplot")

In [None]:
# Apply the IQR rule to each numeric column in turn. The bounds are always
# computed on the full data (df), while the filtering accumulates in df_clean.
# NOTE(review): when Q1 == Q3 the IQR is 0 and every value different from the
# quartile is flagged - check that this is intended for sparse count columns.
numerical_columns = ['Age', 'Fare', 'SibSp', 'Parch']
df_clean = df
for column in numerical_columns:
    column_values = df[column]
    Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    outliers = df[(column_values < lower_bound) | (column_values > upper_bound)]
    print(f"\nDetected outliers in '{column}':")
    print(outliers[column])
    print(f"\n Upper bound and lower bound for '{column}'")
    print(upper_bound, lower_bound)
    # Inclusive on both ends, like the >= / <= pair it replaces.
    df_clean = df_clean[df_clean[column].between(lower_bound, upper_bound)]
    print("\nData after removing outliers:")
    plt.figure(figsize=(10, 4))
    column_ax = plt.subplot(1, 2, 1)
    sns.boxplot(y=df_clean[column], color="skyblue", ax=column_ax)
    column_ax.set_title(f"{column} Boxplot")


For Fare I don't want to use the IQR upper bound. I only want to remove the most extreme fares (the ones above 500), so I rebuild the cleaned dataset from the original data, keeping only the rows with a fare lower than 263.

In [None]:
# Summary of the IQR-filtered frame before it is replaced, for comparison.
print(df_clean['Fare'].count())
print(df_clean['Fare'].max())
print(df_clean['Fare'].min())
print(df_clean.count())
# Rebuild the cleaned frame from the full data, keeping only fares below 263.
# This replaces (it does not refine) the IQR-filtered frame, so rows removed
# there because of other columns are restored here.
# `.copy()` makes df_clean an independent DataFrame; without it, adding
# columns to df_clean later triggers pandas' SettingWithCopyWarning because
# df_clean would be a slice of df.
# NOTE(review): the strict `<` also drops fares equal to 263 - confirm intended.
df_clean = df[df['Fare'] < 263].copy()
print(df_clean.count())

# Feature scaling

I am going to use Standardization

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd

# Standardise the numeric feature columns (zero mean, unit variance).
# The row identifier and the target are excluded: scaling them carries no
# information and would turn the 0/1 label into arbitrary floats.
numerical_cols = (
    df_clean.select_dtypes(include=np.number)
    .columns.drop(['PassengerId', 'Survived'], errors='ignore')
)
scaler_std = StandardScaler()

# Reuse df_clean's index so the scaled rows stay aligned with the original
# rows (fit_transform returns a bare array, which would otherwise get a fresh
# 0..n-1 index).
df_std = pd.DataFrame(
    scaler_std.fit_transform(df_clean[numerical_cols]),
    columns=numerical_cols,
    index=df_clean.index,
)

print("\nAfter Standardization:")
print(df_std)

# Encoding the categorical variables

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder = LabelEncoder()
# Work on an independent copy so that adding columns does not trigger
# pandas' SettingWithCopyWarning when df_clean is a filtered slice of df.
df_clean = df_clean.copy()
# Add an integer-coded '<column>_Encoded' companion for each text column.
# The encoder is re-fitted per column, so after the loop it only remembers
# the classes of the last column.
# NOTE(review): label encoding imposes an arbitrary order on nominal columns;
# consider one-hot encoding for models that interpret the codes numerically.
for text_column in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:
    df_clean[f'{text_column}_Encoded'] = label_encoder.fit_transform(df_clean[text_column])
print("\nAfter Label Encoding :")
print(df_clean)


In [None]:
# Are there any repeated names? Counts are taken on the full frame (df),
# not on the filtered df_clean.
print(df["Name"].value_counts())

In [None]:
# How often does each ticket value occur in the cleaned frame?
print(df_clean["Ticket"].value_counts())

# Standardization after Encoding

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pandas as pd

# Standardise the numeric columns again, now including the encoded ones.
# The row identifier and the target are excluded: scaling them carries no
# information and would turn the 0/1 label into arbitrary floats.
numerical_cols = (
    df_clean.select_dtypes(include=np.number)
    .columns.drop(['PassengerId', 'Survived'], errors='ignore')
)
scaler_std = StandardScaler()

# Reuse df_clean's index so the scaled rows stay aligned with the original
# rows (fit_transform returns a bare array, which would otherwise get a fresh
# 0..n-1 index).
df_std = pd.DataFrame(
    scaler_std.fit_transform(df_clean[numerical_cols]),
    columns=numerical_cols,
    index=df_clean.index,
)

print("\nAfter Standardization:")
print(df_std)

# Feature Selection
For the moment we are going to leave the Name and Ticket columns out of the feature set.

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

# Candidate features and the binary target.
X = df_clean[['Pclass', 'Age', 'SibSp','Parch','Fare','Sex_Encoded','Embarked_Encoded']]
y = df_clean['Survived']

# Survived is a class label, so the ANOVA F-test for classification
# (f_classif) is the matching univariate score, not the regression F-test.
selector = SelectKBest(score_func=f_classif, k=4)
X_selected = selector.fit_transform(X, y)

# get_support() is a boolean mask over X's columns marking the 4 kept features.
selected_features = X.columns[selector.get_support()]
print("\nSelected Features:")
print(selected_features)

These are the features with the strongest univariate relationship to survival; this alone does not prove that they determined who survived.
I would have expected Parch to be more significant than SibSp, so let's check the correlations between:
1. Parch and survival
2. SibSp and survival

In [None]:
# Pearson correlation of each family-count column with the target.
survived_column = df_clean['Survived']
correlation_parch_survived = df_clean['Parch'].corr(survived_column)
correlation_sibsp_survived = df_clean['SibSp'].corr(survived_column)

for feature_name, coefficient in (
    ('Parch', correlation_parch_survived),
    ('SibSp', correlation_sibsp_survived),
):
    print(f"Correlation between {feature_name} and Survived: {coefficient}")

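Compare the absolute values of the two coefficients: the larger one belongs to the feature that is more strongly (linearly) related to survival. In the feature scores computed further below, Parch ranks above SibSp, so the number of parents/children is the more informative of the two, although both relationships are weak.  
A negative coefficient for SibSp means that a higher SibSp goes with a lower chance of survival.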

# Feature engineering

In [None]:
# Work on an independent copy so that adding columns does not trigger
# pandas' SettingWithCopyWarning when df_clean is a filtered slice of df.
df_clean = df_clean.copy()

# FamilySize: siblings/spouses + parents/children + the passenger themselves.
df_clean['FamilySize'] = df_clean['SibSp'] + df_clean['Parch'] + 1

# Title: the word that directly precedes a '.' in the name. The pattern is a
# raw string so that `\.` reaches the regex engine as an escaped dot instead
# of being an invalid Python string escape.
df_clean['Title'] = df_clean['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# IsAlone: 1 when the passenger has no family aboard, else 0.
df_clean['IsAlone'] = (df_clean['FamilySize'] == 1).astype(int)

# Deck: first character of the cabin string.
df_clean['Deck'] = df_clean['Cabin'].str.get(0)


# Display the first few rows with the new features
display(df_clean[['Name', 'SibSp', 'Parch', 'FamilySize', 'Title', 'IsAlone', 'Cabin', 'Deck']].head())

Let's encode the Deck and Title columns and see if they tell us more about survival.

In [None]:
# Work on an independent copy: assigning new columns to a filtered slice of
# df is what raises pandas' SettingWithCopyWarning.
df_clean = df_clean.copy()
# Integer-code the engineered Deck and Title columns. The shared encoder is
# re-fitted for each column, so it only remembers the last column's classes.
df_clean['Deck_Encoded'] = label_encoder.fit_transform(df_clean['Deck'])
df_clean['Title_Encoded'] = label_encoder.fit_transform(df_clean['Title'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Deck_Encoded'] = label_encoder.fit_transform(df_clean['Deck'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Title_Encoded'] = label_encoder.fit_transform(df_clean['Title'])


In [None]:
# Pearson correlation between survival and the integer-coded deck.
correlation_deck_survied = df_clean['Survived'].corr(
    df_clean['Deck_Encoded'], method='pearson'
)
print(correlation_deck_survied)

-0.2990480270470364


In [None]:
# Pearson correlation between survival and the integer-coded title.
correlation_title_survied = df_clean['Survived'].corr(
    df_clean['Title_Encoded'], method='pearson'
)
print(correlation_title_survied)

-0.19828598544241705


In [None]:
# Pearson correlation between survival and the engineered family size.
correlation_FamilySize_survied = df_clean['Survived'].corr(
    df_clean['FamilySize'], method='pearson'
)
print(correlation_FamilySize_survied)

0.015688104861744885


In [None]:
# Pearson correlation between survival and the travelling-alone flag.
correlation_IsAlone_survied = df_clean['Survived'].corr(
    df_clean['IsAlone'], method='pearson'
)
print(correlation_IsAlone_survied)

-0.20466982429394068


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

# Original columns plus the newly engineered features.
X = df_clean[['Pclass', 'Age', 'SibSp','Parch','Fare','Sex_Encoded','Embarked_Encoded', 'Deck_Encoded', 'FamilySize', 'IsAlone', 'Title_Encoded']]
y = df_clean['Survived']

# Survived is a class label, so score the features with the ANOVA F-test for
# classification (f_classif). k='all' keeps every feature; the selector is
# only used here to obtain the per-feature scores.
selector = SelectKBest(score_func=f_classif, k='all')
X_selected = selector.fit_transform(X, y)

# One row per feature, highest F-score first.
feature_scores = pd.DataFrame({'Feature': X.columns, 'Score': selector.scores_})
feature_scores = feature_scores.sort_values(by='Score', ascending=False)

print("\nFeature Importance Scores:")
print(feature_scores)

# With k='all' the support mask is all True, so this is every column of X.
selected_features = X.columns[selector.get_support()]


Feature Importance Scores:
             Feature       Score
5        Sex_Encoded  369.827666
0             Pclass  111.539892
7       Deck_Encoded   86.623753
4               Fare   73.259227
9            IsAlone   38.562104
10     Title_Encoded   36.097130
6   Embarked_Encoded   22.521025
3              Parch    6.250683
1                Age    4.269290
2              SibSp    1.268042
8         FamilySize    0.217128


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Survivors vs. non-survivors per deck, as grouped bars.
plt.figure(figsize=(10, 6))
deck_ax = sns.countplot(data=df_clean, x='Deck', hue='Survived', palette='viridis')
deck_ax.set_title('Survival Counts by Deck')
deck_ax.set_xlabel('Deck')
deck_ax.set_ylabel('Count')
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr = df_clean.corr(numeric_only=True)
sns.heatmap(data=corr, annot=True, cmap="coolwarm")
plt.show()


NameError: name 'df_clean' is not defined

# Splitting training data

In [None]:
from sklearn.model_selection import train_test_split
# Model inputs (already integer-encoded) and the target column.
feature_columns = ['Pclass', 'Fare','Sex_Encoded', 'Deck_Encoded']
X = df_clean[feature_columns]
y = df_clean['Survived']

# 70/30 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

for heading, part in (("Training Features:", X_train), ("\nTesting Features:", X_test)):
    print(heading)
    print(part)

Training Features:
     Pclass      Fare  Sex_Encoded  Deck_Encoded
326       3    6.2375            1             8
802       1  120.0000            1             1
92        1   61.1750            1             4
835       1   83.1583            0             4
183       2   39.0000            1             5
..      ...       ...          ...           ...
108       3    7.8958            1             8
273       1   29.7000            1             2
867       1   50.4958            1             0
440       2   26.2500            0             8
104       3    7.9250            1             8

[618 rows x 4 columns]

Testing Features:
     Pclass     Fare  Sex_Encoded  Deck_Encoded
45        3   8.0500            1             8
392       3   7.9250            1             8
643       3  56.4958            1             8
746       3  20.2500            1             8
886       2  13.0000            1             8
..      ...      ...          ...           ...
367       3   

Let`s apply scaling to our data

In [None]:

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both parts so the test set is never fitted on.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for heading, scaled_part in (
    ("\nScaled Training Features:", X_train_scaled),
    ("\nScaled Testing Features:", X_test_scaled),
):
    print(heading)
    print(scaled_part)


Scaled Training Features:
[[ 0.7958527  -0.59412274  0.70968277  0.49574645]
 [-1.65719951  2.49957309  0.70968277 -2.48286348]
 [-1.65719951  0.89986618  0.70968277 -1.20631636]
 ...
 [-1.65719951  0.60945241  0.70968277 -2.90837918]
 [-0.43067341 -0.04989606 -1.40908028  0.49574645]
 [ 0.7958527  -0.54823229  0.70968277  0.49574645]]

Scaled Testing Features:
[[ 0.7958527  -0.544833    0.70968277  0.49574645]
 [ 0.7958527  -0.54823229  0.70968277  0.49574645]
 [ 0.7958527   0.77261844  0.70968277  0.49574645]
 ...
 [-1.65719951  0.07100453  0.70968277 -2.90837918]
 [ 0.7958527   0.51166701 -1.40908028  0.49574645]
 [-0.43067341 -0.11108332  0.70968277  0.49574645]]


# Pipelines and model

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

I am going to use the following features:
1. Numerical features (standardized): Pclass, Fare
2. Categorical features (one-hot encoded): Sex, Deck
# Preprocessor


In [None]:
# Raw (un-encoded) inputs: the pipeline does the encoding and scaling itself.
X = df_clean[['Pclass', 'Fare','Sex', 'Deck']]
y = df_clean['Survived']

# Split before any fitting so the transformers only learn from training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Numeric columns are standardised.
numeric_features = ['Pclass', 'Fare']
numeric_transformer = StandardScaler()

# Categorical columns are one-hot encoded. handle_unknown='ignore' encodes a
# category that was not seen during fit as all zeros instead of raising, so
# the pipeline can be applied to new data without modification.
categorical_features = ['Sex', 'Deck']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns to its transformer (reusing the names above
# instead of repeating the lists inline).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

In [None]:
# Pipeline with preprocessing + model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# Train the model: fits the preprocessing steps and the final estimator on
# the training split only.
pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split; the fitted preprocessing is applied
# to X_test inside the pipeline before the final estimator runs.
y_pred = pipeline.predict(X_test)
print("Predictions:", y_pred)

Predictions: [ 1.00457461e-01  1.00475848e-01  9.33311887e-02  9.86628673e-02
  2.48262385e-01  9.14109442e-02  3.35418212e-01  9.94076808e-01
  2.45751307e-01  7.38046456e-01  1.00244169e-01  9.95154169e-02
  1.00448267e-01  7.38540455e-01  9.65959180e-01  1.00603941e-01
  6.50610752e-01  5.86951439e-01  5.90411916e-01  1.13271018e+00
  1.00441530e-01  1.00480143e-01  9.95154169e-02  2.47968189e-01
  5.90386174e-01  5.87805828e-01  2.44768811e-01  4.20533956e-01
  8.76967892e-01  3.33403205e-01  9.02288943e-01  3.41309481e-01
  1.00480143e-01  5.84653036e-01  7.36701732e-01  5.90158172e-01
  1.00480143e-01  5.90411916e-01  6.42799857e-01  1.00480143e-01
  5.90250108e-01  1.00457461e-01  1.00420686e-01  1.00578817e-01
  8.12832577e-01  5.90408238e-01  8.12551253e-01  9.16560492e-01
  3.41309481e-01  5.88131899e-01  1.00648688e-01  2.46096987e-01
  2.48357998e-01  2.48262385e-01  3.22922252e-01  9.19764165e-01
  9.95086798e-02  6.48852933e-01  1.00367363e-01  8.15173889e-01
  1.00575139

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Threshold the model output at 0.5 to obtain hard 0/1 class predictions.
y_pred_binary = (y_pred > 0.5).astype(int)

# Classification metrics on the held-out test split.
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)
conf_matrix = confusion_matrix(y_test, y_pred_binary)

for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_label}: {metric_value:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.7932
Precision: 0.7477
Recall: 0.7407
F1-score: 0.7442

Confusion Matrix:
[[131  27]
 [ 28  80]]
