In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the passenger table from the hard-coded absolute CSV path below.
titanic_csv_path = '/content/titanic.csv'
data = pd.read_csv(titanic_csv_path)

# Preview the first five rows to check the column layout.
data.head()

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,...,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,2urvived
0,1,22.0,7.25,0,1,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
1,2,38.0,71.2833,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0.0,0,0,1
2,3,26.0,7.925,1,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,1
3,4,35.0,53.1,1,1,0,0,0,0,0,...,0,0,0,1,0,0,2.0,0,0,1
4,5,35.0,8.05,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0


# 1. Handling Missing Values

Remove Missing Values:

In [3]:
# Drop every row that contains at least one missing value.
# The result is reassigned rather than using inplace=True, so the step is an
# explicit, chainable expression.
data = data.dropna(axis=0)

# Drop every column that contains at least one missing value.
# NOTE: after the row-wise drop above no missing values remain, so this call
# leaves the frame unchanged. To drop columns *instead of* rows, apply it to
# the freshly loaded frame and skip the row-wise drop.
data = data.dropna(axis=1)

Imputation (Filling Missing Values):

In [None]:
# Mean imputation: replace the missing entries of a numeric column with the
# mean of that column's observed values.
#
# 'Age' is used because the placeholder name 'Column' does not appear in the
# preview of this dataset, so looking it up raised KeyError.
#
# The filled Series is assigned back to the frame. Calling
# fillna(..., inplace=True) on a column selection is a chained in-place update:
# recent pandas versions warn about it, and it no longer modifies the parent
# frame once Copy-on-Write is enabled.
impute_col = 'Age'
data[impute_col] = data[impute_col].fillna(data[impute_col].mean())

# 2. Encoding Categorical Data

Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label encoding maps each distinct category to an integer code.
# The preview of this dataset shows a 'Sex' column and no 'Gender' column, so
# the lookup must use 'Sex' (the 'Gender' key raised KeyError).
# NOTE(review): 'Sex' already appears as 0/1 in the preview, in which case the
# encoder leaves the values unchanged; the cell is kept to illustrate the API.
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])

One-Hot Encoding

In [None]:
# One-hot encoding: replace the categorical column with one indicator column
# per distinct value (named '<column>_<value>' by pandas).
# The preview of this dataset shows the column as 'Sex'; the name 'Gender'
# does not appear there and raised KeyError.
data = pd.get_dummies(data, columns=['Sex'])

Binary Encoding

In [None]:
# Reference snippet: the preview of this dataset shows no 'City' column, so
# the binary encoder is applied only when that column is present. Otherwise
# `data` is left untouched and the cell still runs to completion.
import category_encoders as ce

if 'City' in data.columns:
    encoder = ce.BinaryEncoder(cols=['City'])
    data = encoder.fit_transform(data)

# Show a short preview instead of printing the whole frame.
data.head()

# 3. Feature Scaling

Standardization (Z-score Normalization)

In [None]:
from sklearn.preprocessing import StandardScaler

# Z-score standardization: with its default settings StandardScaler centres
# each column on its mean and divides by its standard deviation.
# The listed columns are overwritten with their standardized values.
cols_to_standardize = ['Age', 'Fare']
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data[cols_to_standardize])
data[cols_to_standardize] = standardized_values

Normalization (Min-Max Scaling)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization: with its default feature_range MinMaxScaler maps each
# column linearly onto [0, 1].
# The listed columns are overwritten, and the name `scaler` is rebound to the
# new scaler object.
cols_to_normalize = ['Age', 'Fare']
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(data[cols_to_normalize])
data[cols_to_normalize] = normalized_values

# 4. Outlier Detection and Treatment

IQR (Interquartile Range) Method

In [None]:
# IQR rule on 'Age': keep rows whose Age lies within
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
age = data['Age']
Q1 = age.quantile(0.25)
Q3 = age.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is kept when it is neither below the lower fence nor above the upper
# fence. Comparisons against NaN are False, so rows with a missing Age are
# kept as well.
keep_row = ~(age < lower_fence) & ~(age > upper_fence)
data = data[keep_row]

Transforming Data (Log Transformation)

In [None]:
# Log transformation of 'Age', computed with vectorized numpy operations
# instead of a per-row Python lambda.
#
# Only strictly positive values have a real logarithm. Every other entry
# (zero, negative or missing) is set to 0, the same mapping the per-row
# version used.
# Caveat: 0 is also log(1), so those entries become indistinguishable from
# an Age of exactly 1.
age = data['Age'].to_numpy(dtype=float)
positive = age > 0

log_age = np.zeros_like(age)
# The log is evaluated only where it is defined, so no invalid-value warnings.
log_age[positive] = np.log(age[positive])

data['Age'] = log_age

# 5. Data Transformation

Square Root or Cube Root Transformation

In [None]:
# Square-root transform of 'Fare'; the column is overwritten with the result.
# Swap np.sqrt for np.cbrt to take the cube root instead.
fare_values = data['Fare']
data['Fare'] = np.sqrt(fare_values)

Binning

In [None]:
# Discretize 'Age' into three ordered, labelled ranges.
# pd.cut uses right-closed intervals by default, so the bins are
# (0, 25], (25, 45] and (45, 100]; values outside them (including exactly 0)
# become NaN.
# NOTE(review): these edges look like ages in years, but the scaling and log
# cells earlier in the notebook overwrite 'Age' in place -- confirm that the
# column is still on its original scale when this cell runs.
age_bin_edges = [0, 25, 45, 100]
age_bin_labels = ['Young', 'Middle-aged', 'Senior']
data['Age_Bin'] = pd.cut(data['Age'], bins=age_bin_edges, labels=age_bin_labels)

# 6. Feature Extraction and Dimensionality Reduction

Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

# PCA needs a purely numeric matrix. The frame also carries the 'Age_Bin'
# column created by the binning cell, whose values are string labels and
# cannot be converted to floats, so only numeric and boolean columns are
# passed to the model.
numeric_features = data.select_dtypes(include=['number', 'bool'])

# Project the numeric features onto their first two principal components.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(numeric_features)

Feature Selection Techniques

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# X and y are not defined by any earlier cell, so they are built here.
# NOTE(review): '2urvived' is the last column in the dataset preview and looks
# like the class label -- confirm before relying on the selection.
target_col = '2urvived'
y = data[target_col]
# Keep numeric/boolean predictors only; f_classif cannot score string columns.
X = data.drop(columns=[target_col]).select_dtypes(include=['number', 'bool'])

# Select the top 10 features, capped at the number of available features
# because k may not exceed it.
n_selected = min(10, X.shape[1])
X_new = SelectKBest(f_classif, k=n_selected).fit_transform(X, y)


# 7. Feature Encoding

TF-IDF (Term Frequency-Inverse Document Frequency)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `text_data` is not defined by any earlier cell, and the dataset preview
# shows only numeric columns, so a tiny illustrative corpus is used here.
# Replace it with your own iterable of document strings.
text_data = [
    'the ship left the harbour at noon',
    'passengers boarded the ship in the morning',
    'the harbour was crowded with passengers',
]

# Learn the vocabulary and IDF weights, then return the document-term matrix
# (one row per document, one column per vocabulary term).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)