# This model is going to predict wheather a passenger on titanic survived or not 
## It is a problem statement from Kaggle

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load training data (adjust path as needed)
train_df = pd.read_csv('train.csv')

# Handle missing values
# Fill Age NaN with median age
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)

# Fill Embarked NaN with most common value
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)

# Fill Fare NaN with median fare
train_df['Fare'].fillna(train_df['Fare'].median(), inplace=True)

# Cabin has too many missing values, we'll drop it
train_df.drop('Cabin', axis=1, inplace=True)

# Convert categorical variables to numerical
label_encoder = LabelEncoder()
train_df['Sex'] = label_encoder.fit_transform(train_df['Sex'])
train_df['Embarked'] = label_encoder.fit_transform(train_df['Embarked'])

# Extract titles from names as a new feature
train_df['Title'] = train_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
train_df['Title'] = label_encoder.fit_transform(train_df['Title'])

# Drop unnecessary columns
train_df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme

In [2]:
# Create family size feature
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1

# Create is alone feature
train_df['IsAlone'] = 0
train_df.loc[train_df['FamilySize'] == 1, 'IsAlone'] = 1

# Create age bins
train_df['AgeBin'] = pd.cut(train_df['Age'].astype(int), 5)
label_encoder = LabelEncoder()
train_df['AgeBin'] = label_encoder.fit_transform(train_df['AgeBin'])

# Create fare bins
train_df['FareBin'] = pd.qcut(train_df['Fare'], 4)
label_encoder = LabelEncoder()
train_df['FareBin'] = label_encoder.fit_transform(train_df['FareBin'])

# Drop original columns we binned
train_df.drop(['Age', 'Fare', 'SibSp', 'Parch'], axis=1, inplace=True)

In [3]:
# Separate features and target
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on validation set
y_pred = rf.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

Validation Accuracy: 0.82


In [9]:
# Load test data
test_df = pd.read_csv('test.csv')  # Update with your actual test file path

# Keep PassengerId for submission
passenger_ids = test_df['PassengerId']

# 1. Handle missing values (same as training)
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace=True)
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

# 2. Drop Cabin (same as training)
test_df.drop('Cabin', axis=1, inplace=True)

# 3. Convert categorical variables (same encoders as training)
test_df['Sex'] = label_encoder.fit_transform(test_df['Sex'])
test_df['Embarked'] = label_encoder.fit_transform(test_df['Embarked'])

# 4. Extract titles (same as training)
test_df['Title'] = test_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test_df['Title'] = label_encoder.fit_transform(test_df['Title'])

# 5. Drop unnecessary columns (same as training)
test_df.drop(['Name', 'Ticket'], axis=1, inplace=True)

# 6. Feature engineering (same as training)
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1
test_df['IsAlone'] = 0
test_df.loc[test_df['FamilySize'] == 1, 'IsAlone'] = 1

# 7. Create bins (using same bin edges as training)
# For AgeBin - use the same cut points from training
age_bins = [0, 16, 32, 48, 64, 80]  # Example - use your actual training bins
test_df['AgeBin'] = pd.cut(test_df['Age'].astype(int), bins=age_bins, labels=False)

# For FareBin - use same quantiles as training
fare_bins = [-0.001, 7.91, 14.454, 31.0, 512.329]  # Example quantiles
test_df['FareBin'] = pd.cut(test_df['Fare'], bins=fare_bins, labels=False)

# 8. Drop original columns (same as training)
test_df.drop(['Age', 'Fare', 'SibSp', 'Parch'], axis=1, inplace=True)

# Ensure same column order as training data
# Get the columns in the same order as the training set (excluding 'Survived')
train_columns = [col for col in X_train.columns if col != 'Survived']
test_df = test_df[train_columns]

# Make predictions
test_predictions = rf.predict(test_df)

# Create submission file
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': test_predictions
})

# Save to CSV
submission.to_csv('titanic_submissions.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediat