# Titanic Dataset - Exploratory Data Analysis
**Objective:** Analyze the Titanic passenger data to uncover the patterns that affected survival outcomes, then train baseline classifiers to predict survival.

[Kaggle Titanic Dataset](https://www.kaggle.com/c/titanic/data)

In [None]:
# Install required libraries (if not available)
!pip install seaborn scikit-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score

In [27]:
# Load the Kaggle train/test CSVs (download them from Kaggle or upload them in Colab).
# The paths assume the files sit under /content; adjust them or use the upload widget.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

In [None]:
# Preview data: first rows, column dtypes / non-null counts, summary statistics.
display(train.head())
train.info()
# A bare expression is rendered only when it is the last statement of a cell,
# so describe() has to go through display() to show up here.
display(train.describe())

# Checking missing values (count of nulls per column)
print(train.isnull().sum())

# Visualizing survival counts
sns.countplot(x='Survived', data=train)
plt.title('Survival Counts')
plt.show()

# Survival split by Sex and by Pclass (one bar per Survived value within each group)
for col, plot_title in [
    ('Sex', 'Survival by Sex'),
    ('Pclass', 'Survival by Passenger Class'),
]:
    sns.countplot(x=col, hue='Survived', data=train)
    plt.title(plot_title)
    plt.show()

# Age and Fare distributions (30-bin histograms with black bar edges)
for col in ['Age', 'Fare']:
    train[col].hist(bins=30, edgecolor='k')
    plt.title(f'{col} Distribution')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.show()

In [29]:
# Imputation values are computed from the TRAINING set only and then applied to
# both frames, so the test set is preprocessed exactly like the data the models
# are fitted on (no statistics are taken from the test set).
# NOTE(review): the train/validation split happens later, so validation rows
# still contribute to these statistics; a Pipeline would remove that as well.
age_median = train['Age'].median()
embarked_mode = train['Embarked'].mode()[0]
fare_median = train['Fare'].median()

# Fill missing Age with the training median
train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)

# Fill Embarked with the training mode
train['Embarked'] = train['Embarked'].fillna(embarked_mode)
test['Embarked'] = test['Embarked'].fillna(embarked_mode)

# Fill missing Fare in test set with the training median
test['Fare'] = test['Fare'].fillna(fare_median)

# Drop Cabin (too many missing values).
# errors='ignore' keeps the cell re-runnable: a second run finds no Cabin column
# and simply leaves the frame unchanged instead of raising KeyError.
train = train.drop(columns='Cabin', errors='ignore')
test = test.drop(columns='Cabin', errors='ignore')



In [30]:
# Create FamilySize feature: SibSp + Parch + 1
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

# Encode Sex as integers. The encoder is fitted on train and reused on test so
# both frames share the same codes.
le = LabelEncoder()
train['Sex'] = le.fit_transform(train['Sex'])
test['Sex'] = le.transform(test['Sex'])

# Encode Embarked with a fixed port -> integer mapping
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
train_embarked = train['Embarked'].map(embarked_mapping)
test_embarked = test['Embarked'].map(embarked_mapping)

# Series.map yields NaN for every value that is not a key of the mapping. That
# happens for missing values, for an unexpected port code, and for the whole
# column if this cell is run a second time (the values are already integers).
# Check before assigning so the frames are never silently overwritten with NaN.
if train_embarked.isna().any() or test_embarked.isna().any():
    raise ValueError(
        "Embarked contains values outside "
        f"{sorted(embarked_mapping)} (missing values, an unknown port, "
        "or this cell was already run). Reload the data and re-run the notebook from the top."
    )

train['Embarked'] = train_embarked
test['Embarked'] = test_embarked

In [31]:
# Column to predict, and the predictor columns used as model inputs.
target = 'Survived'
features = [
    'Pclass',
    'Sex',
    'Age',
    'Fare',
    'Embarked',
    'FamilySize',
]

# Feature matrix and label vector, both taken from the training frame.
X, y = train[features], train[target]

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic regression with the iteration cap raised to 1000; fit() returns the
# estimator itself, so fitting and predicting can be chained.
logreg = LogisticRegression(max_iter=1000)
y_pred = logreg.fit(X_train, y_train).predict(X_val)

print("Logistic Regression Results:")
print(classification_report(y_val, y_pred))
print("Accuracy:", accuracy_score(y_val, y_pred))

In [None]:
# Decision tree with a fixed seed, fitted on the training split and scored on
# the validation split.
dt = DecisionTreeClassifier(random_state=42)
y_pred_dt = dt.fit(X_train, y_train).predict(X_val)

print("\nDecision Tree Results:")
print(classification_report(y_val, y_pred_dt))
print("Accuracy:", accuracy_score(y_val, y_pred_dt))

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the training split and
# scored on the validation split.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
y_pred_rf = rf.fit(X_train, y_train).predict(X_val)

print("\nRandom Forest Results:")
print(classification_report(y_val, y_pred_rf))
print("Accuracy:", accuracy_score(y_val, y_pred_rf))

In [None]:
# 5-fold cross-validation of the random forest over all labelled rows (X, y);
# report the mean of the per-fold scores.
cv_scores = cross_val_score(estimator=rf, X=X, y=y, cv=5)
print("Random Forest Cross-Val Accuracy:", cv_scores.mean())

In [None]:
# Pair each feature name with the fitted forest's importance score, order from
# most to least important, and draw the result as a bar chart.
importances = rf.feature_importances_
feat_importance = (
    pd.Series(data=importances, index=features)
    .sort_values(ascending=False)
)
feat_importance.plot.bar()
plt.title('Feature Importances')
plt.show()

In [None]:
# Predict survival for the test frame with the fitted random forest, then write
# a two-column CSV (PassengerId, Survived) without the index.
y_test_pred = rf.predict(test[features])
submission = test[['PassengerId']].assign(Survived=y_test_pred)
submission.to_csv('submission.csv', index=False)
print('Submission file created.')

## Insights and Discussion
1. Sex, Pclass, and Fare are the most influential features for predicting survival.

2. Women, younger passengers, and passengers travelling in a higher class (lower `Pclass` number) or paying a higher fare had higher survival rates.

3. Extensive EDA combined with feature engineering and robust modeling yields strong predictive performance.