<a href="https://colab.research.google.com/github/DhirajJakka/Built-it-/blob/main/Titanic_Survival_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Load the Titanic dataset from the Colab session's working directory.
df = pd.read_csv('/content/tested.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB
None


In [None]:

# Household size: siblings/spouses + parents/children + the passenger themself.
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

# 1 when the passenger travels with no relatives aboard, otherwise 0.
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

# Title: take the second ", "-separated piece of Name, then keep the text
# that precedes its first "." (e.g. "Kelly, Mr. James" -> "Mr").
_after_surname = df['Name'].str.split(', ', expand=True)[1]
df['Title'] = _after_surname.str.split('.', expand=True)[0]


In [None]:
# Fill missing values by assigning the result back to the column.
# Calling fillna(..., inplace=True) on df[col] is chained assignment: pandas 2.x
# warns about it and, with Copy-on-Write enabled, the parent frame is not updated.
df['Age'] = df['Age'].fillna(df['Age'].median())
# df.info() in this notebook reports 417 non-null Fare values out of 418 rows,
# so Fare needs imputing as well before it reaches the scaler and the models.
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Cabin, Name and Ticket are not used as features below (Title was already
# derived from Name), so they are removed before encoding.
df = df.drop(columns=['Cabin', 'Name', 'Ticket'])

# One-hot encode the categorical columns; drop_first=True omits the first
# category of each column to avoid redundant indicator columns.
df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Title'], drop_first=True)

In [None]:

# Standardise the two continuous columns to zero mean / unit variance.
# NOTE(review): the scaler is fit on every row of df, i.e. before the
# train/test split performed further down, so test rows influence the scaling.
_numeric_cols = ['Age', 'Fare']
scaler = StandardScaler()
scaler.fit(df[_numeric_cols])
df[_numeric_cols] = scaler.transform(df[_numeric_cols])

In [None]:
# Split the dataset into training and testing sets.
# PassengerId is a per-row identifier rather than a passenger attribute, so it
# is excluded from the feature matrix together with the target column;
# otherwise the tree models can split on an arbitrary row number.
X = df.drop(columns=['Survived', 'PassengerId'])
y = df['Survived']

# 80/20 hold-out split; random_state fixes the shuffle so reruns match.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Random Forest: exhaustive 5-fold search over tree count, depth and the
# minimum number of samples required to split a node.
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=5,
)
rf_grid.fit(X_train, y_train)

# XGBoost: exhaustive 5-fold search over tree count, depth and learning rate.
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
}
xgb_grid = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=xgb_params,
    cv=5,
)
xgb_grid.fit(X_train, y_train)


In [None]:
from sklearn.impute import SimpleImputer

# Replace any remaining missing feature values with the column mean.
# The imputer is fit on the full feature matrix X; the result is a NumPy array.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
X_imputed = imputer.transform(X)


In [None]:


# Sanity check on the feature DataFrame X: which columns and which rows
# still contain at least one missing value?
nan_columns = X.columns[X.isna().any(axis=0)]
nan_rows = X.loc[X.isna().any(axis=1)]

print("Columns with NaN values:", nan_columns)
print("Rows with NaN values:", nan_rows)


Columns with NaN values: Index([], dtype='object')
Rows with NaN values: Empty DataFrame
Columns: [PassengerId, Pclass, Age, SibSp, Parch, Fare, FamilySize, IsAlone, Sex_male, Embarked_Q, Embarked_S, Title_Dona, Title_Dr, Title_Master, Title_Miss, Title_Mr, Title_Mrs, Title_Ms, Title_Rev]
Index: []


In [None]:
# Mount Google Drive into the runtime at /content/drive.
# NOTE(review): this requires the google.colab package, and no visible cell
# reads from the mount point (the dataset is loaded from /content/tested.csv)
# -- confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:

# SimpleImputer returns a bare NumPy array. Restore the column labels and index
# before splitting, so that models fitted on DataFrames elsewhere in the
# notebook still receive matching feature names when they predict on this split
# (scikit-learn warns when a model fitted with feature names gets unlabeled input).
# Assumes the imputer kept every column of X, i.e. none was entirely NaN.
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

# Same 80/20 split and seed as the split on X, now on the imputed features.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed_df, y, test_size=0.2, random_state=42
)

# Create and train the baseline RandomForestClassifier.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the baseline forest on the full imputed data.
# NOTE(review): the recorded result is a perfect 1.0, which usually points to
# the target being derivable from a feature -- verify before trusting it.
scores = cross_val_score(estimator=clf, X=X_imputed, y=y, cv=5)
mean_accuracy = scores.mean()
print("Cross-validated accuracy:", mean_accuracy)


Cross-validated accuracy: 1.0


In [None]:
def _print_report(heading, y_true, y_pred):
    """Print accuracy, confusion matrix and classification report under a heading."""
    print(heading)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))


# Random Forest evaluation on the held-out split
rf_pred = rf_grid.predict(X_test)
_print_report("Random Forest:", y_test, rf_pred)

# XGBoost evaluation on the same split (leading newline separates the reports)
xgb_pred = xgb_grid.predict(X_test)
_print_report("\nXGBoost:", y_test, xgb_pred)


Random Forest:
Accuracy: 1.0
Confusion Matrix:
 [[50  0]
 [ 0 34]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84


XGBoost:
Accuracy: 1.0
Confusion Matrix:
 [[50  0]
 [ 0 34]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the two tuned models (averages predicted probabilities).
# VotingClassifier clones and refits every estimator it is given, so passing the
# GridSearchCV objects themselves would repeat both full hyper-parameter
# searches. The best_estimator_ of each finished search already carries the
# selected parameters and is refit just once on the training data.
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_grid.best_estimator_),
        ('xgb', xgb_grid.best_estimator_),
    ],
    voting='soft',
)
voting_clf.fit(X_train, y_train)

# Evaluate the ensemble model on the held-out split.
ensemble_pred = voting_clf.predict(X_test)
print("\nEnsemble Voting:")
print("Accuracy:", accuracy_score(y_test, ensemble_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, ensemble_pred))
print("Classification Report:\n", classification_report(y_test, ensemble_pred))




Ensemble Voting:
Accuracy: 1.0
Confusion Matrix:
 [[50  0]
 [ 0 34]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

