<a href="https://colab.research.google.com/github/Adithya-5588/Immersivify_ds/blob/main/DataScienceAssignmenetTask1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [44]:
class TitanicSurvivalPredictor:
    def __init__(self):
        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )
        self.scaler = StandardScaler()
        self.label_encoders = {}

    def preprocess_data(self, df):
        # Create a copy to avoid modifying original data
        data = df.copy()

        # Extract titles from names
        data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

        # Group rare titles
        title_mapping = {
            'Mr': 'Mr',
            'Miss': 'Miss',
            'Mrs': 'Mrs',
            'Master': 'Master',
            'Dr': 'Rare',
            'Rev': 'Rare',
            'Col': 'Rare',
            'Major': 'Rare',
            'Mlle': 'Miss',
            'Countess': 'Rare',
            'Ms': 'Miss',
            'Lady': 'Rare',
            'Sir': 'Rare',
            'Mme': 'Mrs',
            'Don': 'Rare',
            'Capt': 'Rare',
            'Jonkheer': 'Rare',
            'Dona': 'Rare'
        }
        data['Title'] = data['Title'].map(lambda x: title_mapping.get(x, 'Rare'))

        # Extract deck from cabin
        data['Deck'] = data['Cabin'].str[0]

        # Feature engineering
        data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
        data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

        # Select features for the model
        features = ['Pclass', 'Sex', 'Age', 'Fare', 'Title', 'IsAlone',
                   'FamilySize', 'Embarked', 'Deck']

        # Create feature matrix
        X = data[features].copy()

        # Handle missing values
        numeric_features = ['Age', 'Fare']
        categorical_features = ['Sex', 'Embarked', 'Title', 'Deck']

        # Impute numeric features
        imputer = SimpleImputer(strategy='median')
        X[numeric_features] = imputer.fit_transform(X[numeric_features])

        # Impute categorical features
        for feature in categorical_features:
            X[feature].fillna('missing', inplace=True)

        # Encode categorical variables
        for feature in categorical_features:
            if feature not in self.label_encoders:
                self.label_encoders[feature] = LabelEncoder()
                X[feature] = self.label_encoders[feature].fit_transform(X[feature])
            else:
                X[feature] = self.label_encoders[feature].transform(X[feature])

        return X

    def train(self, X_train, y_train):
        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)

        # Train the model
        self.model.fit(X_train_scaled, y_train)

        # Calculate feature importance
        feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

        return feature_importance

    def predict(self, X):
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X)

    def evaluate(self, X, y_true):
        y_pred = self.predict(X)

        # Calculate metrics
        accuracy = accuracy_score(y_true, y_pred)
        report = classification_report(y_true, y_pred)
        conf_matrix = confusion_matrix(y_true, y_pred)

        return {
            'accuracy': accuracy,
            'classification_report': report,
            'confusion_matrix': conf_matrix
        }


In [45]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
# Load and prepare data
data = pd.read_csv('https://drive.google.com/uc?id=1JzSsPgdhJVuDHVwXAYlK13CRVHIMqWaz&export=download')
predictor = TitanicSurvivalPredictor()

In [47]:
# Split features and target
X = predictor.preprocess_data(data)
y = data['Survived']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[feature].fillna('missing', inplace=True)


In [48]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [49]:
# Train the model and get feature importance
feature_importance = predictor.train(X_train, y_train)

# Evaluate the model
evaluation_results = predictor.evaluate(X_test, y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
# Print results
print("\nModel Performance:")
print(f"Accuracy: {evaluation_results['accuracy']:.4f}")
print("\nClassification Report:")
print(evaluation_results['classification_report'])
print("\nFeature Importance:")
print(feature_importance)


Model Performance:
Accuracy: 0.5952

Classification Report:
              precision    recall  f1-score   support

           0       0.60      1.00      0.75        50
           1       0.00      0.00      0.00        34

    accuracy                           0.60        84
   macro avg       0.30      0.50      0.37        84
weighted avg       0.35      0.60      0.44        84


Feature Importance:
      feature  importance
1         Sex    0.743453
4       Title    0.196293
3        Fare    0.021809
2         Age    0.013160
6  FamilySize    0.007283
8        Deck    0.006066
5     IsAlone    0.005528
0      Pclass    0.003524
7    Embarked    0.002884
