In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib

# Load Dataset

In [2]:
file_path = 'file.csv'  
data = pd.read_csv(file_path)

In [3]:
data.head()

Unnamed: 0,Title,Location,Model,KM,Fuel Type,CC,Specs,Price
0,Honda Accord 2005 CL7 for Sale,Karachi,2005,110000,Petrol,2000.0,Automatic,23lacs
1,Hyundai Sonata 2021 2.5 for Sale,Gujranwala,2021,45000,Petrol,2500.0,Automatic,88lacs
2,Honda Civic Rebirth 2013 VTi Oriel 1.8 i-VTEC ...,Lahore,2013,170000,Petrol,1800.0,Manual,24.75lacs
3,Suzuki Cultus 2016 Limited Edition for Sale,Lahore,2016,96000,Petrol,1000.0,Manual,15.45lacs
4,Suzuki Alto 2019 VXR for Sale,Karachi,2019,55000,Petrol,660.0,Manual,22.25lacs


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3695 entries, 0 to 3694
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Title      3695 non-null   object 
 1   Location   3695 non-null   object 
 2   Model      3695 non-null   int64  
 3   KM         3695 non-null   object 
 4   Fuel Type  3695 non-null   object 
 5   CC         3695 non-null   float64
 6   Specs      3695 non-null   object 
 7   Price      3639 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 231.1+ KB


# Handle Missing Values

In [5]:
print("Missing Values in Each Column:")
print(data.isnull().sum())

data['Age'] = data['Age'].fillna(data['Age'].median())

data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

print("Missing Values After Imputation:")
print(data.isnull().sum())

Missing Values in Each Column:
Title         0
Location      0
Model         0
KM            0
Fuel Type     0
CC            0
Specs         0
Price        56
dtype: int64


KeyError: 'Age'

# Drop unnecessary columns

In [None]:
data.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

print("\nDataset After Cleaning:")
data.head()


Dataset After Cleaning:


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


# Target and Features

In [None]:
target = 'Survived'
X = data.drop(columns=[target])
y = data[target]

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print("\nNumeric Features:", numeric_features)
print("Categorical Features:", categorical_features)


Numeric Features: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical Features: ['Sex', 'Cabin', 'Embarked']


# pre-processsing pipelines

In [None]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Model Pipeline

In [None]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Test/Train Split

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining Data Shape:", X_train.shape)
print("Test Data Shape:", X_test.shape)


Training Data Shape: (712, 8)
Test Data Shape: (179, 8)


# Train Model

In [None]:
model.fit(X_train, y_train)

# Model Evaluation

In [None]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy*100)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Model Accuracy: 79.88826815642457

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179


Confusion Matrix:
[[92 13]
 [23 51]]


# Hyperparameter Tuning

In [None]:
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("\nBest Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
best_accuracy = grid_search.best_score_
print(f"Best Cross-Validated Accuracy: {best_accuracy*100:.2f}")


y_pred_best = best_model.predict(X_test)
print(f"Test Accuracy of Best Model: {accuracy_score(y_test, y_pred_best)*100:.2f}")


Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}
Best Cross-Validated Accuracy: 83.28
Test Accuracy of Best Model: 81.56


# Save Model

In [None]:
joblib.dump(grid_search.best_estimator_, 'tuned_titanic_model.pkl')
print("Model saved as 'tuned_titanic_model.pkl'")

Model saved as 'tuned_titanic_model.pkl'
