In [1]:
#Load the dataset into the Python environment
import pandas as pd

# Location of the Titanic CSV (absolute path on the author's machine; the
# space before ".csv" is part of the actual file name).
DATA_PATH = "D:/ictak_dsa/titanic_dataset .csv"

# Parse the CSV into a DataFrame
titanic_data = pd.read_csv(DATA_PATH)

# Show the top five rows as plain text
print(titanic_data.head(n=5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [2]:
# Pre-processing steps
# Check for missing values (per-column counts, taken before any imputation)
print(titanic_data.isnull().sum())

# Handle missing values.
# The fills go through one DataFrame-level `fillna` whose result is assigned
# back, instead of `titanic_data[col].fillna(..., inplace=True)`: that chained
# in-place pattern triggers a FutureWarning in recent pandas 2.x and, under
# Copy-on-Write, no longer updates `titanic_data` at all (leaving NaNs behind).
# NOTE(review): the median/mode are computed over every row, i.e. before the
# train/test split performed later in the notebook, so held-out rows influence
# the fill values.
fill_values = {
    'Age': titanic_data['Age'].median(),            # numeric column -> median
    'Embarked': titanic_data['Embarked'].mode()[0], # categorical column -> most frequent value
}
titanic_data = (
    titanic_data
    .fillna(value=fill_values)
    .drop(columns=['Cabin'])  # 'Cabin' is dropped rather than imputed
    # Convert categorical variables to numerical
    .assign(Sex=lambda frame: frame['Sex'].map({'male': 0, 'female': 1}))
    .pipe(pd.get_dummies, columns=['Embarked'])
)

# Feature selection
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
X = titanic_data[features]
y = titanic_data['Survived']

# Fail early if any selected feature still contains NaN (e.g. an unmapped
# 'Sex' value or a column that was not imputed above).
assert not X.isnull().any().any(), "selected features still contain missing values"

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [8]:
#Create kNN and SVM models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# k-nearest-neighbours classifier with k = 5 (fit returns the estimator itself)
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Linear-kernel SVM with C = 1; the bare fit call stays last so the cell
# still displays the fitted estimator
svm_model = SVC(C=1, kernel='linear')
svm_model.fit(X_train_scaled, y_train)


SVC(C=1, kernel='linear')

In [9]:
# Perform k-fold and stratified k-fold cross-validation
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold

# Perform cross-validation and print average accuracy
def cross_val_and_print(model, X, y, cv):
    """Cross-validate ``model`` and print the mean accuracy across folds.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    X : feature data handed to ``cross_val_score``.
    y : target values handed to ``cross_val_score``.
    cv : cross-validation strategy handed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    None. The result is only printed, as ``Average Accuracy: <mean>``.
    """
    fold_accuracies = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
    mean_accuracy = fold_accuracies.mean()
    print(f'Average Accuracy: {mean_accuracy}')

# Run plain and stratified 5-fold cross-validation for each model, in the
# order: kNN, kNN (stratified), SVM, SVM (stratified). Every run gets a fresh
# shuffled splitter seeded with 42.
# NOTE(review): X_train_scaled was scaled using statistics from the whole
# training split, so each fold's validation rows contributed to the scaling.
for label, estimator, splitter_cls in (
    ("kNN Model:", knn_model, KFold),
    ("Stratified kNN Model:", knn_model, StratifiedKFold),
    ("SVM Model:", svm_model, KFold),
    ("Stratified SVM Model:", svm_model, StratifiedKFold),
):
    print(label)
    cross_val_and_print(
        estimator,
        X_train_scaled,
        y_train,
        splitter_cls(n_splits=5, shuffle=True, random_state=42),
    )


kNN Model:
Average Accuracy: 0.7978331527627303
Stratified kNN Model:
Average Accuracy: 0.7920811582783414
SVM Model:
Average Accuracy: 0.7879346006106569
Stratified SVM Model:
Average Accuracy: 0.7879247513050329
