/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv



In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import matplotlib.pyplot as plt

# Set random seed for reproducibility
np.random.seed(42)

# Setup path
TRAIN_PATH = '/kaggle/input/titanic/train.csv'
TEST_PATH = '/kaggle/input/titanic/test.csv'
OUTPUT_PATH = '/kaggle/working/gender_submission.csv'
print(os.listdir('/kaggle/input/titanic'))

['train.csv', 'test.csv', 'gender_submission.csv']


In [2]:
# Visualize data
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
print("Train data:")
print(train_df.head())
print('\n\n')
print("Test data:")
print(test_df.head())

Train data:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN  

In [3]:
# Preprocess data

# Transform port of embarkation to -1, 0, 1 for Southampton, Cherbourg, Queenstown
train_df['Embarked'] = train_df['Embarked'].map({'S': -1, 'C': 0, 'Q': 1})
test_df['Embarked'] = test_df['Embarked'].map({'S': -1, 'C': 0, 'Q': 1})

# Transform sex to 0, 1
train_df['Sex'] = train_df['Sex'].map({"male": 1, 'female': -1})
test_df['Sex'] = test_df['Sex'].map({"male": 1, 'female': -1})

# Get target variable
y = train_df['Survived']

X = train_df
X_test = test_df

# Fill missing values
X['Age'] = X['Age'].fillna(X['Age'].mean())
X['Fare'] = X['Fare'].fillna(X['Fare'].mean())
X_test['Age'] = X_test['Age'].fillna(X_test['Age'].mean())
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].mean())

# Create pertinent features
# Family size
X['FamilySize'] = X['SibSp'] + X['Parch'] + 1
X_test['FamilySize'] = X_test['SibSp'] + X_test['Parch'] + 1
# IsAlone
X['IsAlone'] = (X['FamilySize'] == 1).astype(int)
X_test['IsAlone'] = (X_test['FamilySize'] == 1).astype(int)
# Age group
X['AgeGroup'] = pd.cut(X['Age'], bins=[0, 12, 21, 65, 200], labels=[0, 1, 2, 3])
X_test['AgeGroup'] = pd.cut(X_test['Age'], bins=[0, 12, 18, 65, 200], labels=[0, 1, 2, 3])
# Fare group
X['FareGroup'] = pd.cut(X['Fare'], bins=[-1, 6, 12, 29, 870, 10000], labels=[0, 1, 2, 3, 4])
X_test['FareGroup'] = pd.cut(X_test['Fare'], bins=[-1, 6, 12, 29, 870, 10000], labels=[0, 1, 2, 3, 4])
# Title
X['Title'] = X['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
X['Title'] = X['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
X['Title'] = X['Title'].replace('Mlle', 'Miss')
X['Title'] = X['Title'].replace('Ms', 'Miss')
X['Title'] = X['Title'].replace('Mme', 'Mrs')
X['Title'] = X['Title'].map({"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Rare": 4})
X_test['Title'] = X_test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
X_test['Title'] = X_test['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
X_test['Title'] = X_test['Title'].replace('Mlle', 'Miss')
X_test['Title'] = X_test['Title'].replace('Ms', 'Miss')
X_test['Title'] = X_test['Title'].replace('Mme', 'Mrs')
X_test['Title'] = X_test['Title'].map({"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Rare": 4})
# Deck
X['Deck'] = X['Cabin'].str.extract('([A-Z])', expand=False)
X['Deck'] = X['Deck'].fillna('N')
X['Deck'] = X['Deck'].map({'N': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8})
X_test['Deck'] = X_test['Cabin'].str.extract('([A-Z])', expand=False)
X_test['Deck'] = X_test['Deck'].fillna('N')
X_test['Deck'] = X_test['Deck'].map({'N': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8})

# Get rid of unnecessary columns
X = X.drop(['Survived', 'Name', 'Ticket', 'Embarked', 'Cabin'], axis=1)
X_test = X_test.drop(['Name', 'Ticket', 'Embarked', 'Cabin'], axis=1)

# Make sure there are no missing values and all values are numerical
print("Train data:")
print(X.info())
print('\n\n')
print("Test data:")
print(X_test.info())

# Make sure there are no empty / NaN values
print("Train data:")
print(X.isnull().sum())
print('\n\n')
print("Test data:")
print(X_test.isnull().sum())


# Number of features
input_dim = X.shape[1]
print("Input dimensions: %s" % (input_dim))

# Train test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Train data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Pclass       891 non-null    int64   
 2   Sex          891 non-null    int64   
 3   Age          891 non-null    float64 
 4   SibSp        891 non-null    int64   
 5   Parch        891 non-null    int64   
 6   Fare         891 non-null    float64 
 7   FamilySize   891 non-null    int64   
 8   IsAlone      891 non-null    int64   
 9   AgeGroup     891 non-null    category
 10  FareGroup    891 non-null    category
 11  Title        891 non-null    int64   
 12  Deck         891 non-null    int64   
dtypes: category(2), float64(2), int64(9)
memory usage: 78.8 KB
None



Test data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype   
---  --

In [4]:
# Define KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn.fit(X_train, y_train)

# Predict on validation data
val_predictions = knn.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.6648


In [5]:
# Predict on test data
predictions = knn.predict(X_test)

# Save predictions to csv
output_df = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})
output_df.to_csv(OUTPUT_PATH, index=False)