In [1]:
# data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd

# Advanced data visualization tools
import seaborn as sns
# Render matplotlib/seaborn plots inline in the notebook (sets the plotting backend)
%matplotlib inline

# Basic visualization tools
import matplotlib.pyplot as plt

# Prepare data for modeling 
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

# Regular expressions (e.g. for complex string splits)
import re

# Various helpful model testing metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Logistic, Random Forest and Support Vector models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import os
# Print the full path of every file found under the Kaggle input directory
input_files = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)



/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Read the training and test CSVs (in that order) into DataFrames
data, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Keep the test-set passenger IDs aside; they are needed for the submission file
test_ids = test["PassengerId"]

In [3]:
# Inspect the distinct values of each categorical column

# (column name, header printed before that column's unique values)
categorical_headers = [
    ('Sex', '\nUnique sex: '),
    ('Cabin', '\nUnique Cabin: '),
    ('Embarked', '\nUnique Embarked: '),
]

for column, header in categorical_headers:
    print(header)
    print(data[column].unique())


Unique sex: 
['male' 'female']

Unique Cabin: 
[nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90'
 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50'
 'B42' 'C1

In [4]:
# Preview the first five rows of the training data.
# Pass a number to head() (e.g. data.head(10)) to show a different number of rows.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training set
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the test set
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [7]:
# Column dtypes and non-null counts of the training set; a non-null count
# below the number of entries means that column has missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
def clean(data):
    """Return a cleaned copy of a Titanic passenger DataFrame.

    The input frame is not modified. Every step is skipped for columns that
    are absent, so the function works on both the training and test frames.

    Steps:
      * drop the "Ticket", "PassengerId", "Name" and "Cabin" columns;
      * fill missing "SibSp", "Parch", "Fare" and "Age" values with that
        column's median, computed on the frame being cleaned;
      * fill missing "Embarked" values with the placeholder "U".

    Parameters
    ----------
    data : pandas.DataFrame
        Passenger data to clean.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns dropped and missing values filled.
    """
    # errors="ignore" skips labels that are not present instead of raising
    cleaned = data.drop(
        columns=["Ticket", "PassengerId", "Name", "Cabin"],
        errors="ignore",
    )

    # Median imputation for the numeric columns
    for name in ("SibSp", "Parch", "Fare", "Age"):
        if name not in cleaned.columns:
            continue
        column = cleaned[name]
        cleaned[name] = column.fillna(column.median())

    # Unknown port of embarkation gets its own category
    if 'Embarked' in cleaned.columns:
        cleaned['Embarked'] = cleaned['Embarked'].fillna("U")

    return cleaned

# Apply the clean function to both training and test datasets.
#
# Missing numeric values in the test set are filled with the TRAINING medians
# (taken before cleaning), so the test frame goes through the same
# preprocessing as the data the model is fitted on, rather than being imputed
# from statistics of the test set itself.
numeric_cols = [col for col in ["SibSp", "Parch", "Fare", "Age"] if col in data.columns]
train_medians = data[numeric_cols].median().to_dict()

data = clean(data)
# fillna with a dict only touches the listed columns; after this pre-fill,
# clean() finds nothing left to impute in them for the test frame.
test = clean(test.fillna(train_medians))

In [9]:
# One-hot encode the categorical columns of both frames in the same way
categorical_cols = ["Sex", "Embarked"]
data = pd.get_dummies(data, columns=categorical_cols)
test = pd.get_dummies(test, columns=categorical_cols)

# Align the test frame with the training feature columns (everything except
# the 'Survived' target): same columns, same order. Dummy columns that only
# occur in the training data are added to the test frame filled with 0.
feature_cols = data.columns.drop('Survived')
test = test.reindex(columns=feature_cols, fill_value=0)

In [10]:
# Separate the features from the 'Survived' target
X = data.drop(columns="Survived")
y = data["Survived"]

# Hold out 10% of the labelled rows as a validation set (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Fit a Logistic Regression model; the very large iteration cap effectively
# lets the solver run until it converges
clf = LogisticRegression(random_state=0, max_iter=10_000_000_000)
clf.fit(X_train, y_train)

In [11]:
# Score the fitted model on the held-out validation rows
predictions = clf.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=predictions)

# Report the share of validation passengers classified correctly
print(f"Validation Accuracy: {accuracy:.4f}")

# Predict survival for the Kaggle test passengers (used in the submission file)
submission_preds = clf.predict(test)

Validation Accuracy: 0.8444


In [12]:
# Assemble the submission table: one predicted label per test passenger ID
submission_columns = {
    "PassengerId": test_ids.values,
    "Survived": submission_preds,
}
df = pd.DataFrame(submission_columns)

# Write the table to disk without the DataFrame index
df.to_csv("submission.csv", index=False)