In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960  # number of bytes requested per read() call while streaming a download
DATA_SOURCE_MAPPING = 'titanic:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3136%2F26502%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240502%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240502T091900Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D9e38dc258372045af5e03109ba16bbbbb2a99b152224473527761de8c4568e70c49f5c76338a78a1ca32f6a09855536194057015e8e65a8b8a9d96f25c037639ad98435717de3abd294a089989283b8f990dedd55245a37da06303cfe1f3fb6bc52fb5114d8986f47eb8e19b8cd9aa9afefcaec244f2235226aae483f8963c72ade80beb34708db161bede48fcea989bd400518f6e123b384103a9c0172615c49bcfde2b6d76b00282c2d87008bd10339f4dadb17409397e309744391853addb4248a27ddbf0cd67cf4c2228a1c950e5c8a20be69ce8678c6ba497473dcfff515ba045100cf7c837b411f08d6332c30ae4d90a4b81591c70b32fadda4e191153'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Fetch every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and unpack
# the downloaded archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # The URL part is percent-encoded, so the first ':' is the separator.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length may be missing; the progress bar then stays empty
            # instead of the cell failing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            size_label = content_length if content_length else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name
            # and could otherwise read a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a dedicated name for the archive; binding it to `tarfile`
                # would replace the imported module at notebook scope.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled first because it is a subclass of OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


## Loading the dataset

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
# Preview the first rows of the training frame.
train.head()

In [None]:
# Class balance of the target. The column is selected by keyword because the first
# positional parameter of countplot is `data` in seaborn >= 0.12, so a Series passed
# positionally is not interpreted as `x` there.
sns.countplot(data=train, x='Survived')

In [None]:
# Fare per passenger class, with separate bars for each value of Survived.
sns.barplot(data = train, x = 'Pclass', y = 'Fare', hue = 'Survived')

## Data Preprocessing

In [None]:
# Remember where the training rows end so the combined frame can be separated again.
train_len = train.shape[0]
# Stack train on top of test and give the result a fresh 0..n-1 index.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.head()

In [None]:
# Last rows of the combined frame (test rows were appended after the training rows).
df.tail()

In [None]:
## find the null values
# Number of missing entries in each column of the combined frame.
df.isnull().sum()

In [None]:
# Remove the Cabin column instead of imputing it.
# Note: re-running this cell raises KeyError because the column is already gone.
df = df.drop(columns='Cabin')

In [None]:
# Fill missing Age/Fare values with column means computed on the training rows only
# (the first train_len rows of df), so that no test-set statistics leak into the
# features the models are trained and validated on.
train_rows = df.iloc[:train_len]
df['Age'] = df['Age'].fillna(train_rows['Age'].mean())
df['Fare'] = df['Fare'].fillna(train_rows['Fare'].mean())

In [None]:
# Most frequent Embarked value (the first entry returned by mode()).
df['Embarked'].mode()[0]

In [None]:
# Categorical column: replace missing entries with its most frequent value.
most_common_port = df['Embarked'].mode().iloc[0]
df['Embarked'] = df['Embarked'].fillna(most_common_port)

## Log transformation for uniform data distribution

In [None]:
# Distribution of the raw training fares. histplot with a KDE overlay on a density
# scale stands in for distplot, which seaborn has deprecated.
sns.histplot(data=train, x='Fare', kde=True, stat='density')

In [None]:
# log(1 + Fare) so that zero fares stay finite; log1p is the dedicated NumPy routine
# for this and is more accurate than log(x + 1) for values close to zero.
# Note: the column is overwritten, so re-running this cell transforms it again.
df['Fare'] = np.log1p(df['Fare'])

In [None]:
# Distribution of Fare in the combined frame after the log transform. histplot with a
# KDE overlay on a density scale stands in for distplot, which seaborn has deprecated.
sns.histplot(data=df, x='Fare', kde=True, stat='density')

In [None]:
# Preview the combined frame after the Fare transform.
df.head()

In [None]:
## drop unnecessary columns
# Name and Ticket are removed here and therefore not used as features below.
# Note: re-running this cell raises KeyError because the columns are already gone.
df = df.drop(columns=['Name', 'Ticket'])
df.head()

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
cols = ['Sex', 'Embarked']
le = LabelEncoder()

# Replace each categorical column with integer codes. The single encoder is refit for
# every column, so afterwards `le` only holds the classes of the last column in `cols`.
df = df.assign(**{name: le.fit_transform(df[name]) for name in cols})
df.head()

## Train-Test Split



In [None]:
# Undo the earlier concatenation: the first train_len rows are the training set,
# every row after them belongs to the test set.
train, test = df.iloc[:train_len], df.iloc[train_len:]

In [None]:
# Preview the preprocessed training rows.
train.head()

In [None]:
# Preview the preprocessed test rows.
test.head()

In [None]:
#input split
# Features are all columns except the identifier and the target; the target is Survived.
id_and_target = ['PassengerId', 'Survived']
X = train.drop(columns=id_and_target)
y = train.loc[:, 'Survived']

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

def classify(model, features=None, target=None, test_size=0.25, random_state=42, cv=5):
    """Fit `model` on a hold-out split and print hold-out and cross-validated scores.

    Parameters
    ----------
    model : estimator exposing fit() and score()
        Fitted in place on the training part of the hold-out split.
    features, target : optional
        Data to evaluate on. When omitted, the notebook-level `X` and `y` are used,
        which keeps existing `classify(model)` calls working unchanged.
    test_size : float, default 0.25
        Share of rows held out for the single-split score.
    random_state : int, default 42
        Seed for the hold-out split.
    cv : int, default 5
        Number of folds passed to cross_val_score.

    Returns
    -------
    None. The hold-out score, the per-fold scores and their mean are printed.
    """
    # Fall back to the notebook globals only when no data is passed explicitly.
    features = X if features is None else features
    target = y if target is None else target

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    print('Accuracy:', model.score(X_test, y_test))

    # Cross-validation runs on the full data passed in, independent of the split above.
    score = cross_val_score(model, features, target, cv=cv)
    print(score)
    print('CV Score:', np.mean(score))

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression with default hyper-parameters.
# NOTE(review): warnings are silenced globally in the import cell, so a solver
# convergence warning would not be visible here — confirm the defaults converge.
model = LogisticRegression()
classify(model)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the estimator so the printed scores are the same on every Restart & Run All.
model = DecisionTreeClassifier(random_state=42)
classify(model)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest (bootstrap samples and feature sub-sampling are random) so the
# printed scores are the same on every Restart & Run All.
model = RandomForestClassifier(random_state=42)
classify(model)

In [None]:
from lightgbm import LGBMClassifier
# LightGBM classifier with default hyper-parameters, scored through classify().
model = LGBMClassifier()
classify(model)

## Complete model training with full data

In [None]:
# Fit a fresh LightGBM classifier on all labelled rows (X, y); this model produces
# the test-set predictions below.
model = LGBMClassifier()
model.fit(X,y)

In [None]:
# Preview the test rows before selecting the feature columns.
test.head()

In [None]:
#input split for test data
# Mirror the column selection used for X so the test features line up with training.
X_test = test.drop(columns=['PassengerId', 'Survived'])

In [None]:
# Preview the test feature matrix.
X_test.head()

In [None]:
# Predict the target for the test rows with the model fitted on (X, y), then display it.
pred = model.predict(X_test)
pred

## TEST SUBMISSION

In [None]:
# Load gender_submission.csv from the input directory; it serves as the template
# whose Survived column is overwritten with the predictions below.
sub = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
sub.head()

In [None]:
# Overwrite the template's Survived column with the predictions, stored as integers.
# assumes the template rows are in the same order as the test frame — TODO confirm
sub = sub.assign(Survived=pred).astype({'Survived': 'int'})


In [None]:
# Preview the submission frame after the predictions were written into it.
sub.head()

In [None]:
# Write the submission to submission.csv in the current directory, without the index column.
sub.to_csv('submission.csv', index= False)