In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


## Import Modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Loading the dataset

In [None]:
# Read both competition files in one step; `train` and `test` stay separate frames.
train, test = (
    pd.read_csv('/kaggle/input/titanic/train.csv'),
    pd.read_csv('/kaggle/input/titanic/test.csv'),
)
train.head()

In [None]:
## statistical info
# Summary statistics of the training frame as returned by DataFrame.describe().
train.describe()

In [None]:
## datatype info
# Per-column dtype and non-null count; columns with fewer non-null entries
# than rows contain missing values.
train.info()

## Exploratory Data Analysis

In [None]:
## categorical attributes
# Pass the frame and column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(data=train, x='Survived')

In [None]:
# Passengers per ticket class (keyword arguments: seaborn's first positional
# argument is `data`, not the x variable).
sns.countplot(data=train, x='Pclass')

In [None]:
# Passengers per sex (keyword arguments: seaborn's first positional argument
# is `data`, not the x variable).
sns.countplot(data=train, x='Sex')

In [None]:
# Passengers per SibSp value (keyword arguments: seaborn's first positional
# argument is `data`, not the x variable).
sns.countplot(data=train, x='SibSp')

In [None]:
# Passengers per Parch value (keyword arguments: seaborn's first positional
# argument is `data`, not the x variable).
sns.countplot(data=train, x='Parch')

In [None]:
# Passengers per Embarked value (keyword arguments: seaborn's first positional
# argument is `data`, not the x variable).
sns.countplot(data=train, x='Embarked')

In [None]:
## numerical attributes
# distplot is deprecated in seaborn; histplot with a KDE overlay on a density
# scale is its documented replacement.
sns.histplot(data=train, x='Age', kde=True, stat='density')

In [None]:
# Fare distribution of the training data (histplot replaces the deprecated distplot).
sns.histplot(data=train, x='Fare', kde=True, stat='density')

In [None]:
# pivot_table aggregates with the mean by default, giving the average fare per class.
class_fare = train.pivot_table(values='Fare', index='Pclass')
ax = class_fare.plot.bar(rot=0)
ax.set_xlabel('Pclass')
ax.set_ylabel('Avg. Fare')
plt.show()

In [None]:
# Total fare collected per ticket class. The aggregation is named by string:
# passing the NumPy callable (np.sum) is deprecated in pandas 2.x.
class_fare = train.pivot_table(index='Pclass', values='Fare', aggfunc='sum')
class_fare.plot(kind='bar')
plt.xlabel('Pclass')
plt.ylabel('Total Fare')
plt.xticks(rotation=0)  # keep the class labels horizontal
plt.show()

In [None]:
# Fare by ticket class, with one bar per survival outcome inside each class.
fig, ax = plt.subplots()
sns.barplot(x='Pclass', y='Fare', hue='Survived', data=train, ax=ax)

In [None]:
# Same data with the grouping swapped: survival outcome on the x axis, one bar per class.
fig, ax = plt.subplots()
sns.barplot(x='Survived', y='Fare', hue='Pclass', data=train, ax=ax)

## Data Preprocessing

In [None]:
# Remember how many rows belong to train so the frames can be separated again later.
train_len = train.shape[0]
# Stack train on top of test and renumber the rows 0..n-1 in one step.
df = pd.concat([train, test], ignore_index=True)
df.head()

In [None]:
# Last rows of the combined frame, i.e. rows that were appended from `test`.
df.tail()

In [None]:
## count the missing values in each column
df.isna().sum()

In [None]:
# Remove the Cabin column entirely (`columns=` already implies the column axis).
df = df.drop(columns='Cabin')

In [None]:
# Mean of Age over the combined train + test frame.
df['Age'].mean()

In [None]:
# Replace missing Age and Fare entries with the mean of the respective column.
numeric_fill = {name: df[name].mean() for name in ('Age', 'Fare')}
df = df.fillna(value=numeric_fill)

In [None]:
# Most frequent Embarked value; mode() returns a Series, so take its first entry.
df['Embarked'].mode()[0]

In [None]:
# Replace missing Embarked entries with the most frequent value of that column.
most_common_port = df['Embarked'].mode().iloc[0]
df = df.fillna({'Embarked': most_common_port})

## Log transformation to reduce skew in the Fare distribution

In [None]:
# Fare distribution before the log transform (histplot replaces the deprecated distplot).
sns.histplot(data=df, x='Fare', kde=True, stat='density')

In [None]:
# log(1 + fare): the +1 keeps zero fares finite; log1p computes this directly
# and more accurately for small values.
# NOTE: this overwrites Fare in place, so re-running the cell transforms it twice.
df['Fare'] = np.log1p(df['Fare'])

In [None]:
# Fare distribution after the log transform (histplot replaces the deprecated distplot).
sns.histplot(data=df, x='Fare', kde=True, stat='density')

## Correlation Matrix

In [None]:
# Correlate the numeric columns only. At this point the frame still contains
# text columns (Name, Sex, Ticket, Embarked), and DataFrame.corr() in
# pandas >= 2.0 raises on them instead of silently skipping them.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(15, 9))
sns.heatmap(corr, annot=True, cmap='coolwarm')

In [None]:
# Preview the combined frame after imputation and the Fare transform.
df.head()

In [None]:
## Name and Ticket are not used as model inputs, so remove them
df = df.drop(columns=['Name', 'Ticket'])
df.head()

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
cols = ['Sex', 'Embarked']
le = LabelEncoder()

# Swap each listed column for integer codes. The single encoder is refit for
# every column, so afterwards `le` only remembers the classes of the last one.
df = df.assign(**{col: le.fit_transform(df[col]) for col in cols})
df.head()

## Train-Test Split

In [None]:
# Undo the earlier concatenation: the first `train_len` rows are the training
# rows, everything after them came from the test file.
train, test = df.iloc[:train_len], df.iloc[train_len:]

In [None]:
# Preview the preprocessed training rows.
train.head()

In [None]:
# Preview the preprocessed test rows (their index continues after the training rows).
test.head()

In [None]:
# Features: every column except the row identifier and the target.
X = train.drop(columns=['PassengerId', 'Survived'])
# Target as integer class labels. Survived is presumably float here because the
# frame was stacked with test rows lacking a label (TODO confirm); casting back
# keeps the classes, and therefore the predictions, as 0/1 instead of 0.0/1.0.
y = train['Survived'].astype(int)

In [None]:
# Preview the feature matrix that is passed to the models.
X.head()

## Model Training

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
def classify(model):
    """Fit `model` on a hold-out split and print two scores for it.

    Uses the notebook-level feature frame `X` and target `y`.

    1. Splits X/y 75/25 with a fixed seed, fits `model` on the larger part and
       prints `model.score` on the smaller part as 'Accuracy:'.
    2. Runs 5-fold cross-validation over all of X/y and prints the mean fold
       score as 'CV Score:'.

    The passed-in `model` is fitted in place on the 75% split; nothing is returned.
    """
    split = train_test_split(X, y, test_size=0.25, random_state=42)
    features_fit, features_eval, target_fit, target_eval = split

    model.fit(features_fit, target_fit)
    holdout_score = model.score(features_eval, target_eval)
    print('Accuracy:', holdout_score)

    fold_scores = cross_val_score(model, X, y, cv=5)
    print('CV Score:', np.mean(fold_scores))

In [None]:
from sklearn.linear_model import LogisticRegression
# Raise the iteration cap above the default of 100: the features are unscaled,
# so the solver may stop before converging, and the notebook-wide warnings
# filter would hide the resulting ConvergenceWarning.
model = LogisticRegression(max_iter=1000)
classify(model)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fixed seed so the printed scores are reproducible from run to run.
model = DecisionTreeClassifier(random_state=42)
classify(model)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed: the forest's bootstrapping and feature sampling are random, so
# without it the printed scores change on every run.
model = RandomForestClassifier(random_state=42)
classify(model)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Fixed seed: split thresholds are drawn at random, so without it the printed
# scores change on every run.
model = ExtraTreesClassifier(random_state=42)
classify(model)

In [None]:
from xgboost import XGBClassifier
# Seed pinned explicitly so the scores do not depend on the library's default seed.
model = XGBClassifier(random_state=42)
classify(model)

In [None]:
from lightgbm import LGBMClassifier
# Seed pinned explicitly so the scores do not depend on the library's default seed.
model = LGBMClassifier(random_state=42)
classify(model)

In [None]:
from catboost import CatBoostClassifier
# verbose=0 silences the per-iteration training log; the seed is pinned
# explicitly so the scores do not depend on the library's default seed.
model = CatBoostClassifier(verbose=0, random_seed=42)
classify(model)

## Complete Model Training with Full Data

In [None]:
# Final model: trained on every labelled row (no hold-out), with an explicit
# seed so the submitted predictions can be regenerated.
model = LGBMClassifier(random_state=42)
model.fit(X, y)

In [None]:
# Preview the test rows before building the prediction inputs.
test.head()

In [None]:
# Build the test features with the same column selection that produced X:
# drop the row identifier and the Survived column.
X_test = test.drop(columns=['PassengerId', 'Survived'])

In [None]:
# Preview the test feature matrix; its columns should match those of X.
X_test.head()

In [None]:
# Predict a label for every test row with the fitted model and display the array.
pred = model.predict(X_test)
pred

## Test Submission

In [None]:
# Load the sample submission; it serves as a template whose Survived column
# is overwritten with the model's predictions further down.
sub = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
sub.head()

In [None]:
# Column dtypes and row count of the submission template.
sub.info()

In [None]:
# Predictions are written by position, so the template rows must list the same
# passengers in the same order as the test rows the predictions came from.
assert np.array_equal(
    sub['PassengerId'].to_numpy(), test['PassengerId'].to_numpy()
), 'submission template rows do not line up with the test set'
# Store the labels as integers regardless of the dtype the model returned.
sub['Survived'] = np.asarray(pred).astype('int')

In [None]:
# Re-check the dtypes after the predictions were written into Survived.
sub.info()

In [None]:
# Preview the final submission rows.
sub.head()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
sub.to_csv('submission.csv', index=False)