# Titanic prediction with KNN

*import libraries*

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**reading csv file**

In [2]:
# Load the training CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../input/traintitanicknn/train.csv')

### getting general information about data

In [3]:
# Preview the first two rows.
df.head(2)

In [4]:
# Print the per-column summary (dtypes and non-null counts).
df.info()

In [5]:
# Summary statistics of the frame.
df.describe()

## plotting for more exploration

In [6]:
# Grid of pairwise relationships across the frame.
sns.pairplot(data=df)

**correlations**

In [7]:
# Correlate only the numeric columns: the frame still holds text columns at this
# point (they are dropped or encoded further down), and DataFrame.corr() raises
# on non-numeric data in pandas >= 2.0 instead of silently skipping it.
df.select_dtypes(include='number').corr()

In [8]:
# Distribution of the target column, with a KDE overlay.
sns.displot(data=df, x='Survived', kde=True)

In [9]:
# Distribution of passenger age, with a KDE overlay.
sns.displot(data=df, x='Age', kde=True)

In [10]:
# Age against passenger class, coloured by survival.
sns.scatterplot(data=df, x='Age', y='Pclass', hue='Survived')

In [11]:
# Age against survival, coloured by passenger class.
sns.scatterplot(data=df, x='Age', y='Survived', hue='Pclass')

In [12]:
# Survival counts split by passenger class. Columns are passed by keyword with
# data=df: seaborn >= 0.12 treats the first positional argument as `data`, so
# the original positional Series was no longer interpreted as x.
sns.countplot(data=df, x='Survived', hue='Pclass')

In [13]:
# (rows, columns) of the raw frame.
df.shape

**creating a function to calculate the percentage of null values in each column**

In [14]:
def null_counter(df):
    """Return the percentage of missing values in each column of ``df``.

    For every column, the number of missing entries is divided by the number
    of rows and multiplied by 100. The result is indexed by column name.
    """
    missing_per_column = df.isna().sum()
    row_count = len(df)
    return missing_per_column / row_count * 100

In [15]:
# Percentage of missing values per column, displayed as the cell output.
null = null_counter(df)
null

#### dropping columns

In [16]:
# Remove the columns that are not fed to the model as features.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

#### fill the `Age` column's `NaN` values with the mean age

In [17]:
# Replace missing ages with the mean of the Age column.
# NOTE(review): the mean is taken over every row, before the train/test split
# further down, so held-out rows influence the imputed value — consider
# imputing inside the fitted transformer instead.
df['Age'] = df['Age'].fillna(df['Age'].mean())

drop rows that still contain `missing values`

In [18]:
# Drop every row that still has a missing value in any remaining column.
df = df.dropna()

In [19]:
# Preview the cleaned frame.
df.head()

### preprocessing Data

In [20]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [21]:
# Numeric columns are rescaled with MinMaxScaler; the two categorical columns
# are one-hot encoded, with categories unseen during fit ignored at transform time.
numeric_features = ['Fare', 'Parch', 'SibSp', 'Pclass', 'Age']
categorical_features = ['Sex', 'Embarked']

transformer = make_column_transformer(
    (MinMaxScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
)

**X and y**

In [22]:
# Target is the `Survived` column; every other column is a feature.
y = df['Survived']
X = df.drop(columns=['Survived'])

**train-test-split**

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [24]:
# Fit the scaler and encoder on the training features only.
transformer.fit(train_x)

In [25]:
# Apply the fitted transformer to both splits.
# NOTE(review): both names are rebound from the split features to the
# transformer output, so re-running this cell feeds already-transformed data
# back into transform() — re-run the split cell first.
train_x = transformer.transform(train_x)
test_x = transformer.transform(test_x)

## Build, train and evaluate Model

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

### creating models with different values of k to find the optimal number of neighbors

In [27]:
# Fit one KNN classifier per k in 1..9 and record its accuracy on both splits.
# NOTE(review): k appears to be picked later from these test-set scores, which
# would make the reported test accuracy optimistic — consider cross-validation.
train_scores, test_scores = [], []
for k in range(1, 10):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(train_x, train_y)

    train_pred = neigh.predict(train_x)
    test_pred = neigh.predict(test_x)

    train_scores.append(metrics.accuracy_score(train_y, train_pred))
    test_scores.append(metrics.accuracy_score(test_y, test_pred))

In [28]:
# Training accuracy per k, in the order the loop produced them (k = 1 first).
train_scores

In [29]:
# Test accuracy per k, in the order the loop produced them (k = 1 first).
test_scores

In [30]:
# Tabulate both accuracy lists against the k that produced them. The lists were
# filled for k = 1, 2, ... in order, so label the rows with k instead of the
# default 0-based index, which reads as if the first row were k=0.
scores_dict = {
    'Train Score': train_scores,
    'Test Score': test_scores
}

pd.DataFrame(
    scores_dict,
    index=pd.RangeIndex(start=1, stop=len(train_scores) + 1, name='k'),
)

In [31]:
# Plot accuracy against k for both splits. The original train-vs-test scatter
# dropped k entirely, so the best number of neighbors could not be read off it.
# Scores were appended for k = 1, 2, ... in order, so position i maps to k = i + 1.
k_values = range(1, len(train_scores) + 1)
fig, ax = plt.subplots()
ax.plot(k_values, train_scores, marker='o', color='b', label='train score')
ax.plot(k_values, test_scores, marker='o', color='r', label='test score')
ax.set_xlabel('k (number of neighbors)')
ax.set_ylabel('accuracy')
ax.set_xticks(list(k_values))
ax.legend();

### Final Model

In [33]:
# Refit with k=4 and keep the predictions for both splits.
final_model = KNeighborsClassifier(n_neighbors=4).fit(train_x, train_y)
yhat_train_final = final_model.predict(train_x)
yhat_final = final_model.predict(test_x)

#### evaluating **`Final Model`**

In [34]:
# Accuracy of the final model on the held-out test split.
metrics.accuracy_score(y_true=test_y, y_pred=yhat_final)

In [35]:
# Recall of the final model on the held-out test split.
metrics.recall_score(y_true=test_y, y_pred=yhat_final)

In [37]:
# Precision of the final model on the held-out test split.
metrics.precision_score(y_true=test_y, y_pred=yhat_final)

In [39]:
from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve

In [40]:
plot_confusion_matrix(final_model, test_x, test_y)

In [41]:
plot_precision_recall_curve(final_model, test_x, test_y)

In [42]:
plot_roc_curve(final_model, test_x, test_y)