In [None]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, entry) for entry in filenames]
    for full_path in full_paths:
        print(full_path)


In [None]:
# Read the training split of the Titanic dataset into a DataFrame.
train_csv_path = '/kaggle/input/titanic/train.csv'
titanic = pd.read_csv(train_csv_path)

In [None]:
# Preview the first five rows of the freshly loaded training data.
titanic.head(n=5)

In [None]:
# Columns fed to the model, grouped by how they are preprocessed.
numeric_features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
categoric_features = ["Sex", "Ticket", "Embarked"]

# Full feature list: numeric columns first, categorical columns after.
features = numeric_features + categoric_features

# Name of the target column.
label = "Survived"

In [None]:
# Remove Name and Cabin, which are not part of the feature lists used for modelling.
# errors="ignore" keeps the cell re-runnable: without it, executing the cell a second
# time raises KeyError because the columns have already been dropped from `titanic`.
titanic = titanic.drop(columns=["Name", "Cabin"], errors="ignore")

In [None]:
# Count the missing values in each column of the training data.
titanic.isna().sum()

In [None]:
# Replace missing ages with the mean age of the training data.
# The result is assigned back to the column rather than using fillna(inplace=True) on
# titanic["Age"]: that form is chained assignment, which emits a FutureWarning on
# pandas 2.x and leaves `titanic` unchanged once Copy-on-Write is enabled.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].mean())

In [None]:
# Discard every row that still contains at least one missing value.
titanic = titanic.dropna(axis=0, how="any")

In [None]:
# Positional indices of the feature columns in the NumPy matrix that is built from
# numeric_features + categoric_features: numeric columns come first, categorical after.
# Deriving them from the lists (instead of hard-coding [0..4] and [5..7]) keeps the
# indices correct if a feature is ever added to or removed from either list.
numerical = list(range(len(numeric_features)))
categorical = list(range(len(numeric_features),
                         len(numeric_features) + len(categoric_features)))

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

# Draw one box plot per numeric feature, with the values grouped by the survival label.
for feature in numeric_features:
    titanic.boxplot(by=label, column=feature, figsize=(6, 6))
    plt.title(feature)
plt.show()

In [None]:
# Draw one bar chart per categorical feature showing how often each category occurs.
for feature in categoric_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    # Sort by category value so the bars appear in a stable order.
    frequencies = titanic[feature].value_counts().sort_index()
    frequencies.plot.bar(ax=ax, color='steelblue')
    ax.set(title=feature + ' counts', xlabel=feature, ylabel="Frequency")
plt.show()

In [None]:
# Draw one box plot of the label per categorical feature, grouped by category.
for feature in categoric_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    titanic.boxplot(by=feature, column=label, ax=ax)
    # Set after plotting so these override the title pandas puts on the axes.
    ax.set(title='Label by ' + feature, ylabel="Survived")
plt.show()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

In [None]:
# Preprocessing for numeric columns: a single standard-scaling step.
scaling_step = ('scaler', StandardScaler())
numeric_transformer = Pipeline(steps=[scaling_step])

In [None]:
# Preprocessing for categorical columns: one-hot encoding. handle_unknown='ignore'
# lets transform() accept categories that were not present when the encoder was fitted.
encoding_step = ('onehot', OneHotEncoder(handle_unknown='ignore'))
categorical_transformer = Pipeline(steps=[encoding_step])

In [None]:
# Route each group of columns (addressed by positional index) to its own transformer.
column_routes = [
    ('num', numeric_transformer, numerical),
    ('cat', categorical_transformer, categorical),
]
preprocessor = ColumnTransformer(transformers=column_routes)


In [None]:
# Regularisation rate; LogisticRegression receives its inverse as C.
reg = 0.01
logistic_estimator = LogisticRegression(C=1/reg, solver="liblinear")

# Baseline model: preprocessing followed by logistic regression.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('logregressor', logistic_estimator),
])

In [None]:
# Second candidate model: the same preprocessing recipe followed by a gradient-boosting
# classifier (100 depth-1 trees, learning rate 1.0, fixed random_state for repeatability).
#
# Pipeline.fit fits its steps in place, so passing the very same `preprocessor` object
# that `pipeline` already uses would make both pipelines share one fitted transformer:
# refitting either pipeline would silently change the other. clone() gives this pipeline
# its own unfitted copy with identical settings.
from sklearn.base import clone

# The step name 'logregressor' is kept unchanged so any existing lookups by step name
# keep working, even though the estimator here is not a logistic regression.
new_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                           ('logregressor', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0))])

In [None]:
# Feature matrix (numeric columns first, then categorical) and target vector as NumPy arrays.
model_columns = numeric_features + categoric_features
X = titanic[model_columns].to_numpy()
y = titanic[label].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; random_state=0 makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.30, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows ended up in each part.
print ('Training cases: %d\nTest cases: %d' % (len(X_train), len(X_test)))

In [None]:
# Train the logistic-regression pipeline on the training split.
# fit() returns the pipeline itself, so `model` is the same (now fitted) object.
pipeline.fit(X_train, y_train)
model = pipeline
print(model)

In [None]:
# Train the gradient-boosting pipeline on the same training split.
# fit() returns the pipeline itself, so `new_model` is the same (now fitted) object.
new_pipeline.fit(X_train, y_train)
new_model = new_pipeline

In [None]:
# Predict labels for the held-out split with the fitted logistic-regression pipeline.
predictions = model.predict(X_test)

In [None]:
# Predict labels for the held-out split with the fitted gradient-boosting pipeline.
new_predictions = new_model.predict(X_test)

In [None]:
# Show the logistic-regression predictions next to the true labels of the held-out split.
for caption, values in (('Predicted labels: ', predictions),
                        ('Actual labels:    ', y_test)):
    print(caption, values)

In [None]:
# Show the gradient-boosting predictions next to the true labels of the held-out split.
for caption, values in (('Predicted labels: ', new_predictions),
                        ('Actual labels:    ', y_test)):
    print(caption, values)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows the logistic-regression pipeline labelled correctly.
logistic_accuracy = accuracy_score(y_test, predictions)
print('Accuracy: ', logistic_accuracy)

In [None]:
# Fraction of held-out rows the gradient-boosting pipeline labelled correctly.
boosting_accuracy = accuracy_score(y_test, new_predictions)
print('Accuracy: ', boosting_accuracy)

In [None]:
from sklearn. metrics import classification_report

# Per-class precision, recall and F1 for the logistic-regression predictions.
report_text = classification_report(y_test, predictions)
print(report_text)

In [None]:

from sklearn.metrics import precision_score, recall_score

# Precision and recall of the logistic-regression predictions on the held-out split.
overall_precision = precision_score(y_test, predictions)
overall_recall = recall_score(y_test, predictions)
print("Overall Precision:", overall_precision)
print("Overall Recall:", overall_recall)

In [None]:
import joblib

# Persist the fitted logistic-regression pipeline (`model`) to the Kaggle working directory.
filename = '/kaggle/working/titanic.pkl'
joblib.dump(value=model, filename=filename)

In [None]:
# Read the competition's test split into a DataFrame.
test_csv_path = '/kaggle/input/titanic/test.csv'
test_data = pd.read_csv(test_csv_path)

In [None]:
# Preview the first five rows of the test data.
test_data.head(n=5)

In [None]:
# Remove Name and Cabin from the test data, mirroring what was done to the training data.
# errors="ignore" keeps the cell re-runnable: without it, executing the cell a second
# time raises KeyError because the columns have already been dropped from `test_data`.
test_data = test_data.drop(columns=["Name", "Cabin"], errors="ignore")

In [None]:
# Count the missing values in each column of the test data.
test_data.isna().sum()

In [None]:
# Replace missing ages in the test data with the test data's own mean age.
# The result is assigned back to the column rather than using fillna(inplace=True) on
# test_data["Age"]: that form is chained assignment, which emits a FutureWarning on
# pandas 2.x and leaves `test_data` unchanged once Copy-on-Write is enabled.
# NOTE(review): the fill value comes from the test set, not the training set — confirm
# that this is intended.
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].mean())

In [None]:
# Replace missing fares in the test data with the test data's own mean fare.
# The result is assigned back to the column rather than using fillna(inplace=True) on
# test_data["Fare"]: that form is chained assignment, which emits a FutureWarning on
# pandas 2.x and leaves `test_data` unchanged once Copy-on-Write is enabled.
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].mean())

In [None]:
# Display the training feature matrix.
# NOTE(review): presumably a leftover sanity check of the column layout before test_X
# is built the same way — confirm, or remove the cell.
X_train

In [None]:
# Build the test feature matrix with the same column order used for training.
submission_columns_in = numeric_features + categoric_features
test_X = test_data[submission_columns_in].to_numpy()

In [None]:
# Predict labels for the competition test set with the logistic-regression pipeline
# (`model`); the gradient-boosting pipeline (`new_model`) is not used here.
predict_val = model.predict(test_X)

In [None]:
# Pair every passenger id with its predicted label, row by row.
# predict_val is a positional array, whereas DataFrame.iterrows() yields index *labels*;
# using those labels as positions only works while test_data has a default 0..n-1 index.
# Zipping the id column with the predictions aligns them by position regardless of the
# index, and avoids building a Series for every row.
result = [[passenger_id, prediction]
          for passenger_id, prediction in zip(test_data["PassengerId"], predict_val)]

In [None]:
# Turn the (passenger id, prediction) pairs into a two-column DataFrame.
output_columns = ["PassengerId", "Survived"]
df = pd.DataFrame(data=result, columns=output_columns)

In [None]:
# Write the predictions to the Kaggle working directory, without the DataFrame index.
df.to_csv(path_or_buf='/kaggle/working/output.csv', index=False)