In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Columns removed from both frames right after loading.
DROPPED_COLUMNS = ['PassengerId', 'Name', 'Ticket']

train_data = pd.read_csv("/kaggle/input/titanic/train.csv").drop(columns=DROPPED_COLUMNS)
test_data = pd.read_csv("/kaggle/input/titanic/test.csv").drop(columns=DROPPED_COLUMNS)

# Summary (dtypes / non-null counts) of the remaining training columns.
train_data.info()

In [None]:
# Convert the free-text 'Cabin' column into an integer 'Deck' code, then drop 'Cabin'.

# Letter -> integer code. 'U' is the letter of the 'U0' placeholder used for missing
# cabins; it shares code 8 with 'T'.
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8, "T": 8}


def _cabin_to_deck(cabin, codes):
    """Return the integer deck code for each entry of a 'Cabin' column.

    Parameters
    ----------
    cabin : pandas.Series
        Raw cabin strings; missing values are treated as 'U0'.
    codes : dict
        Mapping from the leading letter run of the cabin string to an int code.

    Returns
    -------
    pandas.Series
        Integer codes aligned with ``cabin``.

    Raises
    ------
    ValueError
        If a cabin's letter prefix is not a key of ``codes`` (the mapped value is
        NaN and cannot be cast to int).
    """
    # Assign the filled result instead of calling fillna(inplace=True) on a column
    # selection: the latter operates on a temporary under pandas Copy-on-Write and
    # leaves the NaNs in the frame.
    filled = cabin.fillna('U0')
    # expand=False yields a Series holding the first run of letters of each entry.
    letters = filled.str.extract('([A-Za-z]+)', expand=False)
    return letters.map(codes).astype(int)


test_data['Deck'] = _cabin_to_deck(test_data['Cabin'], deck)
train_data['Deck'] = _cabin_to_deck(train_data['Cabin'], deck)

train_data = train_data.drop(columns='Cabin')
test_data = test_data.drop(columns='Cabin')

In [None]:
# A combined relatives feature ('Rel' = SibSp + Parch) is not needed, so both source
# columns are removed from the train and test frames without replacement.
# train_data['Rel'] = train_data['SibSp']+train_data['Parch']
# test_data['Rel'] = test_data['SibSp']+test_data['Parch']
FAMILY_COLUMNS = ['SibSp', 'Parch']
train_data = train_data.drop(columns=FAMILY_COLUMNS)
test_data = test_data.drop(columns=FAMILY_COLUMNS)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlate numeric columns only. The frame may still hold string columns at this
# point, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older
# versions silently skipped such columns, so the plotted matrix is unchanged there).
corrMatrix = train_data.select_dtypes(include='number').corr()
sns.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
# Split the frames into NumPy arrays by column position.
# Column 0 of train_data is the target (presumably 'Survived' - verify against the
# CSV); every remaining column is a feature. test_data has no target column.
# The target is taken as a 1-D array of shape (n_samples,): scikit-learn estimators
# expect 1-D labels and emit a DataConversionWarning for an (n_samples, 1) column.
y_train = train_data.iloc[:, 0].values
x_train = train_data.iloc[:, 1:].values
x_test = test_data.iloc[:, :].values

In [None]:
from sklearn import preprocessing

# Integer-encode feature column 1 (presumably 'Sex' - verify against the column
# order). The label mapping is learned from the training rows and then re-applied
# unchanged to the test rows.
LABEL_ENCODED_COL = 1
le = preprocessing.LabelEncoder()
x_train[:, LABEL_ENCODED_COL] = le.fit_transform(x_train[:, LABEL_ENCODED_COL])
x_test[:, LABEL_ENCODED_COL] = le.transform(x_test[:, LABEL_ENCODED_COL])

In [None]:
from sklearn.impute import KNNImputer, SimpleImputer

# Each imputer is fitted on the training column and then applied to the matching
# test column. Width-1 slices keep the columns 2-D, as the imputers require.
AGE, FARE, EMBARKED = slice(2, 3), slice(3, 4), slice(4, 5)

# Age: k-nearest-neighbours imputation.
# NOTE(review): the imputer only ever sees the age column itself, so no other
# feature can inform the neighbours - confirm this is intended.
knn1 = KNNImputer(n_neighbors=5, weights='uniform')
x_train[:, AGE] = knn1.fit_transform(x_train[:, AGE])
x_test[:, AGE] = knn1.transform(x_test[:, AGE])

# Fare: replace missing values with the training mean.
si1 = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train[:, FARE] = si1.fit_transform(x_train[:, FARE])
x_test[:, FARE] = si1.transform(x_test[:, FARE])

# Embarked: replace missing values with the most frequent training value.
si2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x_train[:, EMBARKED] = si2.fit_transform(x_train[:, EMBARKED])
x_test[:, EMBARKED] = si2.transform(x_test[:, EMBARKED])


In [None]:
# Standardise the fare column (index 3): the scaler is fitted on the training rows
# and the same fitted scaler is then applied to the test rows.
from sklearn.preprocessing import StandardScaler

FARE_SLICE = slice(3, 4)
sc = StandardScaler()
x_train[:, FARE_SLICE] = sc.fit_transform(x_train[:, FARE_SLICE])
x_test[:, FARE_SLICE] = sc.transform(x_test[:, FARE_SLICE])

In [None]:
# One-hot encode the embarked column (index 4); all other columns pass through.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

EMBARKED_SLICE = slice(4, 5)
# Cast the training embarked values to str before they reach the encoder.
x_train[:, EMBARKED_SLICE] = x_train[:, EMBARKED_SLICE].astype('str')

embarked_step = ('encoder', OneHotEncoder(), [4])
ct = ColumnTransformer([embarked_step], remainder='passthrough')
# Fit on the training matrix only, then apply the fitted transformer to the test matrix.
x_train = ct.fit_transform(x_train)
x_test = ct.transform(x_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with fixed hyperparameters. random_state pins the bootstrap samples
# and per-split feature subsets so that re-running the notebook reproduces the same
# model, metrics and submission.
classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=80,
    max_features=2,
    min_samples_leaf=4,
    min_samples_split=8,
    random_state=42,
)

# np.ravel flattens the labels to 1-D, the shape fit() expects; it is a no-op when
# y_train is already 1-D.
classifier.fit(x_train, np.ravel(y_train))


In [None]:
# Predictions of the fitted classifier on the same rows it was trained on.
pred_train = classifier.predict(X=x_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Reuse the training-set predictions from the previous cell instead of running the
# forest over x_train a second time; y_pred is kept as an alias of that result.
y_pred = pred_train

# NOTE: both figures are computed on the training rows themselves, so they measure
# fit to the training data, not generalisation.
cm = confusion_matrix(y_train, y_pred)
print(cm)
accuracy_score(y_train, y_pred)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the training data; report the mean
# and spread of the per-fold scores as percentages.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

In [None]:
# Feature importances as percentages, keyed by a display label. Labels are listed in
# the positional order of the classifier's input columns (index 0 first).
_importance_labels = [
    "embarked1", "embarked2", "embarked3",
    "class", "gender", "Age", "Fare", "Deck",
]
_importances = classifier.feature_importances_
imp = {
    label: _importances[position] * 100
    for position, label in enumerate(_importance_labels)
}

In [None]:
# Bar chart of the importance percentages: one bar per label, in dict order.
plt.figure(figsize=(15, 7))
bar_labels, bar_heights = zip(*imp.items())
plt.bar(bar_labels, bar_heights)

In [None]:
# Point plot of Fare per passenger class.
# sns.factorplot was renamed to catplot and later removed from seaborn, and x/y must
# now be passed by keyword. kind='point' reproduces factorplot's default plot type
# (catplot's own default is a strip plot).
sns.catplot(x='Pclass', y='Fare', data=train_data, kind='point', aspect=2)

In [None]:
# Hyperparameter tuning with GridSearchCV (commented out; not executed)
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestClassifier
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100,110,120,130]
# }
# rf = RandomForestClassifier()
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 2)

# grid_search.fit(x_train, y_train)
# grid_search.best_params_
# best_grid = grid_search.best_estimator_
# best_grid

In [None]:
# Predicted labels for the test matrix; displayed as the cell output.
results = classifier.predict(X=x_test)
results

In [None]:
# Load the sample submission file and display it.
SAMPLE_SUBMISSION_CSV = "/kaggle/input/titanic/gender_submission.csv"
result_data = pd.read_csv(SAMPLE_SUBMISSION_CSV)
result_data

In [None]:
# Pair the PassengerId column of result_data with the predictions (row order is
# assumed to match) and write the two-column submission file without the index.
output = result_data[['PassengerId']].assign(Survived=results)
output.to_csv('my_submission.csv', index=False)