In [None]:
cd /kaggle/input/titanic

In [None]:
#Importing required libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import warnings 
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
#from mlxtend.plotting import plot_learning_curves
import plotly.express as px
import statsmodels.api as sm

In [None]:
# Load the training set and turn the placeholder string 'unknown' into NaN.
titanic = pd.read_csv("train.csv").replace(['unknown'], np.nan)
titanic  # display the loaded frame

In [None]:
#checking the dimensions of data
titanic.shape

In [None]:
#charecteristics of the data
titanic.info()

In [None]:
# converting the data types of columns
titanic = titanic.astype({"PassengerId":"category","Survived":"category","Pclass":"category","Name":"category","Ticket":"category","Cabin":"category","Sex":"category","Embarked":"category"})
titanic.dtypes

In [None]:
#describing the data
titanic.describe().T

In [None]:
# % of missing values for all columns
percent_missing = titanic.isnull().sum() * 100 / len(titanic)
percent_missing.round(2)

In [None]:
#checking for duplicated records
titanic.duplicated().sum()

In [None]:
# frequency for each level within the target variable
titanic.Survived.value_counts(normalize =True)*100

In [None]:
# countplot for survived(target variable)
sns.countplot(x='Survived', hue='Survived', data=titanic)

In [None]:
# countplot for pclass
sns.countplot(x='Pclass', hue='Survived', data=titanic)

In [None]:
# countplot for Sex
sns.countplot(x='Sex', hue='Survived', data=titanic)

In [None]:
# distribution plot for age: histogram with a KDE overlay on a density scale.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with kde=True / stat="density" is its documented replacement.
sns.histplot(titanic["Age"], kde=True, stat="density")

In [None]:
# pie plot for SibSp
titanic.groupby(['SibSp']).count().plot(kind='pie', y='Survived', autopct='%1.0f%%',figsize=(20,10))

In [None]:
# pie plot for Parch
titanic.groupby(['Parch']).count().plot(kind='pie', y='Survived', autopct='%1.0f%%',figsize=(20,10))

In [None]:
# Dropping the unneccessary columns
titanic = titanic.drop(["PassengerId","Name","Ticket","Cabin","Fare"], axis = 1)

In [None]:
# After dropping
titanic

In [None]:
cat_cols = ["Pclass","Sex","Embarked"]
num_cols = ["Age","SibSp","Parch"]

In [None]:
print(cat_cols)
print(num_cols)

In [None]:
#boxplot for all the numerical attributes in dataframe (other type)
titanic.boxplot(num_cols, grid = False)

In [None]:
# dividing the independent variables and depandent variable
X = titanic.drop(["Survived"], axis = 1)
y = titanic["Survived"]

In [None]:
# reading X
X

In [None]:
# reading y
y

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split, stratified on y, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=107,
)

In [None]:
# Report the shape of each split; every label names the object actually printed
# (the original labels called X_test "X_val" and mislabelled both y splits).
print("X_train is:", X_train.shape)
print("X_test is:", X_test.shape)
print("y_train is:", y_train.shape)
print("y_test is:", y_test.shape)

In [None]:
# checking value_counts whether y_train is splitted according to y( balanced or imbalanced)
y_train.value_counts(normalize = True)

In [None]:
# checking value_counts whether y_test is splitted according to y( balanced or imbalanced)
y_test.value_counts(normalize = True)

In [None]:
# checking for null values
titanic.isnull().sum()

In [None]:
# importing Simple Imputer to impute null values
from sklearn.impute import SimpleImputer

In [None]:
# fitting the X_train[num_cols] in imputer model
num_cols_imputer=SimpleImputer(strategy='median')
num_cols_imputer = num_cols_imputer.fit(X_train[num_cols])

In [None]:
# fitting the X_train[num_cols] and X_test[num_cols] in imputer model
X_train[num_cols] = num_cols_imputer.transform(X_train[num_cols])
X_test[num_cols] = num_cols_imputer.transform(X_test[num_cols])

In [None]:
# checking null values after imputing X_train
X_train.isnull().sum()

In [None]:
# checking null values after imputing X_test
X_test.isnull().sum()

In [None]:
# fitting the X_train[cat_cols] in imputer model
cat_cols_imputer=SimpleImputer(strategy='most_frequent')
cat_cols_imputer = cat_cols_imputer.fit(X_train[cat_cols])

In [None]:
# fitting the X_train[cat_cols] and X_test[cat_cols] in imputer model
X_train[cat_cols] = cat_cols_imputer.transform(X_train[cat_cols])
X_test[cat_cols] = cat_cols_imputer.transform(X_test[cat_cols])

In [None]:
# checking null values after imputing X_train
X_train.isnull().sum()

In [None]:
# checking null values after imputing X_test
X_test.isnull().sum()

In [None]:
# fitting the X_train[num_cols] in standardization model
scaler = StandardScaler()
scaler = scaler.fit(X_train[num_cols])

In [None]:
# fitting the X_train[num_cols] and X_test[num_cols] in standardization model
X_train_std = pd.DataFrame(scaler.transform(X_train[num_cols]), columns = X_train[num_cols].columns)
X_test_std = pd.DataFrame(scaler.transform(X_test[num_cols]), columns = X_test[num_cols].columns)

In [None]:
# reading X_train_std after standardization
X_train_std

In [None]:
# reading X_test_std after standardization
X_test_std

In [None]:
# resetting the index to concat correctly.
# drop=True discards the old index in a single step, so no helper "index"
# column is created (and later dropped) and re-running this cell is harmless.
X_train_std = X_train_std.reset_index(drop=True)
X_train_std

In [None]:
# resetting the index to concat correctly.
# drop=True discards the old index in a single step, so no helper "index"
# column is created (and later dropped) and re-running this cell is harmless.
X_test_std = X_test_std.reset_index(drop=True)
X_test_std

In [None]:
# creationg dummies for cat_cols from X_train
X_train_dummies = pd.get_dummies(X_train[cat_cols],drop_first = True)

In [None]:
# reading X_train_dummies
X_train_dummies

In [None]:
# creationg dummies for cat_cols from X_test
X_test_dummies = pd.get_dummies(X_test[cat_cols],drop_first = True)

In [None]:
# reading X_test_dummies
X_test_dummies

In [None]:
# resetting the index to concat correctly.
# drop=True discards the old (shuffled) index in a single step, so no helper
# "index" column is created (and later dropped) and re-running this cell is harmless.
X_train_dummies = X_train_dummies.reset_index(drop=True)
X_train_dummies

In [None]:
# resetting the index to concat correctly.
# drop=True discards the old (shuffled) index in a single step, so no helper
# "index" column is created (and later dropped) and re-running this cell is harmless.
X_test_dummies = X_test_dummies.reset_index(drop=True)
X_test_dummies

In [None]:
# Concating the X_train_std, X_train_dummies as final_X_train_data
final_X_train_data = pd.concat([X_train_std,X_train_dummies], axis = 1)

In [None]:
# reading final_X_train_data
final_X_train_data

In [None]:
# Concating the X_test_std, X_test_dummies as final_X_test_data
final_X_test_data = pd.concat([X_test_std,X_test_dummies], axis = 1)

In [None]:
# reading final_X_test_data
final_X_test_data

In [None]:
# checking for null values for final_X_train_data (to check whether nulls is formed or not after concating)
final_X_train_data.isnull().sum()

In [None]:
# checking for null values for final_X_test_data (to check whether nulls is formed or not after concating)
final_X_test_data.isnull().sum()

# MODEL BUILDING

## SVM

In [None]:
# model building by SVM algorithm
svm_model_1 = SVC(kernel="linear", C = 0.01)
svm_model_1 = svm_model_1.fit(final_X_train_data,y_train)
svm_model_1

In [None]:
# predicting the y value for the train data
svm_model_1_y_train_preds = svm_model_1.predict(final_X_train_data)
svm_model_1_y_train_preds[0:10]

In [None]:
# predicting the y value for the test data
svm_model_1_y_test_preds = svm_model_1.predict(final_X_test_data)
svm_model_1_y_test_preds[0:10]

In [None]:
# import libraries of confusion_matrix and accuracy and recall
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,recall_score

In [None]:
# finding the confusion_matrix for y_train actuals and y_train_predicts
confusionmatrix = confusion_matrix(y_train,svm_model_1_y_train_preds)
confusionmatrix

In [None]:
# finding the confusion_matrix for y_test actuals and y_test_predicts
confusionmatrix = confusion_matrix(y_test,svm_model_1_y_test_preds)
confusionmatrix

In [None]:
# finding the accuracy for y_train_actuals and y_train_predicts
accuracy_score(y_train,svm_model_1_y_train_preds)

In [None]:
# finding the accuracy for y_test_actuals and y_test_predicts
accuracy_score(y_test,svm_model_1_y_test_preds)

## DECISION TREE

In [None]:
# model building by Decision Tree algorithm (gini criterion, depth limited to 4).
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features when searching for splits; without a seed, ties between
# equally good splits can be resolved differently from run to run.
tree_model = DecisionTreeClassifier(max_depth=4, criterion="gini", random_state=107)
tree_model = tree_model.fit(final_X_train_data, y_train)
tree_model

In [None]:
# predicting the y value for the train data
tree_model_y_train_preds = tree_model.predict(final_X_train_data)
tree_model_y_train_preds.shape

In [None]:
# predicting the y value for the test data
tree_model_y_test_preds = tree_model.predict(final_X_test_data)
tree_model_y_test_preds[0:10]

In [None]:
# finding the confusion_matrix for y_train actuals and y_train_predicts
confusionmatrix = confusion_matrix(y_train,tree_model_y_train_preds)
confusionmatrix

In [None]:
# finding the confusion_matrix for y_test_actuals and y_test_predicts
confusionmatrix = confusion_matrix(y_test,tree_model_y_test_preds)
confusionmatrix

In [None]:
# finding the accuracy for y_train_actuals and y_train_predicts
accuracy_score(y_train,tree_model_y_train_preds)

In [None]:
# finding the accuracy for y_test_actuals and y_test_predicts
accuracy_score(y_test,tree_model_y_test_preds)

## RANDOM FOREST

In [None]:
# model building by Random Forest algorithm (30 trees, depth limited to 4, gini).
# random_state is fixed (same seed as the train/test split): the forest draws
# bootstrap samples and feature subsets at random, so without a seed every run
# trains a different model and reports different scores.
rfc_model_1 = RandomForestClassifier(n_estimators=30, max_depth=4, criterion="gini", random_state=107)
rfc_model_1 = rfc_model_1.fit(final_X_train_data, y_train)
rfc_model_1

In [None]:
# predicting the y value for the train data
rfc_model_1_y_train_preds = rfc_model_1.predict(final_X_train_data)
rfc_model_1_y_train_preds[0:10]

In [None]:
# predicting the y value for the test data
rfc_model_1_y_test_preds = rfc_model_1.predict(final_X_test_data)
rfc_model_1_y_test_preds[0:10]

In [None]:
# finding the confusion_matrix for y_train actuals and y_train_predicts
confusionmatrix = confusion_matrix(y_train,rfc_model_1_y_train_preds)
confusionmatrix

In [None]:
# finding the confusion_matrix for y_test_actuals and y_test_predicts
confusionmatrix = confusion_matrix(y_test,rfc_model_1_y_test_preds)
confusionmatrix

In [None]:
# finding the accuracy for y_train_actuals and y_train_predicts
accuracy_score(y_train,rfc_model_1_y_train_preds)

In [None]:
# finding the accuracy for y_test_actuals and y_test_predicts
accuracy_score(y_test,rfc_model_1_y_test_preds)

## TESTING

In [None]:
titanic_2 = pd.read_csv("test.csv")    # loading the data    
titanic_2 = titanic_2.replace(['unknown'],np.nan)   # replacing the null values with NAN
titanic_2 

In [None]:
#checking the dimensions of data
titanic_2.shape

In [None]:
#charecteristics of the data
titanic_2.info()

In [None]:
# converting the data types of columns
titanic_2 = titanic_2.astype({"PassengerId":"category","Pclass":"category","Name":"category","Ticket":"category","Cabin":"category","Sex":"category","Embarked":"category"})
titanic_2.dtypes

In [None]:
# describing the submission data (titanic_2); the original cell described the
# training frame `titanic` here, a copy-paste slip in this section.
titanic_2.describe().T

In [None]:
#checking for null values
titanic_2.isnull().sum()

In [None]:
#checking for duplicated records
titanic_2.duplicated().sum()

In [None]:
# distribution plot for age: histogram with a KDE overlay on a density scale.
# sns.distplot is deprecated and scheduled for removal from seaborn; histplot
# with kde=True / stat="density" is its documented replacement.
sns.histplot(titanic_2["Age"], kde=True, stat="density")

In [None]:
# pie plot for SibSp
titanic_2.groupby(['Pclass']).count().plot(kind='pie', y='SibSp', autopct='%1.0f%%',figsize=(20,10))

In [None]:
# pie plot for SibSp
titanic_2.groupby(['Sex']).count().plot(kind='pie', y='Pclass', autopct='%1.0f%%',figsize=(20,10))

In [None]:
# filtering the PassengerId column
labels = titanic_2.filter(["PassengerId"], axis =1)
labels

In [None]:
# Dropping the unneccessary columns
titanic_2 = titanic_2.drop(["PassengerId","Name","Ticket","Cabin","Fare"], axis = 1)

In [None]:
# After dropping
titanic_2

In [None]:
# seperating the categorical and numerical columns
cat_cols = ["Pclass","Sex","Embarked"]
num_cols = ["Age","SibSp","Parch"]

In [None]:
# printing the categorical amd numerical columns
print(cat_cols)
print(num_cols)

In [None]:
# checking for null values
titanic_2.isnull().sum()

In [None]:
# fitting the titanic_2[num_cols] and titanic_2[num_cols] in imputer model
titanic_2[num_cols] = num_cols_imputer.transform(titanic_2[num_cols])

In [None]:
# checking null values after imputing X_train
titanic_2.isnull().sum()

In [None]:
# fitting the titanic[num_cols] and titanic_2[num_cols] in standardization model
titanic_2_std = pd.DataFrame(scaler.transform(titanic_2[num_cols]), columns = titanic_2[num_cols].columns)

In [None]:
# resetting the index to concat correctly, then reading titanic_2_std.
# drop=True discards the old index in a single step, so no helper "index"
# column is created (and later dropped) and re-running this cell is harmless.
titanic_2_std = titanic_2_std.reset_index(drop=True)
titanic_2_std

In [None]:
# creationg dummies for cat_cols from titanic_2
titanic_2_dummies = pd.get_dummies(titanic_2[cat_cols],drop_first = True)

In [None]:
# reading titanic_2_dummies
titanic_2_dummies

In [None]:
# resetting the index to concat correctly.
# drop=True discards the old index in a single step, so no helper "index"
# column is created (and later dropped) and re-running this cell is harmless.
titanic_2_dummies = titanic_2_dummies.reset_index(drop=True)
titanic_2_dummies

In [None]:
# Concating the titanic_2_std, titanic_2_dummies as final_titanic_2
final_titanic_2 = pd.concat([titanic_2_std,titanic_2_dummies], axis = 1)

In [None]:
# calling the new data(final_titanic_2)
final_titanic_2

In [None]:
# checking for null values for final_X_train_data (to check whether nulls is formed or not after concating)
final_titanic_2.isnull().sum()

In [None]:
# predicting the y value for the new data(final_tiatnic_2)
y_new_preds = svm_model_1.predict(final_titanic_2)
y_new_preds[0:10]

In [None]:
# checking the length for predicted column
len(y_new_preds)

In [None]:
# combing the predicted values with the passangerId which is filtered at starting
labels["Survived"] = y_new_preds
labels["Survived"][0:10]

In [None]:
# setting the passangerId as the index
labels = labels.set_index(["PassengerId"])
labels

In [None]:
# converting the final data i.e passangerId and surevived(predicts) into CSV file
labels.to_csv("/kaggle/working/submission.csv")