## IMPORTING LIBRARIES

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import dtale 
import matplotlib.pyplot as plt 
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# Import from the public package path; the private `gradient_boosting` submodule is not
# importable on newer scikit-learn releases.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import catboost as cb 

### LOADING DATASET

In [None]:
# Read both CSV splits from the working directory in one pass.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Hold on to the test passenger ids; they are needed again when the submission is built.
df = test['PassengerId']

train.head()

In [None]:
# Show the (rows, columns) dimensions of the training frame.
train.shape

In [None]:
# Print column dtypes and non-null counts for the training frame.
train.info()

### HEATMAP

In [None]:
# Annotated correlation matrix of the training features on a 10x10 inch canvas.
plt.subplots(figsize=(10,10))
# Correlate numeric columns only: the frame still holds string columns at this point, and
# recent pandas raises on them instead of silently skipping them.
sns.heatmap(train.select_dtypes(include='number').corr(),annot=True)

#### Correlation
Positive corr ->  Fare and Survived (passengers who paid more were more likely to survive)

Negative corr ->  Gender and Survived  , Age and PClass

In [None]:
# Visualise where values are missing: one row per passenger, one column per feature.
missing_mask = train.isna()
sns.heatmap(missing_mask, cbar=False, yticklabels=False)

### AGE AND PCLASS RELATION

In [None]:
# Distribution of Age within each passenger class, drawn on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=train, x='Pclass', y='Age', ax=ax)

In [None]:
def put_age(col):
    """Return the passenger's age, substituting a class-based default when it is missing.

    Parameters
    ----------
    col : sequence or pandas.Series
        Read by position: the first entry is the age, the second the passenger class.

    Returns
    -------
    The original age when it is not null; otherwise 37 for class 1, 29 for
    class 2, and 24 for any other class value.
    """
    # NOTE(review): the defaults look like typical per-class ages read off the
    # Age-vs-Pclass boxplot — confirm against the data.
    # Rows produced by DataFrame.apply(axis=1) are label-indexed Series; plain
    # `col[0]` on those relies on deprecated positional fallback, so go through
    # `.iloc` when it exists and keep ordinary indexing for lists/tuples.
    cells = col.iloc if hasattr(col, 'iloc') else col
    age, pclass = cells[0], cells[1]

    if not pd.isnull(age):
        return age
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [None]:
# Impute missing ages row by row from the (Age, Pclass) pair.
age_and_class = train[['Age', 'Pclass']]
train['Age'] = age_and_class.apply(put_age, axis=1)

### CABIN

In [None]:
# Remove the Cabin column from the training frame in place.
train.drop(columns=['Cabin'], inplace=True)

In [None]:
# Re-draw the missing-value map to check what is still null after the steps above.
remaining_missing = train.isna()
sns.heatmap(remaining_missing, cbar=False, yticklabels=False)

### EMBARKED

In [2]:
# Horizontal bar chart of how many passengers boarded at each port.
sns.countplot(data=train, y='Embarked')

NameError: name 'sns' is not defined

In [None]:
# Replace missing embarkation ports with 'S'; existing values are kept as they are.
embarked = train['Embarked']
train['Embarked'] = embarked.where(embarked.notna(), 'S')

In [None]:
# Preview the first rows of the training frame after the Embarked fill.
train.head()

### SIBSP,PARCH,TICKET,NAME,PASSENGERID

In [None]:
# Combine sibling/spouse and parent/child counts into a single FamilySize feature
# (the +1 adds the passenger themself).
train["FamilySize"]=train['SibSp']+train["Parch"]+1
test["FamilySize"]=test['SibSp']+test["Parch"]+1

# Drop the columns that are not fed to the models; the same list is removed from both splits.
unused_columns=['Ticket','SibSp','Parch','Name','PassengerId']
train.drop(unused_columns,axis=1,inplace=True)
test.drop(unused_columns,axis=1,inplace=True)

# Standardise Age and Fare. The scaler is fitted on the training split only and then
# reused for the test split, so both are expressed on the same scale; re-fitting on the
# test data would standardise it with different mean/variance than the model was trained on.
sc=StandardScaler()
train[['Age','Fare']]=sc.fit_transform(train[['Age','Fare']])
test[['Age','Fare']]=sc.transform(test[['Age','Fare']])
train.head()

In [None]:
# One-hot encode the categorical columns, dropping the first level of each to avoid a
# redundant indicator.
sex=pd.get_dummies(train['Sex'],drop_first=True)
embark=pd.get_dummies(train['Embarked'],drop_first=True)
# Prefix the class indicators so their labels are strings ('Pclass_2', 'Pclass_3').
# Without it the labels are the bare integers 2 and 3, and a frame mixing integer and
# string column labels is rejected by recent scikit-learn estimators. Any frame passed
# to the fitted model later must carry the same column labels.
pclass=pd.get_dummies(train['Pclass'],prefix='Pclass',drop_first=True)

# Swap the raw categorical columns for their indicator columns.
train.drop(['Sex','Embarked','Pclass'],axis=1,inplace=True)
train=pd.concat([train,sex,embark,pclass],axis=1)
train.head()

### INPUT AND TARGET 

In [None]:
# Target: the Survived column cast to integers. Features: every other column.
y = train['Survived'].astype(int)
X = train.drop(columns='Survived')

### HYPER-PARAMETER TUNING USING GRIDSEARCHCV

In [1]:
# Candidate estimators paired with the hyper-parameter grid to search for each one,
# listed as (display name, estimator instance, grid) in the order they are evaluated.
_candidates = [
    (
        'LogisticRegression',
        LogisticRegression(),
        {'solver': ['newton-cg', 'liblinear', 'saga', 'lbfgs']},
    ),
    (
        'KNeighborsClassifier',
        KNeighborsClassifier(),
        {'n_neighbors': [5, 10, 15, 20, 25]},
    ),
    (
        'SVC',
        SVC(gamma='auto'),
        {'C': [0.1, 1, 10, 100]},
    ),
    (
        'DecisionTreeClassifier',
        DecisionTreeClassifier(),
        {'criterion': ['gini', 'entropy']},
    ),
    (
        'RandomForestClassifier',
        RandomForestClassifier(),
        {'n_estimators': [5, 7, 9, 11, 13, 15, 17]},
    ),
    (
        'CatBoostClassifier',
        cb.CatBoostClassifier(),
        {
            'iterations': [5, 10, 15, 20, 25, 50, 100],
            'learning_rate': [0.01, 0.05, 0.1],
            'depth': [3, 5, 7, 9, 11, 13],
        },
    ),
]

# Mapping consumed by the grid-search loop: name -> {'model': estimator, 'param': grid}.
model_param = {
    name: {'model': estimator, 'param': grid}
    for name, estimator, grid in _candidates
}

NameError: name 'LogisticRegression' is not defined

In [None]:
# Shuffled 10-fold splitter with a fixed seed, shared by every grid search.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# Run one grid search per candidate and record its best score and parameters.
scores = []
for name, spec in model_param.items():
    model_select = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['param'],
        cv=kf,
        return_train_score=False,
    ).fit(X, y)
    scores.append(dict(
        model=name,
        best_score=model_select.best_score_,
        best_params=model_select.best_params_,
    ))

### MODELS AND THEIR ACCURACIES

# Tabulate the grid-search results, one row per candidate model.
score_columns = ['model', 'best_score', 'best_params']
model_score_df = pd.DataFrame(scores, columns=score_columns)
model_score_df

### MODEL WITH BEST ACCURACY - SVC 

In [None]:
# Train the final SVC (C=1, gamma='auto') on the full training set.
model_svc = SVC(C=1, gamma='auto').fit(X, y)
model_svc

In [3]:
# Preview the first rows of the test frame before its categorical columns are encoded.
test.head()

NameError: name 'test' is not defined

### TEST DATASET 

In [None]:
# One-hot encode the test frame's categorical columns, dropping the first level of each.
sex=pd.get_dummies(test['Sex'],drop_first=True)
embark=pd.get_dummies(test['Embarked'],drop_first=True)
# Prefix the class indicators so their labels are strings ('Pclass_2', 'Pclass_3').
# Without it the labels are the bare integers 2 and 3, and a frame mixing integer and
# string column labels is rejected by recent scikit-learn estimators. The labels must
# match the ones used on the frame the model was fitted on.
pclass=pd.get_dummies(test['Pclass'],prefix='Pclass',drop_first=True)

# Remove the raw categorical columns (and Cabin), then append the indicator columns.
test.drop(['Sex','Embarked','Pclass','Cabin'],axis=1,inplace=True)
test=pd.concat([test,sex,embark,pclass],axis=1)
test.head()

In [None]:
# Count the missing values in each column of the test frame.
test.isnull().sum()

In [None]:
# Fill the remaining gaps in Age and Fare with each column's own mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `test['Age']` acts on an intermediate Series, which under pandas Copy-on-Write
# leaves `test` itself unchanged.
test['Age']=test['Age'].fillna(test['Age'].mean())
test['Fare']=test['Fare'].fillna(test['Fare'].mean())
test.head()

### FINAL PREDICTION ON TEST DATA

In [None]:
# Predict a label for every row of the prepared test frame with the fitted SVC.
y_pred=model_svc.predict(test)

In [None]:
# Pair the saved passenger ids with the predicted labels: two columns,
# "PassengerId" then "Survived".
submission = df.to_frame(name="PassengerId").assign(Survived=y_pred)

In [None]:
# Write the submission to CSV; index=False keeps the DataFrame index out of the file.
submission.to_csv('titanic_submission_1.csv',index=False)