# **Importing libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# **Import dataset**

In [None]:
# Load the Titanic training and test CSVs from the /kaggle/input/titanic directory.
# test_df keeps the raw test file; its PassengerId column is used later for the submission.
train=pd.read_csv('/kaggle/input/titanic/train.csv')
test_df=pd.read_csv('/kaggle/input/titanic/test.csv')

# **Data Preprocessing and EDA (Exploratory Data Analysis)**

In [None]:
### Preview the first rows of the train dataset
train.head()

In [None]:
## Print the (rows, columns) shape of the train and test datasets
print(train.shape)
print(test_df.shape)

In [None]:
### Show the info() summary of the train dataset
train.info()

In [None]:
### Show the descriptive statistics that describe() reports for the train dataset
train.describe()

In [None]:
## Count the rows for each value of the 'Sex' column
train['Sex'].value_counts()

In [None]:
### Drop the columns that are not used as model features: PassengerId, Name, Ticket and Cabin.
# One shared list keeps the train and test feature columns identical.
unused_columns=['PassengerId','Name','Ticket','Cabin']
# Rebinding with errors='ignore' (instead of inplace=True) lets this cell be re-run
# without a KeyError once the columns have already been removed from `train`.
train=train.drop(columns=unused_columns,errors='ignore')
# test_df itself is left untouched: its PassengerId column is read again when the
# prediction table is built.
test=test_df.drop(columns=unused_columns)

# **Checking and handling missing values**

In [None]:
# Data cleaning
def data_cleaning(df):
  """Fill the missing values of ``df`` with each column's most frequent value.

  The frame is modified in place and nothing is returned.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose columns may contain missing values.

  Notes
  -----
  A column that holds only missing values has no mode and is left unchanged.
  """
  for feature in df.columns:
    column = df[feature]
    # Columns without missing values need no work
    if not column.isnull().any():
      continue

    modes = column.mode()
    # mode() ignores missing values, so it is empty when the whole column is missing
    if modes.empty:
      continue

    # Assign the filled column back rather than calling fillna(inplace=True) on
    # df[feature]: that chained form can operate on a temporary object and leave
    # df itself unchanged.
    df[feature] = column.fillna(modes.iloc[0])

In [None]:
### Fill the missing values of the train data with data_cleaning
data_cleaning(train)

In [None]:
### Fill the missing values of the test data with data_cleaning
data_cleaning(test)

# Handling categorical columns

In [None]:
# Encode the categorical columns as integers, using the same codes for both frames.
sex_codes={'male':1,'female':0}
embarked_codes={'S':0,'C':1,'Q':2}
train['Sex']=train['Sex'].replace(sex_codes)
train['Embarked']=train['Embarked'].replace(embarked_codes)
# Each test column is encoded from the TEST frame itself. Deriving it from `train`
# would index-align the training rows' values onto the test rows.
test['Sex']=test['Sex'].replace(sex_codes)
test['Embarked']=test['Embarked'].replace(embarked_codes)

# Univariate analysis

In [None]:
from matplotlib import gridspec
# Distribution of every feature, drawn separately for the two 'Survived' classes
features =train.drop(columns=['Survived']).columns
# Size the grid from the actual number of features (4 inches of height per panel)
# instead of a fixed 28 rows, which left most of a 112-inch-tall figure empty.
n_features = len(features)
plt.figure(figsize=(12,4*n_features))
gs = gridspec.GridSpec(n_features, 1)
for i, c in enumerate(features):
    ax = plt.subplot(gs[i])
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
    # sns.histplot / sns.kdeplot once the target seaborn version is confirmed.
    sns.distplot(train[c][train['Survived'] ==1], bins=5,label='Survived', ax=ax)
    sns.distplot(train[c][train['Survived'] ==0], bins=5,label='NOT Survived', ax=ax)
    # The feature name is shown in the title, so the x label is blanked
    ax.set_xlabel('')
    ax.legend()
    ax.set_title('histogram of feature: ' + str(c))
plt.show()

In [None]:
import plotly.express as px
# One pie chart per feature: slices are named by the 'Survived' value and sized by
# that feature's values.
for column in features:
  pie = px.pie(
      train,
      values=train[column],
      names=train['Survived'],
      title=column,
      height=350,
  )
  # Put the percentage and the label inside each slice
  pie.update_traces(textinfo='percent+label', textposition='inside')
  pie.show()

In [None]:
# A box plot is a five-number summary plot: it shows the minimum, first quartile,
# median, third quartile and maximum of a continuous variable, and it also makes
# outliers visible.
# The grid is sized from the number of features (4 inches of height per panel)
# rather than a fixed 28 rows, which left most of the figure blank.
n_features = len(features)
plt.figure(figsize=(12,4*n_features))
gs = gridspec.GridSpec(n_features, 1)
for i, c in enumerate(features):
  ax = plt.subplot(gs[i])
  # One box per 'Survived' class for the current feature
  sns.boxplot(y = train[c],x =train['Survived'], ax=ax)
  ax.set_xlabel('')
plt.show()

# **Creating independent and dependent variables**

In [None]:
# Separate the target column from the predictor columns
target_column='Survived'
y=train[target_column]
X=train.drop(columns=[target_column])

In [None]:
## Oversample with SMOTE to remove the class imbalance in the target.
from imblearn.over_sampling import SMOTE
# A fixed seed makes the synthetic samples, and every score computed from them,
# reproducible from run to run.
oversample = SMOTE(random_state=0)
# NOTE(review): X and y are resampled here, before the train/validation split, so the
# validation set will also contain synthetic rows. Consider splitting first and
# resampling only the training part.
X, y = oversample.fit_resample(X, y)

# **Splitting dataset for validation and training**

In [None]:
from sklearn.model_selection import train_test_split

### Hold out 20% of the rows for validation; the remaining 80% are used for training.
### random_state fixes the shuffle so the split is the same on every run.
X_train,X_val,y_train,y_val=train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Data standardization

In [None]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
# Learn the scaling statistics from the training data only ...
X_train_scaled=scaler.fit_transform(X_train)
# ... and reuse them for the validation data. Calling fit_transform here would refit
# the scaler on the validation set and put the two sets on different scales.
X_val_scaled=scaler.transform(X_val)

# **Random Forest model**

# **Hyperparameter tuning**

In [None]:
# Tune the random forest's hyper-parameters with an exhaustive grid search.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values tried for each hyper-parameter
params=dict(
    n_estimators=[50,100,150],
    max_depth=[20,25,30],
    criterion=["gini", "entropy"],
    min_samples_split=list(range(2,8)),
)

# Base estimator with a fixed seed
model=RandomForestClassifier(random_state=0)

# 2-fold cross-validated search over every parameter combination (n_jobs=-1)
RF_models=GridSearchCV(estimator=model, param_grid=params, cv=2, n_jobs=-1)
RF_models.fit(X_train,y_train)

# Parameter combination selected by the search
RF_models.best_params_

# **Model training**

In [None]:
# Refit the estimator selected by the grid search, weighting every training row
# according to its class.
classifier1 = RF_models.best_estimator_

class_weights = {0:0.3, 1:0.7}
row_weights = [class_weights[label] for label in y_train]
RF_model=classifier1.fit(X_train,y_train,sample_weight=row_weights)
### Score of the model on the validation data
RF_model.score(X_val,y_val)

In [None]:
### Score of the random forest on the training data (compare with the validation score above)
RF_model.score(X_train,y_train)

# **Logistic Regression**

# **Hyperparameter tuning**

In [None]:
# Tune the logistic regression's hyper-parameters with a grid search.

# Only solver/penalty pairs that LogisticRegression accepts are listed:
# newton-cg and lbfgs support the l2 penalty, liblinear supports l1 and l2.
# The elasticnet penalty is dropped because none of these three solvers supports it.
# A single crossed grid would include the unsupported pairs, whose fits fail and
# only produce NaN scores.
C_values=[1.0, 0.1, 0.01]
params=[
    {'solver':['newton-cg', 'lbfgs'], 'penalty':['l2'], 'C':C_values},
    {'solver':['liblinear'], 'penalty':['l1', 'l2'], 'C':C_values},
]
# Logistic regression model
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(random_state=0,)

# Apply GridSearchCV to the logistic regression model (2-fold CV) on the scaled training data
LR_model=GridSearchCV(model, param_grid=params, n_jobs=-1, cv=2)
LR_model.fit(X_train_scaled,y_train)

# Best logistic regression parameters found by the search
LR_model.best_params_

# **Model Fitting**

In [None]:
# Fit the logistic regression selected by the grid search on the scaled training data.
# This cell rebinds LR_model from the search object to the fitted estimator, which has
# no best_estimator_ attribute. The getattr fallback keeps the cell re-runnable instead
# of raising AttributeError on a second execution.
classifier2=getattr(LR_model,'best_estimator_',LR_model)
LR_model=classifier2.fit(X_train_scaled,y_train)
### Score of the model on the scaled validation data
LR_model.score(X_val_scaled,y_val)

In [None]:
### Score of the logistic regression on the scaled training data (compare with the validation score above)
LR_model.score(X_train_scaled,y_train)

# **Getting prediction on test data**

In [None]:
# Predict with the random forest on the unscaled test features: RF_model was fitted
# on the unscaled X_train, so the scaler is not applied here.
pred=RF_model.predict(test)
# Two-column table: PassengerId from the raw test file first, then the predicted label
Prediction=pd.DataFrame({'PassengerId':test_df['PassengerId'],'Survived':pred})

In [None]:
# Display the prediction table
Prediction

In [None]:
# Write the prediction table to disk. index=False keeps the row index out of the file,
# so the CSV contains only the PassengerId and Survived columns.
Prediction.to_csv('Model_prediction.csv',index=False)