# 1 Import Libraries

In [1]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl.metadata (12 kB)
Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [3]:
train_df=pd.read_csv('/kaggle/input/titanic/train.csv')
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.10,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.05,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.00,0,0,211536,13.00,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.00,0,0,112053,30.00,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.00,0,0,111369,30.00,C148,C


# 2 EDA Process

In [4]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Age, cabin and Embarked are missing values

In [5]:
train_df[['Age','Cabin','Embarked']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Cabin     204 non-null    object 
 2   Embarked  889 non-null    object 
dtypes: float64(1), object(2)
memory usage: 21.0+ KB


In [6]:
len(train_df['Cabin'].unique()),train_df.shape[0]

(148, 891)

In [7]:
train_df['Age'].describe()

count   714.00
mean     29.70
std      14.53
min       0.42
25%      20.12
50%      28.00
75%      38.00
max      80.00
Name: Age, dtype: float64

### 1. Age (numerical variable) feature has outliers so replace the null values with median
### 2. Cabin and Embarked are categorical vaiable so replace wth mode 

## 2.1 Replace NULL values

In [8]:
train_df.fillna({'Age':train_df['Age'].median()},inplace=True)
train_df.fillna({'Cabin':train_df['Cabin'].mode().iloc[0]},inplace=True)
train_df.fillna({'Embarked':train_df['Embarked'].mode().iloc[0]},inplace=True)
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.92,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.10,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.05,B96 B98,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.00,0,0,211536,13.00,B96 B98,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.00,0,0,112053,30.00,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.00,1,2,W./C. 6607,23.45,B96 B98,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.00,0,0,111369,30.00,C148,C


In [9]:
train_df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
train_df['Age'].dtypes

dtype('float64')

### Perform Label Encodeing 

In [12]:
encoder=LabelEncoder()
def label_encoder(col):
    if col.dtypes=='object':
        col=encoder.fit_transform(col)
    return col

In [13]:
train_df=train_df.apply(lambda col:label_encoder(col))
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,1,22.00,1,0,523,7.25,47,2
1,2,1,1,190,0,38.00,1,0,596,71.28,81,0
2,3,1,3,353,0,26.00,0,0,669,7.92,47,2
3,4,1,1,272,0,35.00,1,0,49,53.10,55,2
4,5,0,3,15,1,35.00,0,0,472,8.05,47,2
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,548,1,27.00,0,0,101,13.00,47,2
887,888,1,1,303,0,19.00,0,0,14,30.00,30,2
888,889,0,3,413,0,28.00,1,2,675,23.45,47,2
889,890,1,1,81,1,26.00,0,0,8,30.00,60,0


## 2.2 Feature Selection

In [14]:
X=train_df.drop(['Survived'],axis=1)
y=train_df['Survived']

In [15]:
f_p_value=chi2(X,y)

In [16]:
p_value=pd.Series(f_p_value[1])
p_value.index=X.columns
p_value.sort_index(ascending=True)

Age           0.00
Cabin         0.00
Embarked      0.00
Fare          0.00
Name          0.00
Parch         0.00
PassengerId   0.07
Pclass        0.00
Sex           0.00
SibSp         0.11
Ticket        0.00
dtype: float64

### Ticket,sibsp corelate less to the output variable
### So delete the Ticket,sibsp from dataframe

In [17]:
train_df.drop(['Ticket','SibSp'],axis=1,inplace=True)
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Parch,Fare,Cabin,Embarked
0,1,0,3,108,1,22.00,0,7.25,47,2
1,2,1,1,190,0,38.00,0,71.28,81,0
2,3,1,3,353,0,26.00,0,7.92,47,2
3,4,1,1,272,0,35.00,0,53.10,55,2
4,5,0,3,15,1,35.00,0,8.05,47,2
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,548,1,27.00,0,13.00,47,2
887,888,1,1,303,0,19.00,0,30.00,30,2
888,889,0,3,413,0,28.00,2,23.45,47,2
889,890,1,1,81,1,26.00,0,30.00,60,0


# 3 Model Selection

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=123)

In [19]:
Classifier=LazyClassifier()
model,predict=Classifier.fit(X_train,X_test,y_train,y_test)
model

100%|██████████| 29/29 [00:02<00:00, 13.85it/s]

[LightGBM] [Info] Number of positive: 244, number of negative: 379
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 865
[LightGBM] [Info] Number of data points in the train set: 623, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.391653 -> initscore=-0.440368
[LightGBM] [Info] Start training from score -0.440368





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.85,0.84,0.84,0.85,0.14
AdaBoostClassifier,0.85,0.83,0.83,0.85,0.16
LGBMClassifier,0.84,0.82,0.82,0.84,0.11
KNeighborsClassifier,0.83,0.82,0.82,0.83,0.04
RandomForestClassifier,0.83,0.81,0.81,0.83,0.32
SVC,0.82,0.8,0.8,0.82,0.04
NuSVC,0.82,0.8,0.8,0.82,0.04
ExtraTreesClassifier,0.82,0.8,0.8,0.82,0.24
BaggingClassifier,0.82,0.79,0.79,0.82,0.07
NearestCentroid,0.79,0.79,0.79,0.79,0.02


### XGBClassifier produce best accuracy 0.85 for this data

# 4 Model Training

## 4.1 XGBClassifier

In [20]:
model=XGBClassifier()
model.fit(X_train,y_train)

In [21]:
print('Training accuracy of the model:',model.score(X_train,y_train))

Training accuracy of the model: 1.0


In [22]:
print('Testing accuracy of the model:',accuracy_score(y_test,model.predict(X_test)))

Testing accuracy of the model: 0.8470149253731343


### 4.1.1 HyperParameter Tuning using OPTUNA

In [23]:
## define a objective function to optimize
def objective(trail):
    params={
        'max_depth':trail.suggest_int('max_depth',3,10),
        'learning_rate':trail.suggest_loguniform('learning_rate',0.001,0.1),
        'n_estimators':trail.suggest_int('n_estimators',100,1000),
        'subsample':trail.suggest_float('subsample',0.5,1),
        'colsample_bytree':trail.suggest_float('colsample_bytree',0.5,1),
    }
    
    model=XGBClassifier(**params)
    
    model.fit(X_train,y_train)
    
    y_pred=model.predict(X_test)
    
    accuracy=accuracy_score(y_test,y_pred)
    
    return accuracy

In [24]:
study=optuna.create_study(direction='maximize')
study.optimize(objective,n_trials=50)
study.best_trial

[I 2024-05-07 08:52:18,927] A new study created in memory with name: no-name-fc46b2af-3492-4c56-b674-e616ffc69384
[I 2024-05-07 08:52:19,446] Trial 0 finished with value: 0.8507462686567164 and parameters: {'max_depth': 6, 'learning_rate': 0.006777431112069304, 'n_estimators': 623, 'subsample': 0.5188977845507314, 'colsample_bytree': 0.7549809060628453}. Best is trial 0 with value: 0.8507462686567164.
[I 2024-05-07 08:52:19,935] Trial 1 finished with value: 0.8470149253731343 and parameters: {'max_depth': 5, 'learning_rate': 0.052556693407444226, 'n_estimators': 775, 'subsample': 0.7710319083188717, 'colsample_bytree': 0.8185679613632615}. Best is trial 0 with value: 0.8507462686567164.
[I 2024-05-07 08:52:20,206] Trial 2 finished with value: 0.6492537313432836 and parameters: {'max_depth': 8, 'learning_rate': 0.0010632353976750698, 'n_estimators': 280, 'subsample': 0.5900761401529964, 'colsample_bytree': 0.5258524142449927}. Best is trial 0 with value: 0.8507462686567164.
[I 2024-05-0

FrozenTrial(number=44, state=TrialState.COMPLETE, values=[0.8843283582089553], datetime_start=datetime.datetime(2024, 5, 7, 8, 52, 33, 916690), datetime_complete=datetime.datetime(2024, 5, 7, 8, 52, 34, 100150), params={'max_depth': 9, 'learning_rate': 0.019354225003932343, 'n_estimators': 101, 'subsample': 0.9352226850816184, 'colsample_bytree': 0.6905130742434157}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=0.001, step=None), 'n_estimators': IntDistribution(high=1000, log=False, low=100, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None)}, trial_id=44, value=None)

In [25]:
best_params=study.best_trial.params

In [26]:
tuned_model=XGBClassifier(**best_params)
tuned_model.fit(X_train,y_train)

In [27]:
print('Training accuracy of the model:',tuned_model.score(X_train,y_train))

Training accuracy of the model: 0.9406099518459069


In [28]:
print('Testing accuracy of the model:',accuracy_score(y_test,tuned_model.predict(X_test)))

Testing accuracy of the model: 0.8843283582089553


# 5 Prediction

In [29]:
test_df=pd.read_csv('/kaggle/input/titanic/test.csv')
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.50,0,0,330911,7.83,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.00,1,0,363272,7.00,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.00,0,0,240276,9.69,,Q
3,895,3,"Wirz, Mr. Albert",male,27.00,0,0,315154,8.66,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.00,1,1,3101298,12.29,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.00,0,0,PC 17758,108.90,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.50,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S


In [30]:
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [31]:
test_df['Fare'].describe()

count   417.00
mean     35.63
std      55.91
min       0.00
25%       7.90
50%      14.45
75%      31.50
max     512.33
Name: Fare, dtype: float64

In [32]:
test_df.fillna({'Age':test_df['Age'].median()},inplace=True)
test_df.fillna({'Cabin':test_df['Cabin'].mode().iloc[0]},inplace=True)
test_df.fillna({'Fare':test_df['Fare'].median()},inplace=True)
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.50,0,0,330911,7.83,B57 B59 B63 B66,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.00,1,0,363272,7.00,B57 B59 B63 B66,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.00,0,0,240276,9.69,B57 B59 B63 B66,Q
3,895,3,"Wirz, Mr. Albert",male,27.00,0,0,315154,8.66,B57 B59 B63 B66,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.00,1,1,3101298,12.29,B57 B59 B63 B66,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,27.00,0,0,A.5. 3236,8.05,B57 B59 B63 B66,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.00,0,0,PC 17758,108.90,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.50,0,0,SOTON/O.Q. 3101262,7.25,B57 B59 B63 B66,S
416,1308,3,"Ware, Mr. Frederick",male,27.00,0,0,359309,8.05,B57 B59 B63 B66,S


In [33]:
test_df.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [34]:
result=pd.DataFrame(test_df['PassengerId'])

In [35]:
test_df=test_df.apply(lambda data:label_encoder(data))
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,206,1,34.50,0,0,152,7.83,15,1
1,893,3,403,0,47.00,1,0,221,7.00,15,2
2,894,2,269,1,62.00,0,0,73,9.69,15,1
3,895,3,408,1,27.00,0,0,147,8.66,15,2
4,896,3,178,0,22.00,1,1,138,12.29,15,2
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,353,1,27.00,0,0,267,8.05,15,2
414,1306,1,283,0,39.00,0,0,324,108.90,22,0
415,1307,3,332,1,38.50,0,0,346,7.25,15,2
416,1308,3,384,1,27.00,0,0,220,8.05,15,2


In [36]:
y_pred=tuned_model.predict(test_df)
result['Survived']=y_pred
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [37]:
result.to_csv('Submission.csv',index=False)