In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from imblearn.over_sampling import SMOTE

In [5]:
titanic = pd.read_csv(r"artifacts/raw/titanic_train.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
1,734,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
2,383,0,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
3,705,0,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
4,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [6]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          572 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        159 non-null    object 
 11  Embarked     710 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 66.9+ KB


In [9]:
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])
titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].median())

In [10]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          712 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        159 non-null    object 
 11  Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 66.9+ KB


In [11]:
titanic["Sex"] = titanic['Sex'].map({'male':0,"female":1})

In [12]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    int64  
 5   Age          712 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        159 non-null    object 
 11  Embarked     712 non-null    object 
dtypes: float64(2), int64(6), object(4)
memory usage: 66.9+ KB


In [13]:
titanic['Embarked'] = titanic['Embarked'].astype('category').cat.codes

In [14]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    int64  
 5   Age          712 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        159 non-null    object 
 11  Embarked     712 non-null    int8   
dtypes: float64(2), int64(6), int8(1), object(3)
memory usage: 62.0+ KB


In [15]:
titanic["Name"].value_counts()

Name
White, Mr. Richard Frasar      1
Partner, Mr. Austen            1
Berriman, Mr. William John     1
Tikkanen, Mr. Juho             1
Hansen, Mr. Henrik Juul        1
                              ..
Hunt, Mr. George Henry         1
Van Impe, Mr. Jean Baptiste    1
Bystrom, Mrs. (Karolina)       1
Woolner, Mr. Hugh              1
Todoroff, Mr. Lalio            1
Name: count, Length: 712, dtype: int64

In [17]:
titanic['FamilySize'] = titanic['SibSp'] + titanic["Parch"] + 1

titanic["IsAlone"] = (titanic["FamilySize"] == 1).astype(int)

titanic["HasCabin"] = titanic["Cabin"].notnull().astype(int)


titanic["Title"] = titanic['Name'].str.extract(' ([A-Za-z]+)\.',expand=False).map({
    "Mr":0,"Miss":1,"Mrs":2,"Master":3,"Rare":4}).fillna(4)

titanic["Pclass_Fare"] = titanic['Pclass'] * titanic["Fare"]

titanic["Age_Fare"] = titanic["Age"] * titanic["Fare"]

In [19]:
titanic['Title'].value_counts()

Title
0.0    419
1.0    143
2.0     96
3.0     33
4.0     21
Name: count, dtype: int64

In [20]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    int64  
 5   Age          712 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        159 non-null    object 
 11  Embarked     712 non-null    int8   
 12  FamilySize   712 non-null    int64  
 13  IsAlone      712 non-null    int64  
 14  HasCabin     712 non-null    int64  
 15  Title        712 non-null    float64
 16  Pclass_Fare  712 non-null    float64
 17  Age_Fare     712 non-null    float64
dtypes: float64(5), int64(9), int8(1), object(3)
memory

In [21]:
X = titanic[["Pclass","Sex","Age","Fare","Embarked","FamilySize","IsAlone","HasCabin","Title","Pclass_Fare","Age_Fare"]]
y=titanic["Survived"]

In [22]:
titanic["Survived"].value_counts()

Survived
0    444
1    268
Name: count, dtype: int64

In [23]:
smote= SMOTE(random_state=42)
X_resampled , y_resampled = smote.fit_resample(X,y)

In [24]:
y_resampled.value_counts()

Survived
0    444
1    444
Name: count, dtype: int64

In [25]:
X_train,X_test,y_train,y_test = train_test_split(X_resampled,y_resampled,test_size=0.2,random_state=42)

In [34]:
param_distributions = {
    "n_estimators": [100,200,300],
    "max_depth" : [10,20,30],
    "min_samples_split" : [2,5],
    "min_samples_leaf":[1,2]
}

In [35]:
rf = RandomForestClassifier(random_state=42)

In [36]:
random_search = RandomizedSearchCV(
    rf,param_distributions,n_iter=10,cv=3,scoring='accuracy',random_state=42
)

In [37]:
random_search.fit(X_train,y_train)

In [38]:
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test,y_pred)

In [39]:
print(f"Random Foreset Accuracy : {rf_accuracy:.2f}")

Random Foreset Accuracy : 0.86


In [40]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85        84
           1       0.88      0.85      0.86        94

    accuracy                           0.86       178
   macro avg       0.86      0.86      0.86       178
weighted avg       0.86      0.86      0.86       178

