In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor

In [42]:
df=pd.read_csv("cleaned_titanic.csv")

In [43]:
df.drop("Unnamed: 0",axis=1,inplace=True)

In [44]:
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.000000,1,0,7.2500,True,False,True
1,1,1,38.000000,1,0,71.2833,False,False,False
2,1,3,26.000000,0,0,7.9250,False,False,True
3,1,1,35.000000,1,0,53.1000,False,False,True
4,0,3,35.000000,0,0,8.0500,True,False,True
...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,True,False,True
887,1,1,19.000000,0,0,30.0000,False,False,True
888,0,3,29.699118,1,2,23.4500,False,False,True
889,1,1,26.000000,0,0,30.0000,True,False,False


In [45]:
X=df.drop("Survived",axis=1)
y=df["Survived"]

In [46]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [47]:
rf=RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=42
)

In [48]:
rf.fit(X_train,y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [49]:
y_pred=rf.predict(X_test)

In [50]:
print("Accuracy: ",accuracy_score(y_test,y_pred))
print("Confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("Classification Report:\n",classification_report(y_test,y_pred))

Accuracy:  0.8100558659217877
Confusion matrix:
 [[95 10]
 [24 50]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.90      0.85       105
           1       0.83      0.68      0.75        74

    accuracy                           0.81       179
   macro avg       0.82      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



In [51]:
feature_imp=pd.DataFrame({'Features':X.columns,'Importance':rf.feature_importances_})

In [52]:
feature_imp=feature_imp.sort_values(by='Importance',ascending=False)

In [53]:
print(feature_imp)

     Features  Importance
5    Sex_male    0.516918
0      Pclass    0.170615
4        Fare    0.140066
1         Age    0.079073
2       SibSp    0.044526
7  Embarked_S    0.023675
3       Parch    0.020060
6  Embarked_Q    0.005067


<h1>Regressor</h1>

In [54]:
X_=df.drop("Fare",axis=1)
y_=df["Fare"]

In [55]:
X_train_,X_test_,y_train_,y_test_=train_test_split(X_,y_,test_size=0.2,random_state=42)

In [56]:
rfr=RandomForestRegressor(
    n_estimators=100,
    max_depth=3,
    random_state=42
)

In [57]:
rfr.fit(X_train_,y_train_)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [58]:
y_pred_=rfr.predict(X_test_)

In [59]:
mse=np.sqrt(mean_squared_error(y_test_,y_pred_))
print("MSE: ",mse)

MSE:  29.554822211498017


In [60]:
print(f"Min percentage error: {((np.sqrt(mse)/df["Fare"].max())*100).round(2)}")

Min percentage error: 1.06


In [61]:
print("R2 score: ",r2_score(y_test_,y_pred_))

R2 score:  0.435523590226798
