In [1]:
import numpy as np
import polars as pl

from sklearn.model_selection import train_test_split


In [2]:
data=pl.read_csv('dataset_vs/tested.csv')
data.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
892,0,3,"""Kelly, Mr. Jam…","""male""",34.5,0,0,"""330911""",7.8292,,"""Q"""
893,1,3,"""Wilkes, Mrs. J…","""female""",47.0,1,0,"""363272""",7.0,,"""S"""
894,0,2,"""Myles, Mr. Tho…","""male""",62.0,0,0,"""240276""",9.6875,,"""Q"""
895,0,3,"""Wirz, Mr. Albe…","""male""",27.0,0,0,"""315154""",8.6625,,"""S"""
896,1,3,"""Hirvonen, Mrs.…","""female""",22.0,1,1,"""3101298""",12.2875,,"""S"""


In [3]:
data.glimpse

<bound method DataFrame.glimpse of shape: (418, 12)
┌─────────────┬──────────┬────────┬──────────────────┬───┬────────────┬─────────┬───────┬──────────┐
│ PassengerId ┆ Survived ┆ Pclass ┆ Name             ┆ … ┆ Ticket     ┆ Fare    ┆ Cabin ┆ Embarked │
│ ---         ┆ ---      ┆ ---    ┆ ---              ┆   ┆ ---        ┆ ---     ┆ ---   ┆ ---      │
│ i64         ┆ i64      ┆ i64    ┆ str              ┆   ┆ str        ┆ f64     ┆ str   ┆ str      │
╞═════════════╪══════════╪════════╪══════════════════╪═══╪════════════╪═════════╪═══════╪══════════╡
│ 892         ┆ 0        ┆ 3      ┆ Kelly, Mr. James ┆ … ┆ 330911     ┆ 7.8292  ┆ null  ┆ Q        │
│ 893         ┆ 1        ┆ 3      ┆ Wilkes, Mrs.     ┆ … ┆ 363272     ┆ 7.0     ┆ null  ┆ S        │
│             ┆          ┆        ┆ James (Ellen     ┆   ┆            ┆         ┆       ┆          │
│             ┆          ┆        ┆ Needs)           ┆   ┆            ┆         ┆       ┆          │
│ 894         ┆ 0        ┆ 2      ┆ Myl

In [4]:
data.null_count()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,86,0,0,0,1,327,0


In [5]:
# Defining Target and deleting least relevent columns
X=data.clone()
X=X.sample(len(X))
y=X['Survived']
X=X.drop(columns=["Name","Survived","Cabin","Ticket"])
X.head()

PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
i64,i64,str,f64,i64,i64,f64,str
892,3,"""male""",34.5,0,0,7.8292,"""Q"""
893,3,"""female""",47.0,1,0,7.0,"""S"""
894,2,"""male""",62.0,0,0,9.6875,"""Q"""
895,3,"""male""",27.0,0,0,8.6625,"""S"""
896,3,"""female""",22.0,1,1,12.2875,"""S"""


In [6]:
# Handling categorical values
cat_columns=['Sex','Embarked']

from sklearn.preprocessing import OneHotEncoder
oh_encoder=OneHotEncoder(handle_unknown='ignore',sparse=False)
oh_col=pl.DataFrame(oh_encoder.fit_transform(X[cat_columns]))
X=X.drop(columns=cat_columns)
X=pl.concat([X,oh_col],how="horizontal")




In [7]:
# handling missing values
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy="median")
X_imputed=pl.DataFrame(imputer.fit_transform(X))
X_imputed.columns=X.columns
X=X_imputed

In [8]:
X_train,X_valid,y_train,y_valid=train_test_split(X,y,train_size=0.8,test_size=0.2,random_state=2)

In [9]:
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

model_sgdc=SGDClassifier(random_state=2)
model_dtc=DecisionTreeClassifier(random_state=2)
model_rfc=RandomForestClassifier(random_state=2)

models=[model_sgdc,model_dtc,model_rfc]
for i in models:
    i.fit(X_train,y_train)
    pred=i.predict(X_valid)
    score=mean_absolute_error(y_valid,pred)
    print(f"{i} MAE = ",score)

SGDClassifier(random_state=2) MAE =  0.34523809523809523
DecisionTreeClassifier(random_state=2) MAE =  0.0
RandomForestClassifier(random_state=2) MAE =  0.0


In [10]:
model=DecisionTreeClassifier(random_state=2)
model.fit(X,y)

In [11]:
model.score(X,y)

1.0

In [12]:
y=pl.DataFrame(y)
new_X=pl.concat([X,y],how="horizontal")
titanic_corr=new_X.corr()
titanic_corr

PassengerId,Pclass,Age,SibSp,Parch,Fare,column_0,column_1,column_2,column_3,column_4,Survived
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,-0.026751,-0.031447,0.003818,0.04308,0.008633,-0.023245,0.023245,-0.027419,-0.035731,0.048008,-0.023245
-0.026751,1.0,-0.467853,0.001087,0.018721,-0.577313,-0.108615,0.108615,-0.318543,0.252608,0.120783,-0.108615
-0.031447,-0.467853,1.0,-0.071197,-0.043731,0.342357,0.008035,-0.008035,0.162147,-0.04131,-0.118594,0.008035
0.003818,0.001087,-0.071197,1.0,0.306895,0.171912,0.099943,-0.099943,-0.016365,-0.098824,0.079367,0.099943
0.04308,0.018721,-0.043731,0.306895,1.0,0.230325,0.15912,-0.15912,-0.005791,-0.132948,0.092201,0.15912
0.008633,-0.577313,0.342357,0.171912,0.230325,1.0,0.192036,-0.192036,0.312503,-0.155193,-0.179106,0.192036
-0.023245,-0.108615,0.008035,0.099943,0.15912,0.192036,1.0,-1.0,0.033684,0.115574,-0.105883,1.0
0.023245,0.108615,-0.008035,-0.099943,-0.15912,-0.192036,-1.0,1.0,-0.033684,-0.115574,0.105883,-1.0
-0.027419,-0.318543,0.162147,-0.016365,-0.005791,0.312503,0.033684,-0.033684,1.0,-0.199786,-0.767375,0.033684
-0.035731,0.252608,-0.04131,-0.098824,-0.132948,-0.155193,0.115574,-0.115574,-0.199786,1.0,-0.474962,0.115574


## hence we have trained our model successfully.
#### > The model Shows very high accuracy because the data is biased.
#### > According to given data all males are dead and all females survived, as we can see this in above correlation.
#### > Also the the Amount of data given for training is not much sufficient.