## IMPORT THE NECESSARY LIBRARIES 

In [128]:
import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from dask_ml.preprocessing import LabelEncoder
import dask.array as da
from sklearn.metrics import f1_score

## LOAD THE DATASET DOWNLOADED FROM KAGGLE

In [106]:
df = dd.read_csv("tested.csv",dtype={'Cabin': 'object'})

In [107]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## DATA PREPROCESSING (HANDLING MISSING VALUES OR SHAPE DATA FOR MODEL)

AS PER ANALYSIS CABIN COLUMN HAS ALOT OF NAN VALUES SO WE CAN REMOVE IT

In [108]:
df = df.drop('Cabin', axis=1)

LETS CHECK THE NAN VALUES

In [109]:
df.isna().sum().compute()

PassengerId     0
Survived        0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Embarked        0
dtype: int64

LETS FILL THE NAN VALUES 

In [110]:
df['Age'] = df['Age'].fillna(df['Age'].median().compute())

In [111]:
df['Fare'] = df['Fare'].fillna(df['Fare'].median().compute())

WE HAVE NO MISSING VALUES NOW

In [112]:
df.isna().sum().compute()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

LETS CONVERT THE STRING VALUES TO NUMERIC SO THAT OUR MODEL CAN EASILY PREDICT

In [113]:
LE = LabelEncoder()

In [114]:
df['Sex'] = LE.fit_transform(df['Sex'])
df['Embarked'] = LE.fit_transform(df['Embarked'])
df = df.drop('Name', axis=1)
df['Ticket'] = dd.to_numeric(df['Ticket'], errors='coerce')
df['Ticket'] = df['Ticket'].fillna(df['Ticket'].mode().compute())
df = df.dropna(subset=['Ticket'])

NOW WE HAVE THE DATA INTO NUMERIC FORM

## SPLITTING THE DATA INTO TRAIN AND TEST

DEFINING THE FEATURES AND WHAT OUR TARGET IS

In [115]:
X = df.drop("Survived", axis=1)
Y = df['Survived']

AS WE ARE USING DASK DATAFRAME WE NEED TO COMPUTE TO GET VALUES 

In [116]:
X = X.compute()
Y = Y.compute()

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## BUILD THE MODEL

In [119]:
Forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

## TRAIN THE MODEL

In [120]:
Forest.fit(X_train, y_train)

## MAKE PREDICTIONS WITH OUR MODEL

In [121]:
y_pred = Forest.predict(X_test)

In [122]:
y_pred

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0], dtype=int64)

## LETS SEE THE ACCURACY OF OUR MODEL

In [123]:
accuracy = accuracy_score(y_test, y_pred)

In [127]:
print(f'{accuracy*100:.2f}')

100.00


## LETS GET THE F1 SCORE AS WELL FOR MORE VALID ACCURACY

In [129]:
f1 = f1_score(y_test.values, y_pred)

In [131]:
f1

1.0