Embarked: port where passenger embarked ( C = Cherbourg, Q = Queenstown, S = Southampton )
\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.544377Z","iopub.status.busy":"2024-04-01T06:45:27.543901Z","iopub.status.idle":"2024-04-01T06:45:27.557229Z","shell.execute_reply":"2024-04-01T06:45:27.555972Z","shell.execute_reply.started":"2024-04-01T06:45:27.544320Z"},"trusted":true},"outputs":[],"source":["train_df.info()"]},{"cell_type":"markdown","metadata":{},"source":["### Slice Rows and Columns of DF (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:53:12.214069Z","iopub.status.busy":"2024-04-01T06:53:12.213708Z","iopub.status.idle":"2024-04-01T06:53:12.223150Z","shell.execute_reply":"2024-04-01T06:53:12.222195Z","shell.execute_reply.started":"2024-04-01T06:53:12.214014Z"},"trusted":true},"outputs":[],"source":["# Printing the row at index 2 (iloc is zero-based, so this is the third row)\n","train_df.iloc[2]"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Print the 5th Row"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:14.398373Z","iopub.status.busy":"2024-04-01T06:54:14.398006Z","iopub.status.idle":"2024-04-01T06:54:14.407886Z","shell.execute_reply":"2024-04-01T06:54:14.406590Z","shell.execute_reply.started":"2024-04-01T06:54:14.398326Z"},"trusted":true},"outputs":[],"source":["# Print the Sex Column\n","train_df['Sex']"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:24.550687Z","iopub.status.busy":"2024-04-01T06:54:24.550286Z","iopub.status.idle":"2024-04-01T06:54:24.555255Z","shell.execute_reply":"2024-04-01T06:54:24.553923Z","shell.execute_reply.started":"2024-04-01T06:54:24.550616Z"},"trusted":true},"outputs":[],"source":["# Print the Name Column"]},{"cell_type":"markdown","metadata":{},"source":["## Visualization (Assignment)"]},{"cell_type":"markdown","metadata":{},"source":["### Age -- 
Survived"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:13:34.450088Z","iopub.status.busy":"2024-04-01T07:13:34.449302Z","iopub.status.idle":"2024-04-01T07:13:34.932717Z","shell.execute_reply":"2024-04-01T07:13:34.930449Z","shell.execute_reply.started":"2024-04-01T07:13:34.450021Z"},"trusted":true},"outputs":[],"source":["plt.figure(figsize=(8, 6))\n","# Plot 1: Survivors vs Non Survivors\n","\n","# Creating a plot for the Survived Column\n","sns.countplot(x='Survived', data=train_df)\n","\n","plt.title('Survivors vs Non Survivors')\n","plt.xlabel('Survived')\n","plt.ylabel('Count')\n","plt.xticks([0, 1], ['No', 'Yes']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Plotting Passenger Class"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:14:31.768779Z","iopub.status.busy":"2024-04-01T07:14:31.768341Z","iopub.status.idle":"2024-04-01T07:14:32.062495Z","shell.execute_reply":"2024-04-01T07:14:32.060660Z","shell.execute_reply.started":"2024-04-01T07:14:31.768690Z"},"trusted":true},"outputs":[],"source":["plt.figure(figsize=(8, 6))\n","\n","# Make the plot for Pclass here:\n","\n","\n","plt.title('Count of Passengers In each Passenger Class')\n","plt.xlabel('Passenger Class')\n","plt.ylabel('Count')\n","plt.xticks([0, 1, 2], ['1st', '2nd', '3rd']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try it for \"Embarked\""]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["### Try Making a histogram for \"Fare\""]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["### Here is the distplot for \"Fare\", refer to it after you tried it 
yourself:"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:18:24.402882Z","iopub.status.busy":"2024-04-01T07:18:24.402274Z","iopub.status.idle":"2024-04-01T07:18:24.798062Z","shell.execute_reply":"2024-04-01T07:18:24.796669Z","shell.execute_reply.started":"2024-04-01T07:18:24.402828Z"},"trusted":true},"outputs":[],"source":["sns.histplot(train_df['Fare'], bins=20, color='orange')\n","plt.title('Distribution of Passenger Fares')\n","plt.xlabel('Fare')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Make a histogram for \"Age\" (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:19:53.874413Z","iopub.status.busy":"2024-04-01T07:19:53.873686Z","iopub.status.idle":"2024-04-01T07:19:54.244996Z","shell.execute_reply":"2024-04-01T07:19:54.243521Z","shell.execute_reply.started":"2024-04-01T07:19:53.874351Z"},"trusted":true},"outputs":[],"source":["# Create the plot below"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Fill Missing: Age Feature"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:50.370496Z","iopub.status.busy":"2024-04-01T06:27:50.369419Z","iopub.status.idle":"2024-04-01T06:27:50.427731Z","shell.execute_reply":"2024-04-01T06:27:50.426655Z","shell.execute_reply.started":"2024-04-01T06:27:50.370387Z"},"trusted":true},"outputs":[],"source":["train_df[train_df[\"Age\"].isnull()]"]},{"cell_type":"markdown","metadata":{},"source":["### Try Checking for Null Values in Test Df"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df.isnull().sum()"]},{"cell_type":"markdown","metadata":{},"source":["Run this to fix the Null 
Values"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:21:48.194895Z","iopub.status.busy":"2024-04-01T07:21:48.194020Z","iopub.status.idle":"2024-04-01T07:21:49.449282Z","shell.execute_reply":"2024-04-01T07:21:49.447918Z","shell.execute_reply.started":"2024-04-01T07:21:48.194825Z"},"trusted":true},"outputs":[],"source":["index_nan_age = list(train_df[\"Age\"][train_df[\"Age\"].isnull()].index)\n","for i in index_nan_age:\n"," age_pred = train_df[\"Age\"][((train_df[\"SibSp\"] == train_df.iloc[i][\"SibSp\"]) &(train_df[\"Parch\"] == train_df.iloc[i][\"Parch\"])& (train_df[\"Pclass\"] == train_df.iloc[i][\"Pclass\"]))].median()\n"," age_med = train_df[\"Age\"].median()\n"," if not np.isnan(age_pred):\n"," train_df[\"Age\"].iloc[i] = age_pred\n"," else:\n"," train_df[\"Age\"].iloc[i] = age_med\n","\n","index_nan_age = list(test_df[\"Age\"][test_df[\"Age\"].isnull()].index)\n","for i in index_nan_age:\n"," age_pred = test_df[\"Age\"][((test_df[\"SibSp\"] == test_df.iloc[i][\"SibSp\"]) &(test_df[\"Parch\"] == test_df.iloc[i][\"Parch\"])& (test_df[\"Pclass\"] == test_df.iloc[i][\"Pclass\"]))].median()\n"," age_med = test_df[\"Age\"].median()\n"," if not np.isnan(age_pred):\n"," test_df[\"Age\"].iloc[i] = age_pred\n"," else:\n"," test_df[\"Age\"].iloc[i] = age_med"]},{"cell_type":"markdown","metadata":{},"source":["## Analysing the correlation between the different columns"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:24:33.644174Z","iopub.status.busy":"2024-04-01T07:24:33.643621Z","iopub.status.idle":"2024-04-01T07:24:34.404306Z","shell.execute_reply":"2024-04-01T07:24:34.402938Z","shell.execute_reply.started":"2024-04-01T07:24:33.643935Z"},"trusted":true},"outputs":[],"source":["numerical_columns = train_df.select_dtypes(include=[np.number]).columns\n","sns.heatmap(train_df[numerical_columns].corr(), 
annot=True)"]},{"cell_type":"markdown","metadata":{},"source":["We see that Fare and Parch are positively correlated with Survived. Similarly, Fare and Class are negatively correlated, in the sense that the higher the Fare, the lower the Class number (remember that Class 1 < Class 2 < Class 3 at face value)."]},{"cell_type":"markdown","metadata":{},"source":["## Embarked"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.378151Z","iopub.status.busy":"2024-04-01T06:27:55.377756Z","iopub.status.idle":"2024-04-01T06:27:55.384785Z","shell.execute_reply":"2024-04-01T06:27:55.384101Z","shell.execute_reply.started":"2024-04-01T06:27:55.378107Z"},"trusted":true},"outputs":[],"source":["train_df[\"Embarked\"].head()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.386265Z","iopub.status.busy":"2024-04-01T06:27:55.385875Z","iopub.status.idle":"2024-04-01T06:27:55.635178Z","shell.execute_reply":"2024-04-01T06:27:55.633609Z","shell.execute_reply.started":"2024-04-01T06:27:55.386223Z"},"trusted":true},"outputs":[],"source":["sns.countplot(x = \"Embarked\", data = train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.638242Z","iopub.status.busy":"2024-04-01T06:27:55.637447Z","iopub.status.idle":"2024-04-01T06:27:55.699106Z","shell.execute_reply":"2024-04-01T06:27:55.698208Z","shell.execute_reply.started":"2024-04-01T06:27:55.638150Z"},"trusted":true},"outputs":[],"source":["train_df = pd.get_dummies(train_df, columns=[\"Embarked\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df = pd.get_dummies(test_df, columns=[\"Embarked\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Ticket 
(Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.700760Z","iopub.status.busy":"2024-04-01T06:27:55.700330Z","iopub.status.idle":"2024-04-01T06:27:55.708542Z","shell.execute_reply":"2024-04-01T06:27:55.707466Z","shell.execute_reply.started":"2024-04-01T06:27:55.700715Z"},"trusted":true},"outputs":[],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.710291Z","iopub.status.busy":"2024-04-01T06:27:55.709980Z","iopub.status.idle":"2024-04-01T06:27:55.722810Z","shell.execute_reply":"2024-04-01T06:27:55.721839Z","shell.execute_reply.started":"2024-04-01T06:27:55.710231Z"},"trusted":true},"outputs":[],"source":["example_ticket = \"A/5. 2151\"\n","example_ticket.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.726116Z","iopub.status.busy":"2024-04-01T06:27:55.725689Z","iopub.status.idle":"2024-04-01T06:27:55.738095Z","shell.execute_reply":"2024-04-01T06:27:55.737043Z","shell.execute_reply.started":"2024-04-01T06:27:55.726039Z"},"trusted":true},"outputs":[],"source":["tickets = []\n","for i in list(train_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","train_df[\"Ticket\"] = tickets\n","\n","# Do the same for the test 
set"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.740389Z","iopub.status.busy":"2024-04-01T06:27:55.739797Z","iopub.status.idle":"2024-04-01T06:27:55.755416Z","shell.execute_reply":"2024-04-01T06:27:55.754317Z","shell.execute_reply.started":"2024-04-01T06:27:55.740333Z"},"trusted":true},"outputs":[],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.790832Z","iopub.status.busy":"2024-04-01T06:27:55.790500Z","iopub.status.idle":"2024-04-01T06:27:55.841011Z","shell.execute_reply":"2024-04-01T06:27:55.839862Z","shell.execute_reply.started":"2024-04-01T06:27:55.790770Z"},"trusted":true},"outputs":[],"source":["train_df = pd.get_dummies(train_df, columns= [\"Ticket\"], prefix = \"TcktName\")\n","train_df.head(10)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df = pd.get_dummies(test_df, columns= [\"Ticket\"], prefix = \"TcktName\")\n","test_df.head(10)"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Pclass"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.843395Z","iopub.status.busy":"2024-04-01T06:27:55.842833Z","iopub.status.idle":"2024-04-01T06:27:56.089225Z","shell.execute_reply":"2024-04-01T06:27:56.087578Z","shell.execute_reply.started":"2024-04-01T06:27:55.843168Z"},"trusted":true},"outputs":[],"source":["sns.countplot(x = \"Pclass\", data = 
train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.092270Z","iopub.status.busy":"2024-04-01T06:27:56.091722Z","iopub.status.idle":"2024-04-01T06:27:56.162888Z","shell.execute_reply":"2024-04-01T06:27:56.161841Z","shell.execute_reply.started":"2024-04-01T06:27:56.092186Z"},"trusted":true},"outputs":[],"source":["train_df[\"Pclass\"] = train_df[\"Pclass\"].astype(\"category\")\n","train_df = pd.get_dummies(train_df, columns= [\"Pclass\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Pclass\"] = test_df[\"Pclass\"].astype(\"category\")\n","test_df = pd.get_dummies(test_df, columns= [\"Pclass\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Sex"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.164709Z","iopub.status.busy":"2024-04-01T06:27:56.164391Z","iopub.status.idle":"2024-04-01T06:27:56.205775Z","shell.execute_reply":"2024-04-01T06:27:56.204761Z","shell.execute_reply.started":"2024-04-01T06:27:56.164639Z"},"trusted":true},"outputs":[],"source":["train_df[\"Sex\"] = train_df[\"Sex\"].astype(\"category\")\n","train_df = pd.get_dummies(train_df, columns=[\"Sex\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Sex\"] = test_df[\"Sex\"].astype(\"category\")\n","test_df = pd.get_dummies(test_df, columns=[\"Sex\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Drop Passenger ID and Cabin 
(Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.207602Z","iopub.status.busy":"2024-04-01T06:27:56.207299Z","iopub.status.idle":"2024-04-01T06:27:56.215886Z","shell.execute_reply":"2024-04-01T06:27:56.214401Z","shell.execute_reply.started":"2024-04-01T06:27:56.207550Z"},"trusted":true},"outputs":[],"source":["train_df.drop(labels = [\"PassengerId\", \"Cabin\"], axis = 1, inplace = True)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.217917Z","iopub.status.busy":"2024-04-01T06:27:56.217536Z","iopub.status.idle":"2024-04-01T06:27:56.228150Z","shell.execute_reply":"2024-04-01T06:27:56.227230Z","shell.execute_reply.started":"2024-04-01T06:27:56.217854Z"},"trusted":true},"outputs":[],"source":["train_df.columns"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Drop the PassengerId and Cabin columns from the test set"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Print the columns of the test set"]},{"cell_type":"markdown","metadata":{},"source":[" \n","# Modeling"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.230086Z","iopub.status.busy":"2024-04-01T06:27:56.229809Z","iopub.status.idle":"2024-04-01T06:27:56.238557Z","shell.execute_reply":"2024-04-01T06:27:56.237679Z","shell.execute_reply.started":"2024-04-01T06:27:56.230040Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.svm import SVC\n","from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.metrics import 
accuracy_score"]},{"cell_type":"markdown","metadata":{},"source":["## Train - Test Split (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.240103Z","iopub.status.busy":"2024-04-01T06:27:56.239830Z","iopub.status.idle":"2024-04-01T06:27:56.256809Z","shell.execute_reply":"2024-04-01T06:27:56.255463Z","shell.execute_reply.started":"2024-04-01T06:27:56.240056Z"},"trusted":true},"outputs":[],"source":["train_df_len = len(train_df)\n","train_df_len"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.314730Z","iopub.status.busy":"2024-04-01T06:27:56.313986Z","iopub.status.idle":"2024-04-01T06:27:56.333564Z","shell.execute_reply":"2024-04-01T06:27:56.332507Z","shell.execute_reply.started":"2024-04-01T06:27:56.314635Z"},"trusted":true},"outputs":[],"source":["\n","train = train_df[:train_df_len]\n","test = test_df\n","\n","# Select all numerical values from train and test\n","numeric_train = train.select_dtypes(include=[np.number])\n","numeric_test = test.select_dtypes(include=[np.number]) \n","\n","\n","X_train = numeric_train.drop(labels=[\"Survived\",], axis=1)\n","y_train = numeric_train[\"Survived\"]\n","\n","# Split the train data into train and test sets with a 1/3 ratio\n","X_train, X_test, y_train, y_test = # Use the train_test_split function here\n","\n","\n","print(\"X_train\", len(X_train))\n","print(\"X_test\", len(X_test))\n","print(\"y_train\", len(y_train))\n","print(\"y_test\", len(y_test))\n","print(\"test\", len(numeric_test))"]},{"cell_type":"markdown","metadata":{},"source":["## Simple Logistic Regression 
(Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.335970Z","iopub.status.busy":"2024-04-01T06:27:56.335281Z","iopub.status.idle":"2024-04-01T06:27:56.368083Z","shell.execute_reply":"2024-04-01T06:27:56.366489Z","shell.execute_reply.started":"2024-04-01T06:27:56.335561Z"},"trusted":true},"outputs":[],"source":["logreg = LogisticRegression()\n","logreg.fit(X_train, y_train)\n","acc_log_train = round(logreg.score(X_train, y_train)*100,2) \n","acc_log_test = round(logreg.score(X_test,y_test)*100,2)\n","# Print the accuracy on the training and test set"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Hyperparameter Tuning -- Grid Search -- Cross Validation\n","We will compare 5 ml classifier and evaluate mean accuracy of each of them by stratified cross validation.\n","\n","* Decision Tree\n","* SVM\n","* Random Forest\n","* KNN\n","* Logistic Regression"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.371066Z","iopub.status.busy":"2024-04-01T06:27:56.370400Z","iopub.status.idle":"2024-04-01T06:27:56.401742Z","shell.execute_reply":"2024-04-01T06:27:56.396867Z","shell.execute_reply.started":"2024-04-01T06:27:56.370802Z"},"trusted":true},"outputs":[],"source":["random_state = 42\n","classifier = [DecisionTreeClassifier(random_state = random_state),\n"," SVC(random_state = random_state),\n"," RandomForestClassifier(random_state = random_state),\n"," LogisticRegression(random_state = random_state),\n"," KNeighborsClassifier()]\n","\n","dt_param_grid = {\"min_samples_split\" : range(10,500,20),\n"," \"max_depth\": range(1,20,2)}\n","\n","svc_param_grid = {\"kernel\" : [\"rbf\"],\n"," \"gamma\": [0.001, 0.01, 0.1, 1],\n"," \"C\": [1,10,50,100,200,300,1000]}\n","\n","rf_param_grid = {\"max_features\": [1,3,10],\n"," \"min_samples_split\":[2,3,10],\n"," \"min_samples_leaf\":[1,3,10],\n"," \"bootstrap\":[False],\n"," 
\"n_estimators\":[100,300],\n"," \"criterion\":[\"gini\"]}\n","\n","logreg_param_grid = {\"C\":np.logspace(-3,3,7),\n"," \"penalty\": [\"l1\",\"l2\"]}\n","\n","knn_param_grid = {\"n_neighbors\": np.linspace(1,19,10, dtype = int).tolist(),\n"," \"weights\": [\"uniform\",\"distance\"],\n"," \"metric\":[\"euclidean\",\"manhattan\"]}\n","classifier_param = [dt_param_grid,\n"," svc_param_grid,\n"," rf_param_grid,\n"," logreg_param_grid,\n"," knn_param_grid]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.413811Z","iopub.status.busy":"2024-04-01T06:27:56.404322Z","iopub.status.idle":"2024-04-01T06:29:38.718970Z","shell.execute_reply":"2024-04-01T06:29:38.717807Z","shell.execute_reply.started":"2024-04-01T06:27:56.413658Z"},"trusted":true},"outputs":[],"source":["cv_result = []\n","best_estimators = []\n","for i in range(len(classifier)):\n"," clf = GridSearchCV(classifier[i], param_grid=classifier_param[i], cv = StratifiedKFold(n_splits = 10), scoring = \"accuracy\", n_jobs = -1,verbose = 1)\n"," clf.fit(X_train,y_train)\n"," cv_result.append(clf.best_score_)\n"," best_estimators.append(clf.best_estimator_)\n"," print(cv_result[i])"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:38.722928Z","iopub.status.busy":"2024-04-01T06:29:38.722207Z","iopub.status.idle":"2024-04-01T06:29:39.075423Z","shell.execute_reply":"2024-04-01T06:29:39.073987Z","shell.execute_reply.started":"2024-04-01T06:29:38.722582Z"},"trusted":true},"outputs":[],"source":["cv_results = pd.DataFrame({\"Cross Validation Means\":cv_result, \"ML Models\":[\"DecisionTreeClassifier\", \"SVM\",\"RandomForestClassifier\",\n"," \"LogisticRegression\",\n"," \"KNeighborsClassifier\"]})\n","\n","g = sns.barplot(x=\"Cross Validation Means\",y= \"ML Models\", data=cv_results)\n","g.set_xlabel(\"Mean Accuracy\")\n","g.set_title(\"Cross Validation 
Scores\")"]},{"cell_type":"markdown","metadata":{},"source":["## Ensemble Modeling (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.078654Z","iopub.status.busy":"2024-04-01T06:29:39.077840Z","iopub.status.idle":"2024-04-01T06:29:39.862871Z","shell.execute_reply":"2024-04-01T06:29:39.860937Z","shell.execute_reply.started":"2024-04-01T06:29:39.078554Z"},"trusted":true},"outputs":[],"source":["votingC = VotingClassifier(estimators = [(\"dt\",best_estimators[0]),\n"," (\"rfc\",best_estimators[2]),\n"," (\"lr\",best_estimators[3])],\n"," voting = \"soft\", n_jobs = -1)\n","votingC = votingC.fit(X_train, y_train)\n","\n","# Print the accuracy score of the voting classifier"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Drop the null values which are going to cause you an error in the next cell"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Prediction and Submission"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.865981Z","iopub.status.busy":"2024-04-01T06:29:39.865330Z","iopub.status.idle":"2024-04-01T06:29:39.977357Z","shell.execute_reply":"2024-04-01T06:29:39.973301Z","shell.execute_reply.started":"2024-04-01T06:29:39.865906Z"},"trusted":true},"outputs":[],"source":["test_survived = pd.Series(votingC.predict(numeric_test), name=\"Survived\").astype(int)\n","results = pd.concat([test_PassengerId, test_survived], axis=1)\n","results.to_csv(\"titanic.csv\", index=False)\n","print(results)"]},{"cell_type":"markdown","metadata":{},"source":["# Congratulations on finishing the assignment!!\n","\n","### The submission is the titanic.csv which was just created, and this file which you have 
modified."]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"databundleVersionId":26502,"sourceId":3136,"sourceType":"competition"}],"dockerImageVersionId":29852,"isGpuEnabled":false,"isInternetEnabled":false,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.7"}},"nbformat":4,"nbformat_minor":4}
+{"cells":[{"cell_type":"markdown","metadata":{},"source":[" \n","# Ignore this"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n"]}],"source":["import numpy as np # linear algebra\n","import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n","import matplotlib.pyplot as plt\n","plt.style.use(\"seaborn-v0_8-whitegrid\")\n","\n","import seaborn as sns\n","\n","from collections import Counter\n","\n","import warnings\n","warnings.filterwarnings(\"ignore\")"]},{"cell_type":"markdown","metadata":{},"source":[" \n","# Load and Check Data"]},{"cell_type":"markdown","metadata":{},"source":["DataFrames hold the dataset in a tabular format for easy manipulation and analysis. \n","CSV data is read into 'df' using Pandas' read_csv() function."]},{"cell_type":"code","execution_count":3,"metadata":{"_kg_hide-input":true,"execution":{"iopub.execute_input":"2024-04-01T06:45:27.416192Z","iopub.status.busy":"2024-04-01T06:45:27.415763Z","iopub.status.idle":"2024-04-01T06:45:27.433162Z","shell.execute_reply":"2024-04-01T06:45:27.431944Z","shell.execute_reply.started":"2024-04-01T06:45:27.416105Z"},"trusted":true},"outputs":[],"source":["train_df = pd.read_csv(\"./data/train.csv\")"]},{"cell_type":"markdown","metadata":{},"source":["### 1. 
Try to read the test .csv file into test_df"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.436611Z","iopub.status.busy":"2024-04-01T06:45:27.435916Z","iopub.status.idle":"2024-04-01T06:45:27.449974Z","shell.execute_reply":"2024-04-01T06:45:27.448230Z","shell.execute_reply.started":"2024-04-01T06:45:27.436517Z"},"trusted":true},"outputs":[],"source":["test_df = pd.read_csv(\"./data/test.csv\")\n","test_PassengerId = test_df[\"PassengerId\"]"]},{"cell_type":"code","execution_count":5,"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","execution":{"iopub.execute_input":"2024-04-01T06:45:27.452397Z","iopub.status.busy":"2024-04-01T06:45:27.451949Z","iopub.status.idle":"2024-04-01T06:45:27.462622Z","shell.execute_reply":"2024-04-01T06:45:27.461859Z","shell.execute_reply.started":"2024-04-01T06:45:27.452348Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["The Columns of train_df are: \n"]},{"data":{"text/plain":["Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n"," 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],\n"," dtype='object')"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["print(\"The Columns of train_df are: \")\n","train_df.columns"]},{"cell_type":"markdown","metadata":{},"source":["### We can use head() to see the first few rows in the dataframe"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.464289Z","iopub.status.busy":"2024-04-01T06:45:27.463866Z","iopub.status.idle":"2024-04-01T06:45:27.491984Z","shell.execute_reply":"2024-04-01T06:45:27.491110Z","shell.execute_reply.started":"2024-04-01T06:45:27.464242Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Survived
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked
\n","
\n"," \n"," \n","
\n","
0
\n","
1
\n","
0
\n","
3
\n","
Braund, Mr. Owen Harris
\n","
male
\n","
22.0
\n","
1
\n","
0
\n","
A/5 21171
\n","
7.2500
\n","
NaN
\n","
S
\n","
\n","
\n","
1
\n","
2
\n","
1
\n","
1
\n","
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n","
female
\n","
38.0
\n","
1
\n","
0
\n","
PC 17599
\n","
71.2833
\n","
C85
\n","
C
\n","
\n","
\n","
2
\n","
3
\n","
1
\n","
3
\n","
Heikkinen, Miss. Laina
\n","
female
\n","
26.0
\n","
0
\n","
0
\n","
STON/O2. 3101282
\n","
7.9250
\n","
NaN
\n","
S
\n","
\n","
\n","
3
\n","
4
\n","
1
\n","
1
\n","
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n","
female
\n","
35.0
\n","
1
\n","
0
\n","
113803
\n","
53.1000
\n","
C123
\n","
S
\n","
\n","
\n","
4
\n","
5
\n","
0
\n","
3
\n","
Allen, Mr. William Henry
\n","
male
\n","
35.0
\n","
0
\n","
0
\n","
373450
\n","
8.0500
\n","
NaN
\n","
S
\n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked \n","0 0 A/5 21171 7.2500 NaN S \n","1 0 PC 17599 71.2833 C85 C \n","2 0 STON/O2. 3101282 7.9250 NaN S \n","3 0 113803 53.1000 C123 S \n","4 0 373450 8.0500 NaN S "]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["train_df.head()"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.494326Z","iopub.status.busy":"2024-04-01T06:45:27.493637Z","iopub.status.idle":"2024-04-01T06:45:27.541999Z","shell.execute_reply":"2024-04-01T06:45:27.541210Z","shell.execute_reply.started":"2024-04-01T06:45:27.494251Z"},"jupyter":{"source_hidden":true},"trusted":true},"outputs":[{"data":{"text/html":["
"],"text/plain":[" PassengerId Pclass Name Sex \\\n","0 892 3 Kelly, Mr. James male \n","1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n","2 894 2 Myles, Mr. Thomas Francis male \n","3 895 3 Wirz, Mr. Albert male \n","4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n","\n"," Age SibSp Parch Ticket Fare Cabin Embarked \n","0 34.5 0 0 330911 7.8292 NaN Q \n","1 47.0 1 0 363272 7.0000 NaN S \n","2 62.0 0 0 240276 9.6875 NaN Q \n","3 27.0 0 0 315154 8.6625 NaN S \n","4 22.0 1 1 3101298 12.2875 NaN S "]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["### 3. Now try checking for a description of test_df's data"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"data":{"text/html":["
Embarked: port where passenger embarked ( C = Cherbourg, Q = Queenstown, S = Southampton )
\n","\n"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.544377Z","iopub.status.busy":"2024-04-01T06:45:27.543901Z","iopub.status.idle":"2024-04-01T06:45:27.557229Z","shell.execute_reply":"2024-04-01T06:45:27.555972Z","shell.execute_reply.started":"2024-04-01T06:45:27.544320Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","RangeIndex: 891 entries, 0 to 890\n","Data columns (total 12 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 PassengerId 891 non-null int64 \n"," 1 Survived 891 non-null int64 \n"," 2 Pclass 891 non-null int64 \n"," 3 Name 891 non-null object \n"," 4 Sex 891 non-null object \n"," 5 Age 714 non-null float64\n"," 6 SibSp 891 non-null int64 \n"," 7 Parch 891 non-null int64 \n"," 8 Ticket 891 non-null object \n"," 9 Fare 891 non-null float64\n"," 10 Cabin 204 non-null object \n"," 11 Embarked 889 non-null object \n","dtypes: float64(2), int64(5), object(5)\n","memory usage: 83.7+ KB\n"]}],"source":["train_df.info()"]},{"cell_type":"markdown","metadata":{},"source":["### Slice Rows and Columsn of DF (Assigmennt)"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:53:12.214069Z","iopub.status.busy":"2024-04-01T06:53:12.213708Z","iopub.status.idle":"2024-04-01T06:53:12.223150Z","shell.execute_reply":"2024-04-01T06:53:12.222195Z","shell.execute_reply.started":"2024-04-01T06:53:12.214014Z"},"trusted":true},"outputs":[{"data":{"text/plain":["PassengerId 3\n","Survived 1\n","Pclass 3\n","Name Heikkinen, Miss. Laina\n","Sex female\n","Age 26.0\n","SibSp 0\n","Parch 0\n","Ticket STON/O2. 
3101282\n","Fare 7.925\n","Cabin NaN\n","Embarked S\n","Name: 2, dtype: object"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["# Printing the third row (iloc is zero-based, so position 2)\n","train_df.iloc[2]"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"data":{"text/plain":["PassengerId 6\n","Survived 0\n","Pclass 3\n","Name Moran, Mr. James\n","Sex male\n","Age NaN\n","SibSp 0\n","Parch 0\n","Ticket 330877\n","Fare 8.4583\n","Cabin NaN\n","Embarked Q\n","Name: 5, dtype: object"]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["# Print the row at position 5 (iloc is zero-based, so this is the sixth row)\n","train_df.iloc[5]"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:14.398373Z","iopub.status.busy":"2024-04-01T06:54:14.398006Z","iopub.status.idle":"2024-04-01T06:54:14.407886Z","shell.execute_reply":"2024-04-01T06:54:14.406590Z","shell.execute_reply.started":"2024-04-01T06:54:14.398326Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 male\n","1 female\n","2 female\n","3 female\n","4 male\n"," ... \n","886 male\n","887 female\n","888 female\n","889 male\n","890 male\n","Name: Sex, Length: 891, dtype: object"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["# Print the Sex Column\n","train_df['Sex']"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:24.550687Z","iopub.status.busy":"2024-04-01T06:54:24.550286Z","iopub.status.idle":"2024-04-01T06:54:24.555255Z","shell.execute_reply":"2024-04-01T06:54:24.553923Z","shell.execute_reply.started":"2024-04-01T06:54:24.550616Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 Braund, Mr. Owen Harris\n","1 Cumings, Mrs. John Bradley (Florence Briggs Th...\n","2 Heikkinen, Miss. Laina\n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel)\n","4 Allen, Mr. William Henry\n"," ... \n","886 Montvila, Rev. Juozas\n","887 Graham, Miss. Margaret Edith\n","888 Johnston, Miss. 
Catherine Helen \"Carrie\"\n","889 Behr, Mr. Karl Howell\n","890 Dooley, Mr. Patrick\n","Name: Name, Length: 891, dtype: object"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["# Print the Name Column\n","train_df['Name']"]},{"cell_type":"markdown","metadata":{},"source":["## Visualization (Assignment)"]},{"cell_type":"markdown","metadata":{},"source":["### Age -- Survived"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:13:34.450088Z","iopub.status.busy":"2024-04-01T07:13:34.449302Z","iopub.status.idle":"2024-04-01T07:13:34.932717Z","shell.execute_reply":"2024-04-01T07:13:34.930449Z","shell.execute_reply.started":"2024-04-01T07:13:34.450021Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","# Plot 1: Survivors vs Non Survivors\n","\n","# Creating a plot for the Survived Column\n","sns.countplot(x='Survived', data=train_df)\n","\n","plt.title('Survivors vs Non Survivors')\n","plt.xlabel('Survived')\n","plt.ylabel('Count')\n","plt.xticks([0, 1], ['No', 'Yes']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Plotting Passenger Class"]},{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:14:31.768779Z","iopub.status.busy":"2024-04-01T07:14:31.768341Z","iopub.status.idle":"2024-04-01T07:14:32.062495Z","shell.execute_reply":"2024-04-01T07:14:32.060660Z","shell.execute_reply.started":"2024-04-01T07:14:31.768690Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","\n","# Make the plot for Pclass here:\n","sns.countplot(x='Pclass', data=train_df)\n","\n","plt.title('Count of Passengers In each Passenger Class')\n","plt.xlabel('Passenger Class')\n","plt.ylabel('Count')\n","plt.xticks([0, 1, 2], ['1st', '2nd', '3rd']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try it for \"Embarked\""]},{"cell_type":"code","execution_count":17,"metadata":{"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","sns.countplot(x='Embarked', data=train_df)\n","plt.title('Count of Passengers by Embarkation Point')\n","plt.xlabel('Embarkation Point')\n","plt.ylabel('Count')\n","plt.xticks([0, 1, 2], ['C', 'Q', 'S'])\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Making a histogram for \"Fare\""]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","sns.histplot(train_df['Fare'], bins=20, color='orange')\n","plt.title('Distribution of Passenger Fares')\n","plt.xlabel('Fare')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Here is the distplot for \"Fare\", refer to it after you tried it yourself:"]},{"cell_type":"code","execution_count":19,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:18:24.402882Z","iopub.status.busy":"2024-04-01T07:18:24.402274Z","iopub.status.idle":"2024-04-01T07:18:24.798062Z","shell.execute_reply":"2024-04-01T07:18:24.796669Z","shell.execute_reply.started":"2024-04-01T07:18:24.402828Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["sns.histplot(train_df['Fare'], bins=20, color='orange')\n","plt.title('Distribution of Passenger Fares')\n","plt.xlabel('Fare')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Make a histogram for \"Age\" (Assignment)"]},{"cell_type":"code","execution_count":20,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:19:53.874413Z","iopub.status.busy":"2024-04-01T07:19:53.873686Z","iopub.status.idle":"2024-04-01T07:19:54.244996Z","shell.execute_reply":"2024-04-01T07:19:54.243521Z","shell.execute_reply.started":"2024-04-01T07:19:53.874351Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["# Create the plot below\n","plt.figure(figsize=(8, 6))\n","sns.histplot(train_df['Age'], bins=20, color='green')\n","plt.title('Distribution of Age')\n","plt.xlabel('Age')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Fill Missing: Age Feature"]},{"cell_type":"code","execution_count":21,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:50.370496Z","iopub.status.busy":"2024-04-01T06:27:50.369419Z","iopub.status.idle":"2024-04-01T06:27:50.427731Z","shell.execute_reply":"2024-04-01T06:27:50.426655Z","shell.execute_reply.started":"2024-04-01T06:27:50.370387Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Survived
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked
\n","
\n"," \n"," \n","
\n","
5
\n","
6
\n","
0
\n","
3
\n","
Moran, Mr. James
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
330877
\n","
8.4583
\n","
NaN
\n","
Q
\n","
\n","
\n","
17
\n","
18
\n","
1
\n","
2
\n","
Williams, Mr. Charles Eugene
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
244373
\n","
13.0000
\n","
NaN
\n","
S
\n","
\n","
\n","
19
\n","
20
\n","
1
\n","
3
\n","
Masselmani, Mrs. Fatima
\n","
female
\n","
NaN
\n","
0
\n","
0
\n","
2649
\n","
7.2250
\n","
NaN
\n","
C
\n","
\n","
\n","
26
\n","
27
\n","
0
\n","
3
\n","
Emir, Mr. Farred Chehab
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
2631
\n","
7.2250
\n","
NaN
\n","
C
\n","
\n","
\n","
28
\n","
29
\n","
1
\n","
3
\n","
O'Dwyer, Miss. Ellen \"Nellie\"
\n","
female
\n","
NaN
\n","
0
\n","
0
\n","
330959
\n","
7.8792
\n","
NaN
\n","
Q
\n","
\n","
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
\n","
\n","
859
\n","
860
\n","
0
\n","
3
\n","
Razi, Mr. Raihed
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
2629
\n","
7.2292
\n","
NaN
\n","
C
\n","
\n","
\n","
863
\n","
864
\n","
0
\n","
3
\n","
Sage, Miss. Dorothy Edith \"Dolly\"
\n","
female
\n","
NaN
\n","
8
\n","
2
\n","
CA. 2343
\n","
69.5500
\n","
NaN
\n","
S
\n","
\n","
\n","
868
\n","
869
\n","
0
\n","
3
\n","
van Melkebeke, Mr. Philemon
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
345777
\n","
9.5000
\n","
NaN
\n","
S
\n","
\n","
\n","
878
\n","
879
\n","
0
\n","
3
\n","
Laleff, Mr. Kristo
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
349217
\n","
7.8958
\n","
NaN
\n","
S
\n","
\n","
\n","
888
\n","
889
\n","
0
\n","
3
\n","
Johnston, Miss. Catherine Helen \"Carrie\"
\n","
female
\n","
NaN
\n","
1
\n","
2
\n","
W./C. 6607
\n","
23.4500
\n","
NaN
\n","
S
\n","
\n"," \n","
\n","
177 rows × 12 columns
\n","
"],"text/plain":[" PassengerId Survived Pclass Name \\\n","5 6 0 3 Moran, Mr. James \n","17 18 1 2 Williams, Mr. Charles Eugene \n","19 20 1 3 Masselmani, Mrs. Fatima \n","26 27 0 3 Emir, Mr. Farred Chehab \n","28 29 1 3 O'Dwyer, Miss. Ellen \"Nellie\" \n",".. ... ... ... ... \n","859 860 0 3 Razi, Mr. Raihed \n","863 864 0 3 Sage, Miss. Dorothy Edith \"Dolly\" \n","868 869 0 3 van Melkebeke, Mr. Philemon \n","878 879 0 3 Laleff, Mr. Kristo \n","888 889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n","\n"," Sex Age SibSp Parch Ticket Fare Cabin Embarked \n","5 male NaN 0 0 330877 8.4583 NaN Q \n","17 male NaN 0 0 244373 13.0000 NaN S \n","19 female NaN 0 0 2649 7.2250 NaN C \n","26 male NaN 0 0 2631 7.2250 NaN C \n","28 female NaN 0 0 330959 7.8792 NaN Q \n",".. ... ... ... ... ... ... ... ... \n","859 male NaN 0 0 2629 7.2292 NaN C \n","863 female NaN 8 2 CA. 2343 69.5500 NaN S \n","868 male NaN 0 0 345777 9.5000 NaN S \n","878 male NaN 0 0 349217 7.8958 NaN S \n","888 female NaN 1 2 W./C. 
6607 23.4500 NaN S \n","\n","[177 rows x 12 columns]"]},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":["train_df[train_df[\"Age\"].isnull()]"]},{"cell_type":"markdown","metadata":{},"source":["### Try Checking for Null Values in Test Df"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[{"data":{"text/plain":["PassengerId 0\n","Pclass 0\n","Name 0\n","Sex 0\n","Age 86\n","SibSp 0\n","Parch 0\n","Ticket 0\n","Fare 1\n","Cabin 327\n","Embarked 0\n","dtype: int64"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["test_df.isnull().sum()"]},{"cell_type":"markdown","metadata":{},"source":["Run this to fix the Null Values"]},{"cell_type":"code","execution_count":23,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:21:48.194895Z","iopub.status.busy":"2024-04-01T07:21:48.194020Z","iopub.status.idle":"2024-04-01T07:21:49.449282Z","shell.execute_reply":"2024-04-01T07:21:49.447918Z","shell.execute_reply.started":"2024-04-01T07:21:48.194825Z"},"trusted":true},"outputs":[],"source":["# Impute each missing Age with the median Age of passengers that share the same\n","# SibSp, Parch and Pclass; fall back to the frame's overall median Age when\n","# that group has no known ages. Train and test are imputed independently.\n","for frame in (train_df, test_df):\n","    index_nan_age = list(frame.index[frame[\"Age\"].isnull()])\n","    for i in index_nan_age:\n","        same_group = (\n","            (frame[\"SibSp\"] == frame.loc[i, \"SibSp\"])\n","            & (frame[\"Parch\"] == frame.loc[i, \"Parch\"])\n","            & (frame[\"Pclass\"] == frame.loc[i, \"Pclass\"])\n","        )\n","        age_pred = frame.loc[same_group, \"Age\"].median()\n","        # Recomputed on every pass because earlier fills shift the overall median.\n","        age_med = frame[\"Age\"].median()\n","        # Assign through .loc[row, col]: the chained form frame[\"Age\"].iloc[i] = ...\n","        # may write to a temporary copy and leave the NaN in the frame.\n","        frame.loc[i, \"Age\"] = age_med if np.isnan(age_pred) else age_pred"]},
{"cell_type":"markdown","metadata":{},"source":["## Analysing the correlation between the different columns"]},{"cell_type":"code","execution_count":24,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:24:33.644174Z","iopub.status.busy":"2024-04-01T07:24:33.643621Z","iopub.status.idle":"2024-04-01T07:24:34.404306Z","shell.execute_reply":"2024-04-01T07:24:34.402938Z","shell.execute_reply.started":"2024-04-01T07:24:33.643935Z"},"trusted":true},"outputs":[{"data":{"text/plain":[""]},"execution_count":24,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["numerical_columns = train_df.select_dtypes(include=[np.number]).columns\n","sns.heatmap(train_df[numerical_columns].corr(), annot=True)"]},{"cell_type":"markdown","metadata":{},"source":["We see that Fare and Parch are positively correlated with Survived. Similarly, Fare and Class are negatively correlated, in the sense that the higher the higher the Fare, the lower the Class number (Remember that Class 1 < Class 2 < Class 3 in face value)."]},{"cell_type":"markdown","metadata":{},"source":["## Embarked"]},{"cell_type":"code","execution_count":25,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.378151Z","iopub.status.busy":"2024-04-01T06:27:55.377756Z","iopub.status.idle":"2024-04-01T06:27:55.384785Z","shell.execute_reply":"2024-04-01T06:27:55.384101Z","shell.execute_reply.started":"2024-04-01T06:27:55.378107Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 S\n","1 C\n","2 S\n","3 S\n","4 S\n","Name: Embarked, dtype: 
object"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["train_df[\"Embarked\"].head()"]},{"cell_type":"code","execution_count":26,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.386265Z","iopub.status.busy":"2024-04-01T06:27:55.385875Z","iopub.status.idle":"2024-04-01T06:27:55.635178Z","shell.execute_reply":"2024-04-01T06:27:55.633609Z","shell.execute_reply.started":"2024-04-01T06:27:55.386223Z"},"trusted":true},"outputs":[{"data":{"image/png":"iVBORw0KGgoAAAANSUhEUgAAAjMAAAGsCAYAAAAoiibJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAAApHklEQVR4nO3df1SUdf7//8csxACapqgk/mQ1i1AHhNVqdddcz/ojSxd0XWol39aqK+ies5mJVJqGlJh1UlJJM3/0Dn/QVqZv3d1q3czKxAVTNDHXotAClVwbYHKY7x99nU+zKNKEXrzkfjvHszvX65qZ5+WZ8H6ua2aweTwejwAAAAz1E6sHAAAA+DGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgNGIGAAAYjZgBAABGC7R6gCvh3Llz+vrrr2W32/WTn9BvAACYoKamRtXV1WrZsqUCAy+eLE0iZr7++msdO3bM6jEAAIAfunbtqrCwsIuuN4mYsdvtkr77ywgJCbF4GgAAUB+VlZU6duyY99/xi2kSMXP+0lJISIhCQ0MtngYAAPwQl3qLCG8gAQAARiNmAACA0YgZAABgNGIGAAAYjZgBAABGI2YAAIDRiBkAAGA0YgYAABiNmAEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgtECrBzBF3INrrB4BjUh+VrLVIwAA/n+cmQEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgNGIGAAAYjZgBAABGI2YAAIDRiBkAAGA0YgYAABiNmAEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAQAARrM0Zlwulx577DH97Gc/02233aZFixbJ4/FIkoqKijRmzBg5HA4lJiZq//79Pvd94403NHjwYDkcDqWkpOjUqVNWHAIAALCYpTHz+OOPa9euXVq5cqWeeuopbdiwQevXr5fT6dTEiRMVHx+vV155RbGxsZo0aZKcTqckad++fUpPT1dqaqrWr1+vM2fOKC0tzcpDAQAAFgm06okrKiqUl5enVatWqXfv3pKkCRMmqLCwUIGBgbLb7ZoxY4ZsNpvS09P1z3/+U9u2bVNCQoLWrVunYcOGadSoUZKkBQsW6Pbbb1dJSYk6depk1SEBAAALWHZmJj8/X82bN1ffvn292yZOnKjMzEwVFhYqLi5ONptNkmSz2dSnTx8VFBRIkgoLCxUfH++9X/v27RUREaHCwsIregwAAMB6lsVMSUmJOnTooFdffVVDhw7Vr371K2VnZ6umpkZlZWVq166dz/5hYWE6ceKEJOmrr76qcx0AADQdll1mcjqd+vTTT5Wbm6vMzEyVlZXp0UcfVUhIiCorKxUUFOSzf1BQkFwulySpqqqqzvWLcbvdcrvdDXsgaJJ4HQHA5Vffn7WWxUxgYKDOnj2rp556Sh06dJAklZaW6uWXX1aXLl1qhYnL5VJwcLAkyW63X3A9JCSkzu
c8fPhwAx4BmrLzlzwBANazLGbatm0ru93uDRlJioyM1PHjx9W3b1+Vl5f77F9eXu69tBQeHn7B9bZt29b5nD169FBoaKh/A+ce8O9+uCrFxMRYPQIAXPWcTme9TkRYFjMOh0PV1dX697//rcjISEnS0aNH1aFDBzkcDj3//PPyeDyy2WzyeDzau3evJk+e7L1vfn6+EhISJEnHjx/X8ePH5XA46nzOgIAABQQEXN4DQ5PA6wgALr/6/qy17A3AP/3pTzVw4EClpaXp0KFDeuedd5STk6OkpCQNHTpUZ86cUUZGho4cOaKMjAxVVlZq2LBhkqSkpCS99tpr2rhxow4dOqQZM2Zo4MCBfCwbAIAmyNIvzVu4cKE6d+6spKQkPfTQQ7rnnns0btw4NW/eXMuXL/eefSksLFROTo73ElFsbKzmzp2r7OxsJSUlqWXLlsrMzLTyUAAAgEVsnvO/P+Aq5nQ6dfDgQUVFRfn9npm4B9c08FQwWX5WstUjAMBVr77/fvOLJgEAgNGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgNGIGAAAYjZgBAABGI2YAAIDRiBkAAGA0YgYAABiNmAEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgNGIGAAAYjZgBAABGI2YAAIDRiBkAAGA0YgYAABiNmAEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgNGIGAAAYjZgBAABGI2YAAIDRiBkAAGA0YgYAABiNmAEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgNGIGAAAYzdKY+dvf/qYbb7zR58+0adMkSUVFRRozZowcDocSExO1f/9+n/u+8cYbGjx4sBwOh1JSUnTq1CkrDgEAAFjM0pg5cuSIbr/9du3cudP75/HHH5fT6dTEiRMVHx+vV155RbGxsZo0aZKcTqckad++fUpPT1dqaqrWr1+vM2fOKC0tzcpDAQAAFrE0Zj755BP16NFDbdu29f5p0aKFtm7dKrvdrhkzZqhbt25KT09Xs2bNtG3bNknSunXrNGzYMI0aNUo33XSTFixYoB07dqikpMTKwwEAABawPGa6du1aa3thYaHi4uJks9kkSTabTX369FFBQYF3PT4+3rt/+/btFRERocLCwisxNgAAaEQCrXpij8ejf//739q5c6eWL18ut9utoUOHatq0aSorK1P37t199g8LC1NxcbEk6auvvlK7du1qrZ84caLO53S73XK73Q17IGiSeB0BwOVX35+1lsVMaWmpKisrFRQUpGeeeUaff/65Hn/8cVVVVXm3f19QUJBcLpckqaqqqs71izl8+HDDHgSarPNnCQEA1rMsZjp06KAPPvhALVu2lM1mU1RUlGpqavTggw+qb9++tcLE5XIpODhYkmS32y+4HhISUudz9ujRQ6Ghof4NnHvAv/vhqhQTE2P1CABw1XM6nfU6EWFZzEjSdddd53O7W7duqq6uVtu2bVVeXu6zVl5e7r20FB4efsH1tm3b1vl8AQEBCggI+PGDo8njdQQAl199f9Za9gbgd955R/369VNlZaV328GDB3XdddcpLi5O//rXv+TxeCR99/6avXv3yuFwSJIcDofy8/O99zt+/LiOHz/uXQcAAE2HZTETGxsru92uhx9+WEePHtWOHTu0YMEC3X///Ro6dKjOnDmjjIwMHTlyRBkZGaqsrNSwYcMkSUlJSXrttde0ceNGHTp0SDNmzNDAgQPVqVMnqw4HAABYxLKYad68uVauXKlTp04pMTFR6enpGjt2rO6//341b95cy5cvV35+vhISElRYWKicnBzv+11iY2M1d+5cZWdnKykpSS1btlRmZqZVhwIAACxk85y/lnMVczqdOnjwoKKiovx+A3Dcg2saeCqYLD8r2eoRAOCqV99/v/lFkwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAA
DAaMQMAAAwGjEDAACMRswAAACjETMAAMBoxAwAADAaMQMAAIxGzAAAAKMRMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAwGjEDAACMRswAAACjETMAAMBoxAwAADAaMQMAAIxGzAAAAKMRMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAwGjEDAACMRswAAACjETMAAMBoxAwAADAaMQMAAIxGzAAAAKMRMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAwWqOJmYkTJ2rmzJne20VFRRozZowcDocSExO1f/9+n/3feOMNDR48WA6HQykpKTp16tSVHhkAADQCjSJmtmzZoh07dnhvO51OTZw4UfHx8XrllVcUGxurSZMmyel0SpL27dun9PR0paamav369Tpz5ozS0tKsGh8AAFjI8pipqKjQggUL1KtXL++2rVu3ym63a8aMGerWrZvS09PVrFkzbdu2TZK0bt06DRs2TKNGjdJNN92kBQsWaMeOHSopKbHqMAAAgEUsj5knn3xSI0eOVPfu3b3bCgsLFRcXJ5vNJkmy2Wzq06ePCgoKvOvx8fHe/du3b6+IiAgVFhZe0dkBAID1LI2Z9957T3v27NGUKVN8tpeVlaldu3Y+28LCwnTixAlJ0ldffVXnOgAAaDoCrXri6upqzZ49W48++qiCg4N91iorKxUUFOSzLSgoSC6XS5JUVVVV5/rFuN1uud3uBpgeTR2vIwC4/Or7s9aymFmyZIl69uypAQMG1Fqz2+21wsTlcnmj52LrISEhdT7n4cOHf+TUwHfOX/IEAFjPspjZsmWLysvLFRsbK0neONm+fbtGjBih8vJyn/3Ly8u9l5bCw8MvuN62bds6n7NHjx4KDQ31b+DcA/7dD1elmJgYq0cAgKue0+ms14kIy2Jm7dq1OnfunPf2woULJUnTp0/Xhx9+qOeff14ej0c2m00ej0d79+7V5MmTJUkOh0P5+flKSEiQJB0/flzHjx+Xw+Go8zkDAgIUEBBwmY4ITQmvIwC4/Or7s9aymOnQoYPP7WbNmkmSunTporCwMD311FPKyMjQ7373O+Xm5qqyslLDhg2TJCUlJWncuHGKiYlRr169lJGRoYEDB6pTp05X/DgAAIC1LP9o9oU0b95cy5cv9559KSwsVE5OjvcSUWxsrObOnavs7GwlJSWpZcuWyszMtHhqAABgBZvH4/FYPcTl5nQ6dfDgQUVFRfn9npm4B9c08FQwWX5WstUjAMBVr77/fjfKMzMAAAD1RcwAAACjETMAAMBoxAwAADAaMQMAAIxGzAAAAKMRMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAwGjEDAACM5lfMJCcn68yZM7W2nzp1SgkJCT96KAAAgPoKrO+O//znP7Vv3z5J0ocffqhly5bV+nXcn376qb744ouGnRAAAKAO9Y6ZyMhIrVixQh6PRx6PR3v37tU111zjXbfZbAoNDVVGRsZlGRQAAOBC6h0znTp10po1ayRJaWlpSk9PV/PmzS/bYAAAAPVR75j5vszMTElSWVmZzp07J4/H47MeERHx4ycDAACoB79i5t1339Ujjzyi48ePS5I8Ho9sNpv3fw8ePNigQwIAAFyMXzEzd+5c9e7dW0uXLuVSEwAAsJRfMXPixAmtWLFCnTp1auh5AAAAfhC/vmcmPj5e+fn5DT0LAADAD+bXmZmf/exneuyxx/SPf/xDXbp08fmItiSlpqY2yHAAAACX4vcbgHv27KmTJ0/q5MmTPms2m61BBgMAAKgPv2Jm7dq1DT0HAACAX/yKmVdffbXO9VGjRvnzsAAAAD+YXzHz7LPP+tx2u906efKkAgMD1bt3b2IGAABcMX7FzFtvvVVr2zfffKNHH31UN954448eCgAAoL78+mj2hTRr1kxTp07VqlWrGuohAQAALqnBYkaSDh06pJqamoZ8SAAAgD
r5dZlp3LhxtT6C/c033+jjjz/W+PHjG2IuAACAevErZvr161drW1BQkKZPn65bb731Rw8FAABQX37FzPe/4ffs2bNyu91q2bJlgw0FAABQX37FjCStXr1aK1asUHl5uSSpdevWSkpK4lcZAACAK8qvmMnOzta6dev0pz/9SbGxsaqpqdHevXu1ZMkSBQUFaeLEiQ09JwAAwAX5FTMbNmxQRkaGBg0a5N0WFRWl8PBwZWRkEDMAAOCK8euj2WfPnlXXrl1rbY+MjNSpU6d+7EwAAAD15lfMxMbG6oUXXvD5Thm3262VK1eqd+/eDTYcAADApfh1mSktLU333HOPdu3apejoaEnSgQMH5HK5tGLFigYdEAAAoC5+xUy3bt00a9YsVVRU6OjRo7Lb7Xr77bf17LPP6qabbmroGQEAAC7Kr8tMa9eu1Zw5c3Tttddqzpw5SktL07hx4zR9+nRt2LChoWcEAAC4KL9iZtWqVXrqqaf0m9/8xrvtoYceUlZWlnJychpsOAAAgEvxK2ZOnz6tzp0719oeGRnp/RI9AACAK8GvmImLi9PixYtVWVnp3VZdXa1ly5YpNja23o/z6aef6r777lNsbKwGDhzo8+bhkpISjR8/XjExMRo+fLh27tzpc99du3ZpxIgRcjgcSk5OVklJiT+HAgAADOdXzDz66KPav3+/+vfvr8TERCUmJqp///766KOP9Oijj9brMWpqajRx4kS1atVKf/nLX/TYY49p6dKl2rx5szwej1JSUtSmTRvl5eVp5MiRSk1NVWlpqSSptLRUKSkpSkhI0KZNm9S6dWtNmTJFHo/Hn8MBAAAG8+vTTJ07d9bWrVv1zjvv6NixYwoMDFTXrl3Vv39/BQQE1OsxysvLFRUVpTlz5qh58+bq2rWrbr31VuXn56tNmzYqKSlRbm6uQkND1a1bN7333nvKy8vT1KlTtXHjRvXs2VMTJkyQJGVmZurnP/+5du/efcHf6A0AAK5efv+iyaCgIP3qV7/y+4nbtWunZ555RpLk8Xi0d+9effjhh5o9e7YKCwt18803KzQ01Lt/XFycCgoKJEmFhYWKj4/3roWEhCg6OloFBQXEDAAATYxfl5ka2qBBg3T33XcrNjZWQ4YMUVlZmdq1a+ezT1hYmE6cOCFJl1wHAABNh99nZhrSs88+q/Lycs2ZM0eZmZmqrKxUUFCQzz5BQUFyuVySdMn1i3G73XK73Q07PJokXkcAcPnV92dto4iZXr16SfruE1HTp09XYmKizyelJMnlcik4OFiSZLfba4WLy+VSixYt6nyew4cPN+DUaMrOX/IEAFjPspgpLy9XQUGBBg8e7N3WvXt3ffvtt2rbtq2OHj1aa//zl5bCw8NrfZ/N+TcU16VHjx4+78P5QXIP+Hc/XJViYmKsHgEArnpOp7NeJyIsi5nPP/9cqamp2rFjh8LDwyVJ+/fvV+vWrRUXF6cXXnhBVVVV3rMx+fn5iouLkyQ5HA7l5+d7H6uyslJFRUVKTU2t8zkDAgLq/WkroC68jgDg8qvvz1rL3gDcq1cvRUdHa9asWTpy5Ih27NihrKwsTZ48WX379lX79u2Vlpam4uJi5eTkaN++fRo9erQkKTExUXv37lVOTo6Ki4uVlpamjh078kkmAACaIMtiJiAgQM8995xCQkI0duxYpaena9y4cUpOTvaulZWVKSEhQa+//rqys7MVEREhSerYsaMWL16svLw8jR49WhUVFcrOzpbNZrPqcAAAgEVsnibwtblOp1MHDx5UVFSU3++ZiXtwTQNPBZPlZyVbPQIAXPXq++93o/ieGQAAAH8RMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAwGjEDAACMRswAAACjETMAAMBoxAwAADAaMQMAAIxGzAAAAKMRMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAwGjEDAACMRswAAACjETMAAMBoxAwAADAaMQMAAI
xGzAAAAKMRMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAwGjEDAACMRswAAACjETMAAMBoxAwAADAaMQMAAIxGzAAAAKMRMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAwmqUx8+WXX2ratGnq27evBgwYoMzMTFVXV0uSSkpKNH78eMXExGj48OHauXOnz3137dqlESNGyOFwKDk5WSUlJVYcAgAAsJhlMePxeDRt2jRVVlbqpZde0tNPP623335bzzzzjDwej1JSUtSmTRvl5eVp5MiRSk1NVWlpqSSptLRUKSkpSkhI0KZNm9S6dWtNmTJFHo/HqsMBAAAWCbTqiY8ePaqCggK9++67atOmjSRp2rRpevLJJ/WLX/xCJSUlys3NVWhoqLp166b33ntPeXl5mjp1qjZu3KiePXtqwoQJkqTMzEz9/Oc/1+7du9WvXz+rDgkAAFjAsjMzbdu21YoVK7whc97Zs2dVWFiom2++WaGhod7tcXFxKigokCQVFhYqPj7euxYSEqLo6GjvOgAAaDosOzPTokULDRgwwHu7pqZG69at0y233KKysjK1a9fOZ/+wsDCdOHFCki65fjFut1tut7uBjgBNGa8jALj86vuz1rKY+W9ZWVkqKirSpk2b9OKLLyooKMhnPSgoSC6XS5JUWVlZ5/rFHD58uGGHRpPFWUAAaDwaRcxkZWVp9erVevrpp9WjRw/Z7XZVVFT47ONyuRQcHCxJstvttcLF5XKpRYsWdT5Pjx49fC5d/SC5B/y7H65KMTExVo8AAFc9p9NZrxMRlsfMvHnz9PLLLysrK0tDhgyRJIWHh+vIkSM++5WXl3svLYWHh6u8vLzWelRUVJ3PFRAQoICAgAacHk0VryMAuPzq+7PW0u+ZWbJkiXJzc7Vo0SLdcccd3u0Oh0MHDhxQVVWVd1t+fr4cDod3PT8/37tWWVmpoqIi7zoAAGg6LIuZTz75RM8995z+8Ic/KC4uTmVlZd4/ffv2Vfv27ZWWlqbi4mLl5ORo3759Gj16tCQpMTFRe/fuVU5OjoqLi5WWlqaOHTvysWwAAJogy2LmzTfflNvt1tKlS9W/f3+fPwEBAXruuedUVlamhIQEvf7668rOzlZERIQkqWPHjlq8eLHy8vI0evRoVVRUKDs7WzabzarDAQAAFrF5msDX5jqdTh08eFBRUVF+vwE47sE1DTwVTJaflWz1CABw1avvv9/8okkAAGA0YgYAABiNmAEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgNGIGAAAYjZgBAABGC7R6AAD+43eG4fv4nWFoqjgzAwAAjEbMAAAAoxEzAADAaMQMAAAwGjEDAACMRswAAACjETMAAMBoxAwAADAaMQMAAIxGzAAAAKMRMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAwGjEDAACMRswAAACjETMAAMBoxAwAADAaMQMAAIxGzAAAAKMRMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAwGjEDAACMRswAAACjNYqYcblcGjFihD744APvtpKSEo0fP14xMTEaPny4du7c6XOfXbt2acSIEXI4HEpOTlZJScmVHhsAADQClsdMdXW1/vznP6u4uNi7zePxKCUlRW3atFFeXp5Gjhyp1NRUlZaWSpJKS0uVkpKihIQEbdq0Sa1bt9aUKVPk8XisOgwAAGARS2PmyJEj+u1vf6vPPvvMZ/v777+vkpISzZ07V926ddOkSZMUExOjvLw8SdLGjRvVs2dPTZgwQTfccIMyMzP1xRdfaPfu3VYcBgAAsJClMbN7927169dP69ev99leWFiom2++WaGhod5tcXFxKigo8K7Hx8d710JCQhQdHe1dBwAATUeglU9+9913X3B7WVmZ2rVr57MtLCxMJ06cqNc6AABoOiyNmYuprKxUUFCQz7agoCC5XK
56rV+M2+2W2+1u2GHRJPE6QmPE6xJXm/q+phtlzNjtdlVUVPhsc7lcCg4O9q7/d7i4XC61aNGizsc9fPhwg86JpotLmmiMeF2iqWqUMRMeHq4jR474bCsvL/deWgoPD1d5eXmt9aioqDoft0ePHj7vw/lBcg/4dz9clWJiYqwe4Tu8LvE9jeZ1CTQQp9NZrxMRjTJmHA6HcnJyVFVV5T0bk5+fr7i4OO96fn6+d//KykoVFRUpNTW1zscNCAhQQEDA5RscTQavIzRGvC5xtanva9ry75m5kL59+6p9+/ZKS0tTcXGxcnJytG/fPo0ePVqSlJiYqL179yonJ0fFxcVKS0tTx44d1a9fP4snBwAAV1qjjJmAgAA999xzKisrU0JCgl5//XVlZ2crIiJCktSxY0ctXrxYeXl5Gj16tCoqKpSdnS2bzWbx5AAA4EprNJeZPv74Y5/bXbp00bp16y66/y9/+Uv98pe/vNxjAQCARq5RnpkBAACoL2IGAAAYjZgBAABGI2YAAIDRiBkAAGA0YgYAABit0Xw0GwBgvrgH11g9AhqR/KzkK/I8nJkBAABGI2YAAIDRiBkAAGA0YgYAABiNmAEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgNGIGAAAYjZgBAABGI2YAAIDRiBkAAGA0YgYAABiNmAEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgNGIGAAAYjZgBAABGI2YAAIDRiBkAAGA0YgYAABiNmAEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAQAARiNmAACA0YgZAABgNGIGAAAYzeiYqa6u1qxZsxQfH6/+/fvrhRdesHokAABwhQVaPcCPsWDBAu3fv1+rV69WaWmpHnroIUVERGjo0KFWjwYAAK4QY2PG6XRq48aNev755xUdHa3o6GgVFxfrpZdeImYAAGhCjL3MdOjQIZ07d06xsbHebXFxcSosLFRNTY2FkwEAgCvJ2JgpKytTq1atFBQU5N3Wpk0bVVdXq6KiwrrBAADAFWXsZabKykqfkJHkve1yuXy2nz9T880338jtdvv1fJ1bBft1P1yd/vOf/1g9giRel/DVGF6XvCbxfT/2NVlVVSVJl7ziYmzM2O32WtFy/nZwsO9/TNXV1ZKkzz77zO/ne3hIN7/vi6vP4cOHrR5BEq9L+GoMr0tek/i+hnpNVldXq3nz5hddNzZmwsPDdfr0aZ07d06Bgd8dRllZmYKDg9WiRQuffVu2bKmuXbvKbrfrJz8x9soaAABNSk1Njaqrq9WyZcs69zM2ZqKiohQYGKiCggLFx8dLkvLz89WrV69awRIYGKiwsDArxgQAAD9CXWdkzjP2NEVISIhGjRqlOXPmaN++ffr73/+uF154QcnJyVaPBgAAriCbx+PxWD2EvyorKzVnzhz99a9/VfPmzXXfffdp/PjxVo911fn222+1bNkyvfrqq/ryyy/Vpk0bDRkyRFOnTq1XMQOXw9dff62lS5fqr3/9q06ePKmIiAiNHTtWycnJXE4GmhijYwZXRmZmpnbt2qVZs2apU6dOKikpUUZGhjp27Khly5ZZPR6aoNOnT2vs2LFq166dUlJS1LFjR3300UeaN2+ehg8frkceecTqEdEEnTx5UkuXLtWbb76pU6dOqWPHjkpISNC9997rfW8nLg9iBpfUt29fzZ8/X4MHD/Zu27Nnj+655x698847ateunYXToSl6+OGHVVBQoLy8PNntdu/2t956S1OmTNH//d//KTIy0sIJ0dR8+eWXSkpKUmRkpP74xz8qPDxcH330kRYuXKhu3bpp+fLlnDG8jPibxSXZbDa9//77Pp/zj42N1ZYtW9SqVSsLJ0NT5HK5tGXLFt1zzz0+ISNJt99+u1588UV16NDBounQVM2fP18dOnRQTk6O4uPj1alTJw0fPlzr1q3Tnj179PLLL1s94lWNmMElJScna+
3atRo0aJBmz56t7du3q6qqSt27d9c111xj9XhoYj777DM5nU716tWr1prNZtMtt9xS6ws1gcvp9OnT+vvf/64//OEPCggI8FmLiIhQYmKiNmzYYNF0TQMxg0tKSUlRVlaWrr/+em3YsEHTpk3TgAEDlJeXZ/VoaILOnDkjSbr22mstngT4zoEDB3Tu3Dn17t37gut9+vTRoUOHan3RKxoOMYN6ueuuu5Sbm6tdu3Zp4cKFuuGGG5Senq79+/dbPRqamOuuu07Sd59mAhqD06dPS5KaNWt2wfXzX/h2fj80PGIGdTp06JCeeOIJ7+1WrVrpzjvv1Nq1a3X99dfr/ffft3A6NEWdO3fWtddeqwMHDlxw/Y9//KN27dp1hadCU3Y+sL/88ssLrnM28fIjZlAnt9utVatWqaioyGd7UFCQgoOD1bp1a4smQ1MVGBio4cOH66WXXqp12v6tt97SW2+9xSfscEVFR0crMDDwomeq//WvfykyMlKhoaFXeLKmg5hBnaKjozVw4EBNmTJFmzdv1ueff66CggLNnj1bLpdLv/71r60eEU3Q1KlTdfbsWd13333avXu3PvvsM23cuFEzZ85UcnKyunfvbvWIaEJat26twYMHa9myZTp37pwkae3atbr//vu1e/du/eUvf9GYMWMsnvLqxvfM4JIqKyu1bNkybdu2TaWlpQoNDVX//v31wAMPKCIiwurx0EQdP35cixcv1s6dO1VRUaHOnTvrd7/7nZKSkmp9ogS43L766islJSWpc+fOSklJUYsWLTRnzhzl5+erc+fO2rp1K5/+vIyIGQAAGsDJkyeVnZ2tN998U6dPn1ZERIQGDRqkv/3tb+rcubMyMzO5BHqZEDMAAFxGTqdT69ev19ixY3nfzGVCzAAAAKPxBmAAAGA0YgYAABiNmAEAAEYjZgAAgNGIGQAAYDRiBgAAGI2YAdAgBg0apBtvvPGCfz744IMf9FivvPKKBg0a1GCzffDBB7rxxhsb7PH8OSYAl0+g1QMAuHrMmjVLw4cPr7W9ZcuWFkwDoKkgZgA0mGuvvVZt27a1egwATQyXmQBcEYMGDdKmTZuUmJio3r17a8KECfriiy80depUORwOjRw5UsXFxT73WbRokfr06aMBAwZo7dq13u0ul0uZmZkaMGCAoqOjNWjQIK1fv97nubKystS/f3+NGjVK//1F55mZmRo4cKBKS0slSXv27FFCQoJ69+6tO++8U9u3b/fZf8mSJbr11lvVr18/bdy4saH/agD8SMQMgCvmmWee0QMPPKD//d//VVFRkX7zm9/otttu06ZNmxQSEqJFixZ59/3iiy/08ccfa/369frzn/+sJ5980vs+lZycHP3jH//Q4sWLtW3bNo0aNUrz5s1TeXm59/6bN2/WypUr9cQTT8hms3m3r1q1Sq+99ppWrlypiIgIlZWVadKkSUpISNDmzZt1//33a+bMmdqzZ48kaf369VqzZo3mz5+vF198UXl5eVfobwtAfXGZCUCDmT17tubNm+ezLSIiQlu2bJEkJSQk6LbbbpMk3XLLLSorK1NSUpIk6a677tLq1au997Pb7XriiSfUqlUr3XDDDdq9e7dyc3PVr18/3XTTTbrlllsUExMjSZo8ebKys7N17NgxtWnTxvt459/0ez6Ctm7dqiVLlujFF19Ut27dJEkvvfSSbrvtNv3+97+XJHXp0kUHDx7U6tWrFR8frw0bNujee+/V7bffLkl6/PHHdccddzT43x0A/xEzABrMtGnT9Otf/9pnW2Dg//sx06lTJ+//Dw4OVocOHXxuf/vttz77tmrVynv75ptv9l7iGTx4sN5991098cQTOnr0qIqKiiRJbrfbu//3H/u8mTNnKigoSNdff71329GjR/X2228rNjbWu+3bb79VZGSkJOmTTz5RSkqKd6179+785mOgkSFmADSYsLAwdenS5aLrAQEBPrd/8pOLX+n+77Wamhpdc801kqSnn35aGz
duVEJCgkaNGqXZs2fX+ii33W6v9ZhZWVlasWKFnnzySS1cuFCSdO7cOd15552aPHmyz77fj7D/fs/N99cAWI/3zABolEpKSlRZWem9vW/fPv30pz+VJOXm5uqRRx7R9OnTNXz4cO9+/x0d/23IkCF6+OGHtWXLFn344YeSpMjISH366afq0qWL98+bb76pzZs3S5JuuOEGffTRR97H+Pzzz3XmzJkGPVYAPw4xA6DB/Oc//1FZWVmtP06n8wc/VnV1tR566CEVFxcrNzdX27dv17333itJuu666/T222+rpKREe/bs0YwZMyR99ymnSzn/yam5c+fq3Llzuvvuu7V//349/fTTOnbsmDZv3qxFixYpIiJCkvT73/9ea9as0fbt23X48GGlp6fXeUYJwJXHuVIADWb+/PmaP39+re1/+tOffvBjRUVFKTw8XL/97W/VqlUrzZ8/Xz179vQ+z5w5c3THHXcoPDxcY8aMUUBAgA4ePKhf/OIXl3zsBx54QEOGDNHatWv1P//zP1q2bJkWLlyolStXKjw8XDNnztRdd90lSRo5cqROnz6tefPmqaqqShMnTtShQ4d+8PEAuHxsnkudlwUAAGjEOFcKAACMRswAAACjETMAAMBoxAwAADAaMQMAAIxGzAAAAKMRMwAAwGjEDAAAMBoxAwAAjEbMAAAAoxEzAADAaMQMAAAw2v8HOjvIgZF7/0cAAAAASUVORK5CYII=","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["sns.countplot(x = \"Embarked\", data = train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":27,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.638242Z","iopub.status.busy":"2024-04-01T06:27:55.637447Z","iopub.status.idle":"2024-04-01T06:27:55.699106Z","shell.execute_reply":"2024-04-01T06:27:55.698208Z","shell.execute_reply.started":"2024-04-01T06:27:55.638150Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Survived
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked_C
\n","
Embarked_Q
\n","
Embarked_S
\n","
\n"," \n"," \n","
\n","
0
\n","
1
\n","
0
\n","
3
\n","
Braund, Mr. Owen Harris
\n","
male
\n","
22.0
\n","
1
\n","
0
\n","
A/5 21171
\n","
7.2500
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
1
\n","
2
\n","
1
\n","
1
\n","
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n","
female
\n","
38.0
\n","
1
\n","
0
\n","
PC 17599
\n","
71.2833
\n","
C85
\n","
True
\n","
False
\n","
False
\n","
\n","
\n","
2
\n","
3
\n","
1
\n","
3
\n","
Heikkinen, Miss. Laina
\n","
female
\n","
26.0
\n","
0
\n","
0
\n","
STON/O2. 3101282
\n","
7.9250
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
3
\n","
4
\n","
1
\n","
1
\n","
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n","
female
\n","
35.0
\n","
1
\n","
0
\n","
113803
\n","
53.1000
\n","
C123
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
4
\n","
5
\n","
0
\n","
3
\n","
Allen, Mr. William Henry
\n","
male
\n","
35.0
\n","
0
\n","
0
\n","
373450
\n","
8.0500
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked_C Embarked_Q Embarked_S \n","0 0 A/5 21171 7.2500 NaN False False True \n","1 0 PC 17599 71.2833 C85 True False False \n","2 0 STON/O2. 3101282 7.9250 NaN False False True \n","3 0 113803 53.1000 C123 False False True \n","4 0 373450 8.0500 NaN False False True "]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["train_df = pd.get_dummies(train_df, columns=[\"Embarked\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":28,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked_C
\n","
Embarked_Q
\n","
Embarked_S
\n","
\n"," \n"," \n","
\n","
0
\n","
892
\n","
3
\n","
Kelly, Mr. James
\n","
male
\n","
34.5
\n","
0
\n","
0
\n","
330911
\n","
7.8292
\n","
NaN
\n","
False
\n","
True
\n","
False
\n","
\n","
\n","
1
\n","
893
\n","
3
\n","
Wilkes, Mrs. James (Ellen Needs)
\n","
female
\n","
47.0
\n","
1
\n","
0
\n","
363272
\n","
7.0000
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
2
\n","
894
\n","
2
\n","
Myles, Mr. Thomas Francis
\n","
male
\n","
62.0
\n","
0
\n","
0
\n","
240276
\n","
9.6875
\n","
NaN
\n","
False
\n","
True
\n","
False
\n","
\n","
\n","
3
\n","
895
\n","
3
\n","
Wirz, Mr. Albert
\n","
male
\n","
27.0
\n","
0
\n","
0
\n","
315154
\n","
8.6625
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
4
\n","
896
\n","
3
\n","
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
\n","
female
\n","
22.0
\n","
1
\n","
1
\n","
3101298
\n","
12.2875
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Pclass Name Sex \\\n","0 892 3 Kelly, Mr. James male \n","1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n","2 894 2 Myles, Mr. Thomas Francis male \n","3 895 3 Wirz, Mr. Albert male \n","4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n","\n"," Age SibSp Parch Ticket Fare Cabin Embarked_C Embarked_Q \\\n","0 34.5 0 0 330911 7.8292 NaN False True \n","1 47.0 1 0 363272 7.0000 NaN False False \n","2 62.0 0 0 240276 9.6875 NaN False True \n","3 27.0 0 0 315154 8.6625 NaN False False \n","4 22.0 1 1 3101298 12.2875 NaN False False \n","\n"," Embarked_S \n","0 False \n","1 True \n","2 False \n","3 True \n","4 True "]},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":["test_df = pd.get_dummies(test_df, columns=[\"Embarked\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Ticket (Assignment)"]},{"cell_type":"code","execution_count":29,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.700760Z","iopub.status.busy":"2024-04-01T06:27:55.700330Z","iopub.status.idle":"2024-04-01T06:27:55.708542Z","shell.execute_reply":"2024-04-01T06:27:55.707466Z","shell.execute_reply.started":"2024-04-01T06:27:55.700715Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 A/5 21171\n","1 PC 17599\n","2 STON/O2. 3101282\n","3 113803\n","4 373450\n","5 330877\n","6 17463\n","7 349909\n","8 347742\n","9 237736\n","10 PP 9549\n","11 113783\n","12 A/5. 
2151\n","13 347082\n","14 350406\n","15 248706\n","16 382652\n","17 244373\n","18 345763\n","19 2649\n","Name: Ticket, dtype: object"]},"execution_count":29,"metadata":{},"output_type":"execute_result"}],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":30,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.710291Z","iopub.status.busy":"2024-04-01T06:27:55.709980Z","iopub.status.idle":"2024-04-01T06:27:55.722810Z","shell.execute_reply":"2024-04-01T06:27:55.721839Z","shell.execute_reply.started":"2024-04-01T06:27:55.710231Z"},"trusted":true},"outputs":[{"data":{"text/plain":["'A5'"]},"execution_count":30,"metadata":{},"output_type":"execute_result"}],"source":["example_ticket = \"A/5. 2151\"\n","example_ticket.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0]"]},{"cell_type":"code","execution_count":31,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.726116Z","iopub.status.busy":"2024-04-01T06:27:55.725689Z","iopub.status.idle":"2024-04-01T06:27:55.738095Z","shell.execute_reply":"2024-04-01T06:27:55.737043Z","shell.execute_reply.started":"2024-04-01T06:27:55.726039Z"},"trusted":true},"outputs":[],"source":["tickets = []\n","for i in list(train_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","train_df[\"Ticket\"] = tickets\n","\n","# Do the same for the test set\n","tickets = []\n","for i in list(test_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","test_df[\"Ticket\"] = 
tickets"]},{"cell_type":"code","execution_count":32,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.740389Z","iopub.status.busy":"2024-04-01T06:27:55.739797Z","iopub.status.idle":"2024-04-01T06:27:55.755416Z","shell.execute_reply":"2024-04-01T06:27:55.754317Z","shell.execute_reply.started":"2024-04-01T06:27:55.740333Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 A5\n","1 PC\n","2 STONO2\n","3 x\n","4 x\n","5 x\n","6 x\n","7 x\n","8 x\n","9 x\n","10 PP\n","11 x\n","12 A5\n","13 x\n","14 x\n","15 x\n","16 x\n","17 x\n","18 x\n","19 x\n","Name: Ticket, dtype: object"]},"execution_count":32,"metadata":{},"output_type":"execute_result"}],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":33,"metadata":{},"outputs":[{"data":{"text/plain":["0 x\n","1 x\n","2 x\n","3 x\n","4 x\n","5 x\n","6 x\n","7 x\n","8 x\n","9 A4\n","10 x\n","11 x\n","12 x\n","13 x\n","14 WEP\n","15 SCPARIS\n","16 x\n","17 x\n","18 STONO2\n","19 x\n","Name: Ticket, dtype: object"]},"execution_count":33,"metadata":{},"output_type":"execute_result"}],"source":["test_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":34,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.790832Z","iopub.status.busy":"2024-04-01T06:27:55.790500Z","iopub.status.idle":"2024-04-01T06:27:55.841011Z","shell.execute_reply":"2024-04-01T06:27:55.839862Z","shell.execute_reply.started":"2024-04-01T06:27:55.790770Z"},"trusted":true},"outputs":[{"data":{"text/html":["
"],"text/plain":[" PassengerId Name Age SibSp \\\n","0 892 Kelly, Mr. James 34.5 0 \n","1 893 Wilkes, Mrs. James (Ellen Needs) 47.0 1 \n","2 894 Myles, Mr. Thomas Francis 62.0 0 \n","3 895 Wirz, Mr. Albert 27.0 0 \n","4 896 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 22.0 1 \n","\n"," Parch Fare Cabin Embarked_C Embarked_Q Embarked_S ... \\\n","0 0 7.8292 NaN False True False ... \n","1 0 7.0000 NaN False False True ... \n","2 0 9.6875 NaN False True False ... \n","3 0 8.6625 NaN False False True ... \n","4 1 12.2875 NaN False False True ... \n","\n"," TcktName_STONO2 TcktName_STONOQ TcktName_WC TcktName_WEP TcktName_x \\\n","0 False False False False True \n","1 False False False False True \n","2 False False False False True \n","3 False False False False True \n","4 False False False False True \n","\n"," Pclass_1 Pclass_2 Pclass_3 Sex_female Sex_male \n","0 False False True False True \n","1 False False True True False \n","2 False True False False True \n","3 False False True False True \n","4 False False True True False \n","\n","[5 rows x 43 columns]"]},"execution_count":40,"metadata":{},"output_type":"execute_result"}],"source":["test_df[\"Sex\"] = test_df[\"Sex\"].astype(\"category\")\n","test_df = pd.get_dummies(test_df, columns=[\"Sex\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Drop Passenger ID and Cabin (Assignment)"]},{"cell_type":"code","execution_count":41,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.207602Z","iopub.status.busy":"2024-04-01T06:27:56.207299Z","iopub.status.idle":"2024-04-01T06:27:56.215886Z","shell.execute_reply":"2024-04-01T06:27:56.214401Z","shell.execute_reply.started":"2024-04-01T06:27:56.207550Z"},"trusted":true},"outputs":[],"source":["train_df.drop(labels = [\"PassengerId\", \"Cabin\"], axis = 1, inplace = 
True)"]},{"cell_type":"code","execution_count":42,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.217917Z","iopub.status.busy":"2024-04-01T06:27:56.217536Z","iopub.status.idle":"2024-04-01T06:27:56.228150Z","shell.execute_reply":"2024-04-01T06:27:56.227230Z","shell.execute_reply.started":"2024-04-01T06:27:56.217854Z"},"trusted":true},"outputs":[{"data":{"text/plain":["Index(['Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C',\n"," 'Embarked_Q', 'Embarked_S', 'TcktName_A4', 'TcktName_A5', 'TcktName_AS',\n"," 'TcktName_C', 'TcktName_CA', 'TcktName_CASOTON', 'TcktName_FC',\n"," 'TcktName_FCC', 'TcktName_Fa', 'TcktName_LINE', 'TcktName_PC',\n"," 'TcktName_PP', 'TcktName_PPP', 'TcktName_SC', 'TcktName_SCA4',\n"," 'TcktName_SCAH', 'TcktName_SCOW', 'TcktName_SCPARIS',\n"," 'TcktName_SCParis', 'TcktName_SOC', 'TcktName_SOP', 'TcktName_SOPP',\n"," 'TcktName_SOTONO2', 'TcktName_SOTONOQ', 'TcktName_SP', 'TcktName_STONO',\n"," 'TcktName_STONO2', 'TcktName_SWPP', 'TcktName_WC', 'TcktName_WEP',\n"," 'TcktName_x', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',\n"," 'Sex_male'],\n"," dtype='object')"]},"execution_count":42,"metadata":{},"output_type":"execute_result"}],"source":["train_df.columns"]},{"cell_type":"code","execution_count":43,"metadata":{},"outputs":[],"source":["# Drop the PassengerId and Cabin columns from the test set\n","test_df.drop(labels=[\"PassengerId\", \"Cabin\"], axis=1, inplace=True)"]},{"cell_type":"code","execution_count":44,"metadata":{},"outputs":[{"data":{"text/plain":["Index(['Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q',\n"," 'Embarked_S', 'TcktName_A', 'TcktName_A4', 'TcktName_A5',\n"," 'TcktName_AQ3', 'TcktName_AQ4', 'TcktName_C', 'TcktName_CA',\n"," 'TcktName_FC', 'TcktName_FCC', 'TcktName_LP', 'TcktName_PC',\n"," 'TcktName_PP', 'TcktName_SC', 'TcktName_SCA3', 'TcktName_SCA4',\n"," 'TcktName_SCAH', 'TcktName_SCPARIS', 'TcktName_SCParis', 'TcktName_SOC',\n"," 'TcktName_SOPP', 
'TcktName_SOTONO2', 'TcktName_SOTONOQ',\n"," 'TcktName_STONO', 'TcktName_STONO2', 'TcktName_STONOQ', 'TcktName_WC',\n"," 'TcktName_WEP', 'TcktName_x', 'Pclass_1', 'Pclass_2', 'Pclass_3',\n"," 'Sex_female', 'Sex_male'],\n"," dtype='object')"]},"execution_count":44,"metadata":{},"output_type":"execute_result"}],"source":["# Print the columns of the test set\n","test_df.columns"]},{"cell_type":"markdown","metadata":{},"source":[" \n","# Modeling"]},{"cell_type":"code","execution_count":45,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.230086Z","iopub.status.busy":"2024-04-01T06:27:56.229809Z","iopub.status.idle":"2024-04-01T06:27:56.238557Z","shell.execute_reply":"2024-04-01T06:27:56.237679Z","shell.execute_reply.started":"2024-04-01T06:27:56.230040Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.svm import SVC\n","from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.metrics import accuracy_score"]},{"cell_type":"markdown","metadata":{},"source":["## Train - Test Split (Assignment)"]},{"cell_type":"code","execution_count":46,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.240103Z","iopub.status.busy":"2024-04-01T06:27:56.239830Z","iopub.status.idle":"2024-04-01T06:27:56.256809Z","shell.execute_reply":"2024-04-01T06:27:56.255463Z","shell.execute_reply.started":"2024-04-01T06:27:56.240056Z"},"trusted":true},"outputs":[{"data":{"text/plain":["891"]},"execution_count":46,"metadata":{},"output_type":"execute_result"}],"source":["train_df_len = 
len(train_df)\n","train_df_len"]},{"cell_type":"code","execution_count":48,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.314730Z","iopub.status.busy":"2024-04-01T06:27:56.313986Z","iopub.status.idle":"2024-04-01T06:27:56.333564Z","shell.execute_reply":"2024-04-01T06:27:56.332507Z","shell.execute_reply.started":"2024-04-01T06:27:56.314635Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["X_train 596\n","X_test 295\n","y_train 596\n","y_test 295\n","test 418\n"]}],"source":["\n","train = train_df[:train_df_len]\n","test = test_df\n","\n","# Select all numerical values from train and test\n","numeric_train = train.select_dtypes(include=[np.number])\n","numeric_test = test.select_dtypes(include=[np.number]) \n","\n","\n","X_train = numeric_train.drop(labels=[\"Survived\",], axis=1)\n","y_train = numeric_train[\"Survived\"]\n","\n","# Split the train data into train and test sets with a 1/3 ratio\n","X_train, X_test, y_train, y_test = train_test_split(numeric_train.drop(labels=[\"Survived\"], axis=1), numeric_train[\"Survived\"], test_size=0.33, random_state=42)\n","\n","\n","print(\"X_train\", len(X_train))\n","print(\"X_test\", len(X_test))\n","print(\"y_train\", len(y_train))\n","print(\"y_test\", len(y_test))\n","print(\"test\", len(numeric_test))\n"]},{"cell_type":"markdown","metadata":{},"source":["## Simple Logistic Regression (Assignment)"]},{"cell_type":"code","execution_count":49,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.335970Z","iopub.status.busy":"2024-04-01T06:27:56.335281Z","iopub.status.idle":"2024-04-01T06:27:56.368083Z","shell.execute_reply":"2024-04-01T06:27:56.366489Z","shell.execute_reply.started":"2024-04-01T06:27:56.335561Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Accuracy on the training set: 68.62%\n","Accuracy on the test set: 68.81%\n"]}],"source":["logreg = LogisticRegression()\n","logreg.fit(X_train, y_train)\n","acc_log_train = 
round(logreg.score(X_train, y_train)*100,2) \n","acc_log_test = round(logreg.score(X_test,y_test)*100,2)\n","# Print the accuracy on the training and test set\n","print(f\"Accuracy on the training set: {acc_log_train}%\")\n","print(f\"Accuracy on the test set: {acc_log_test}%\")"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Hyperparameter Tuning -- Grid Search -- Cross Validation\n","We will compare 5 ml classifier and evaluate mean accuracy of each of them by stratified cross validation.\n","\n","* Decision Tree\n","* SVM\n","* Random Forest\n","* KNN\n","* Logistic Regression"]},{"cell_type":"code","execution_count":50,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.371066Z","iopub.status.busy":"2024-04-01T06:27:56.370400Z","iopub.status.idle":"2024-04-01T06:27:56.401742Z","shell.execute_reply":"2024-04-01T06:27:56.396867Z","shell.execute_reply.started":"2024-04-01T06:27:56.370802Z"},"trusted":true},"outputs":[],"source":["random_state = 42\n","classifier = [DecisionTreeClassifier(random_state = random_state),\n"," SVC(random_state = random_state),\n"," RandomForestClassifier(random_state = random_state),\n"," LogisticRegression(random_state = random_state),\n"," KNeighborsClassifier()]\n","\n","dt_param_grid = {\"min_samples_split\" : range(10,500,20),\n"," \"max_depth\": range(1,20,2)}\n","\n","svc_param_grid = {\"kernel\" : [\"rbf\"],\n"," \"gamma\": [0.001, 0.01, 0.1, 1],\n"," \"C\": [1,10,50,100,200,300,1000]}\n","\n","rf_param_grid = {\"max_features\": [1,3,10],\n"," \"min_samples_split\":[2,3,10],\n"," \"min_samples_leaf\":[1,3,10],\n"," \"bootstrap\":[False],\n"," \"n_estimators\":[100,300],\n"," \"criterion\":[\"gini\"]}\n","\n","logreg_param_grid = {\"C\":np.logspace(-3,3,7),\n"," \"penalty\": [\"l1\",\"l2\"]}\n","\n","knn_param_grid = {\"n_neighbors\": np.linspace(1,19,10, dtype = int).tolist(),\n"," \"weights\": [\"uniform\",\"distance\"],\n"," \"metric\":[\"euclidean\",\"manhattan\"]}\n","classifier_param = 
[dt_param_grid,\n"," svc_param_grid,\n"," rf_param_grid,\n"," logreg_param_grid,\n"," knn_param_grid]"]},{"cell_type":"code","execution_count":51,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.413811Z","iopub.status.busy":"2024-04-01T06:27:56.404322Z","iopub.status.idle":"2024-04-01T06:29:38.718970Z","shell.execute_reply":"2024-04-01T06:29:38.717807Z","shell.execute_reply.started":"2024-04-01T06:27:56.413658Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 10 folds for each of 250 candidates, totalling 2500 fits\n"]},{"name":"stderr","output_type":"stream","text":["/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import 
(\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n"]},{"name":"stdout","output_type":"stream","text":["0.6996045197740112\n","Fitting 10 folds for each of 28 candidates, totalling 280 fits\n","0.7130508474576271\n","Fitting 10 folds for each of 54 candidates, totalling 540 fits\n","0.7081073446327684\n","Fitting 10 folds for each of 14 candidates, totalling 140 fits\n","0.6777683615819209\n","Fitting 10 folds for each of 40 candidates, totalling 400 fits\n","0.6979943502824858\n"]}],"source":["cv_result = []\n","best_estimators = []\n","for i in range(len(classifier)):\n"," clf = GridSearchCV(classifier[i], param_grid=classifier_param[i], cv = StratifiedKFold(n_splits = 10), scoring = \"accuracy\", n_jobs = -1,verbose = 1)\n"," clf.fit(X_train,y_train)\n"," cv_result.append(clf.best_score_)\n"," best_estimators.append(clf.best_estimator_)\n"," 
print(cv_result[i])"]},{"cell_type":"code","execution_count":52,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:38.722928Z","iopub.status.busy":"2024-04-01T06:29:38.722207Z","iopub.status.idle":"2024-04-01T06:29:39.075423Z","shell.execute_reply":"2024-04-01T06:29:39.073987Z","shell.execute_reply.started":"2024-04-01T06:29:38.722582Z"},"trusted":true},"outputs":[{"data":{"text/plain":["Text(0.5, 1.0, 'Cross Validation Scores')"]},"execution_count":52,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["cv_results = pd.DataFrame({\"Cross Validation Means\":cv_result, \"ML Models\":[\"DecisionTreeClassifier\", \"SVM\",\"RandomForestClassifier\",\n"," \"LogisticRegression\",\n"," \"KNeighborsClassifier\"]})\n","\n","g = sns.barplot(x=\"Cross Validation Means\",y= \"ML Models\", data=cv_results)\n","g.set_xlabel(\"Mean Accuracy\")\n","g.set_title(\"Cross Validation Scores\")"]},{"cell_type":"markdown","metadata":{},"source":["## Ensemble Modeling (Assignment)"]},{"cell_type":"code","execution_count":53,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.078654Z","iopub.status.busy":"2024-04-01T06:29:39.077840Z","iopub.status.idle":"2024-04-01T06:29:39.862871Z","shell.execute_reply":"2024-04-01T06:29:39.860937Z","shell.execute_reply.started":"2024-04-01T06:29:39.078554Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Accuracy of the voting classifier on the test set: 70.85%\n"]}],"source":["votingC = VotingClassifier(estimators = [(\"dt\",best_estimators[0]),\n"," (\"rfc\",best_estimators[2]),\n"," (\"lr\",best_estimators[3])],\n"," voting = \"soft\", n_jobs = -1)\n","votingC = votingC.fit(X_train, y_train)\n","\n","# Print the accuracy score of the voting classifier\n","acc_votingC = round(votingC.score(X_test, y_test) * 100, 2)\n","print(f\"Accuracy of the voting classifier on the test set: {acc_votingC}%\")"]},{"cell_type":"code","execution_count":56,"metadata":{},"outputs":[],"source":["# Drop the null values which are going to cause you an error in the next cell\n","# Drop rows with missing values in numeric_test\n","numeric_test_dropna = numeric_test.dropna()"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Prediction and 
Submission"]},{"cell_type":"code","execution_count":57,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.865981Z","iopub.status.busy":"2024-04-01T06:29:39.865330Z","iopub.status.idle":"2024-04-01T06:29:39.977357Z","shell.execute_reply":"2024-04-01T06:29:39.973301Z","shell.execute_reply.started":"2024-04-01T06:29:39.865906Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":[" PassengerId Survived\n","0 892 0.0\n","1 893 0.0\n","2 894 0.0\n","3 895 0.0\n","4 896 0.0\n",".. ... ...\n","413 1305 1.0\n","414 1306 0.0\n","415 1307 0.0\n","416 1308 0.0\n","417 1309 NaN\n","\n","[418 rows x 2 columns]\n"]}],"source":["test_survived = pd.Series(votingC.predict(numeric_test_dropna), name=\"Survived\").astype(int)\n","results = pd.concat([test_PassengerId, test_survived], axis=1)\n","results.to_csv(\"titanic.csv\", index=False)\n","print(results)"]},{"cell_type":"markdown","metadata":{},"source":["# Congratulations on finishing the assignment!!\n","\n","### The submission is the titanic.csv which was just created, and this file which you have modified."]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"databundleVersionId":26502,"sourceId":3136,"sourceType":"competition"}],"dockerImageVersionId":29852,"isGpuEnabled":false,"isInternetEnabled":false,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.4"}},"nbformat":4,"nbformat_minor":4}
From 7775bd4b8431fb74037deec9800ff8eefbc763c6 Mon Sep 17 00:00:00 2001
From: Faheem <“faheemuddinsayyed789@gmail.com”>
Date: Thu, 4 Apr 2024 20:06:53 +0530
Subject: [PATCH 4/4] Update
---
titanic.ipynb | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/titanic.ipynb b/titanic.ipynb
index 7b03595..8e3f37f 100644
--- a/titanic.ipynb
+++ b/titanic.ipynb
@@ -1 +1 @@
-{"cells":[{"cell_type":"markdown","metadata":{},"source":[" \n","# Ignore this"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n"]}],"source":["import numpy as np # linear algebra\n","import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n","import matplotlib.pyplot as plt\n","plt.style.use(\"seaborn-v0_8-whitegrid\")\n","\n","import seaborn as sns\n","\n","from collections import Counter\n","\n","import warnings\n","warnings.filterwarnings(\"ignore\")"]},{"cell_type":"markdown","metadata":{},"source":[" \n","# Load and Check Data"]},{"cell_type":"markdown","metadata":{},"source":["DataFrames hold the dataset in a tabular format for easy manipulation and analysis. \n","CSV data is read into 'df' using Pandas' read_csv() function."]},{"cell_type":"code","execution_count":3,"metadata":{"_kg_hide-input":true,"execution":{"iopub.execute_input":"2024-04-01T06:45:27.416192Z","iopub.status.busy":"2024-04-01T06:45:27.415763Z","iopub.status.idle":"2024-04-01T06:45:27.433162Z","shell.execute_reply":"2024-04-01T06:45:27.431944Z","shell.execute_reply.started":"2024-04-01T06:45:27.416105Z"},"trusted":true},"outputs":[],"source":["train_df = pd.read_csv(\"./data/train.csv\")"]},{"cell_type":"markdown","metadata":{},"source":["### 1. 
Try to read the test .csv file into test_df"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.436611Z","iopub.status.busy":"2024-04-01T06:45:27.435916Z","iopub.status.idle":"2024-04-01T06:45:27.449974Z","shell.execute_reply":"2024-04-01T06:45:27.448230Z","shell.execute_reply.started":"2024-04-01T06:45:27.436517Z"},"trusted":true},"outputs":[],"source":["test_df = pd.read_csv(\"./data/test.csv\")\n","test_PassengerId = test_df[\"PassengerId\"]"]},{"cell_type":"code","execution_count":5,"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","execution":{"iopub.execute_input":"2024-04-01T06:45:27.452397Z","iopub.status.busy":"2024-04-01T06:45:27.451949Z","iopub.status.idle":"2024-04-01T06:45:27.462622Z","shell.execute_reply":"2024-04-01T06:45:27.461859Z","shell.execute_reply.started":"2024-04-01T06:45:27.452348Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["The Columns of train_df are: \n"]},{"data":{"text/plain":["Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n"," 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],\n"," dtype='object')"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["print(\"The Columns of train_df are: \")\n","train_df.columns"]},{"cell_type":"markdown","metadata":{},"source":["### We can use head() to see the first few rows in the dataframe"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.464289Z","iopub.status.busy":"2024-04-01T06:45:27.463866Z","iopub.status.idle":"2024-04-01T06:45:27.491984Z","shell.execute_reply":"2024-04-01T06:45:27.491110Z","shell.execute_reply.started":"2024-04-01T06:45:27.464242Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Survived
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked
\n","
\n"," \n"," \n","
\n","
0
\n","
1
\n","
0
\n","
3
\n","
Braund, Mr. Owen Harris
\n","
male
\n","
22.0
\n","
1
\n","
0
\n","
A/5 21171
\n","
7.2500
\n","
NaN
\n","
S
\n","
\n","
\n","
1
\n","
2
\n","
1
\n","
1
\n","
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n","
female
\n","
38.0
\n","
1
\n","
0
\n","
PC 17599
\n","
71.2833
\n","
C85
\n","
C
\n","
\n","
\n","
2
\n","
3
\n","
1
\n","
3
\n","
Heikkinen, Miss. Laina
\n","
female
\n","
26.0
\n","
0
\n","
0
\n","
STON/O2. 3101282
\n","
7.9250
\n","
NaN
\n","
S
\n","
\n","
\n","
3
\n","
4
\n","
1
\n","
1
\n","
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n","
female
\n","
35.0
\n","
1
\n","
0
\n","
113803
\n","
53.1000
\n","
C123
\n","
S
\n","
\n","
\n","
4
\n","
5
\n","
0
\n","
3
\n","
Allen, Mr. William Henry
\n","
male
\n","
35.0
\n","
0
\n","
0
\n","
373450
\n","
8.0500
\n","
NaN
\n","
S
\n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked \n","0 0 A/5 21171 7.2500 NaN S \n","1 0 PC 17599 71.2833 C85 C \n","2 0 STON/O2. 3101282 7.9250 NaN S \n","3 0 113803 53.1000 C123 S \n","4 0 373450 8.0500 NaN S "]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["train_df.head()"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.494326Z","iopub.status.busy":"2024-04-01T06:45:27.493637Z","iopub.status.idle":"2024-04-01T06:45:27.541999Z","shell.execute_reply":"2024-04-01T06:45:27.541210Z","shell.execute_reply.started":"2024-04-01T06:45:27.494251Z"},"jupyter":{"source_hidden":true},"trusted":true},"outputs":[{"data":{"text/html":["
"],"text/plain":[" PassengerId Pclass Name Sex \\\n","0 892 3 Kelly, Mr. James male \n","1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n","2 894 2 Myles, Mr. Thomas Francis male \n","3 895 3 Wirz, Mr. Albert male \n","4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n","\n"," Age SibSp Parch Ticket Fare Cabin Embarked \n","0 34.5 0 0 330911 7.8292 NaN Q \n","1 47.0 1 0 363272 7.0000 NaN S \n","2 62.0 0 0 240276 9.6875 NaN Q \n","3 27.0 0 0 315154 8.6625 NaN S \n","4 22.0 1 1 3101298 12.2875 NaN S "]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["### 3. Now try checking for a description of test_df's data"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"data":{"text/html":["
Embarked: port where passenger embarked ( C = Cherbourg, Q = Queenstown, S = Southampton )
\n","\n"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.544377Z","iopub.status.busy":"2024-04-01T06:45:27.543901Z","iopub.status.idle":"2024-04-01T06:45:27.557229Z","shell.execute_reply":"2024-04-01T06:45:27.555972Z","shell.execute_reply.started":"2024-04-01T06:45:27.544320Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","RangeIndex: 891 entries, 0 to 890\n","Data columns (total 12 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 PassengerId 891 non-null int64 \n"," 1 Survived 891 non-null int64 \n"," 2 Pclass 891 non-null int64 \n"," 3 Name 891 non-null object \n"," 4 Sex 891 non-null object \n"," 5 Age 714 non-null float64\n"," 6 SibSp 891 non-null int64 \n"," 7 Parch 891 non-null int64 \n"," 8 Ticket 891 non-null object \n"," 9 Fare 891 non-null float64\n"," 10 Cabin 204 non-null object \n"," 11 Embarked 889 non-null object \n","dtypes: float64(2), int64(5), object(5)\n","memory usage: 83.7+ KB\n"]}],"source":["train_df.info()"]},{"cell_type":"markdown","metadata":{},"source":["### Slice Rows and Columsn of DF (Assigmennt)"]},{"cell_type":"code","execution_count":11,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:53:12.214069Z","iopub.status.busy":"2024-04-01T06:53:12.213708Z","iopub.status.idle":"2024-04-01T06:53:12.223150Z","shell.execute_reply":"2024-04-01T06:53:12.222195Z","shell.execute_reply.started":"2024-04-01T06:53:12.214014Z"},"trusted":true},"outputs":[{"data":{"text/plain":["PassengerId 3\n","Survived 1\n","Pclass 3\n","Name Heikkinen, Miss. Laina\n","Sex female\n","Age 26.0\n","SibSp 0\n","Parch 0\n","Ticket STON/O2. 
3101282\n","Fare 7.925\n","Cabin NaN\n","Embarked S\n","Name: 2, dtype: object"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["# Printing the Second Row\n","train_df.iloc[2]"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"data":{"text/plain":["PassengerId 6\n","Survived 0\n","Pclass 3\n","Name Moran, Mr. James\n","Sex male\n","Age NaN\n","SibSp 0\n","Parch 0\n","Ticket 330877\n","Fare 8.4583\n","Cabin NaN\n","Embarked Q\n","Name: 5, dtype: object"]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["# Print the 5th Row\n","train_df.iloc[5]"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:14.398373Z","iopub.status.busy":"2024-04-01T06:54:14.398006Z","iopub.status.idle":"2024-04-01T06:54:14.407886Z","shell.execute_reply":"2024-04-01T06:54:14.406590Z","shell.execute_reply.started":"2024-04-01T06:54:14.398326Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 male\n","1 female\n","2 female\n","3 female\n","4 male\n"," ... \n","886 male\n","887 female\n","888 female\n","889 male\n","890 male\n","Name: Sex, Length: 891, dtype: object"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["# Print the Sex Column\n","train_df['Sex']"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:24.550687Z","iopub.status.busy":"2024-04-01T06:54:24.550286Z","iopub.status.idle":"2024-04-01T06:54:24.555255Z","shell.execute_reply":"2024-04-01T06:54:24.553923Z","shell.execute_reply.started":"2024-04-01T06:54:24.550616Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 Braund, Mr. Owen Harris\n","1 Cumings, Mrs. John Bradley (Florence Briggs Th...\n","2 Heikkinen, Miss. Laina\n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel)\n","4 Allen, Mr. William Henry\n"," ... \n","886 Montvila, Rev. Juozas\n","887 Graham, Miss. Margaret Edith\n","888 Johnston, Miss. 
Catherine Helen \"Carrie\"\n","889 Behr, Mr. Karl Howell\n","890 Dooley, Mr. Patrick\n","Name: Name, Length: 891, dtype: object"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["# Print the Name Column\n","train_df['Name']"]},{"cell_type":"markdown","metadata":{},"source":["## Visualization (Assignment)"]},{"cell_type":"markdown","metadata":{},"source":["### Age -- Survived"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:13:34.450088Z","iopub.status.busy":"2024-04-01T07:13:34.449302Z","iopub.status.idle":"2024-04-01T07:13:34.932717Z","shell.execute_reply":"2024-04-01T07:13:34.930449Z","shell.execute_reply.started":"2024-04-01T07:13:34.450021Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","# Plot 1: Survivors vs Non Survivors\n","\n","# Creating a plot for the Survived Column\n","sns.countplot(x='Survived', data=train_df)\n","\n","plt.title('Survivors vs Non Survivors')\n","plt.xlabel('Survived')\n","plt.ylabel('Count')\n","plt.xticks([0, 1], ['No', 'Yes']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Plotting Passenger Class"]},{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:14:31.768779Z","iopub.status.busy":"2024-04-01T07:14:31.768341Z","iopub.status.idle":"2024-04-01T07:14:32.062495Z","shell.execute_reply":"2024-04-01T07:14:32.060660Z","shell.execute_reply.started":"2024-04-01T07:14:31.768690Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","\n","# Make the plot for Pclass here:\n","sns.countplot(x='Pclass', data=train_df)\n","\n","plt.title('Count of Passengers In each Passenger Class')\n","plt.xlabel('Passenger Class')\n","plt.ylabel('Count')\n","plt.xticks([0, 1, 2], ['1st', '2nd', '3rd']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try it for \"Embarked\""]},{"cell_type":"code","execution_count":17,"metadata":{"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","# Pin the bar order explicitly. Embarked is a string column, so without `order`\n","# seaborn infers the bar order from the data (the first rows are S, C, ...),\n","# and hard-coded C/Q/S tick labels would then be attached to the wrong bars.\n","embarked_order = ['C', 'Q', 'S']\n","sns.countplot(x='Embarked', data=train_df, order=embarked_order)\n","plt.title('Count of Passengers by Embarkation Point')\n","plt.xlabel('Embarkation Point')\n","plt.ylabel('Count')\n","# Tick positions and labels are derived from the same list that orders the bars.\n","plt.xticks(range(len(embarked_order)), embarked_order)\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Making a histogram for \"Fare\""]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","sns.histplot(train_df['Fare'], bins=20, color='orange')\n","plt.title('Distribution of Passenger Fares')\n","plt.xlabel('Fare')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Here is the histogram for \"Fare\"; refer to it after you have tried it yourself:"]},{"cell_type":"code","execution_count":19,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:18:24.402882Z","iopub.status.busy":"2024-04-01T07:18:24.402274Z","iopub.status.idle":"2024-04-01T07:18:24.798062Z","shell.execute_reply":"2024-04-01T07:18:24.796669Z","shell.execute_reply.started":"2024-04-01T07:18:24.402828Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["sns.histplot(train_df['Fare'], bins=20, color='orange')\n","plt.title('Distribution of Passenger Fares')\n","plt.xlabel('Fare')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Make a histogram for \"Age\" (Assignment)"]},{"cell_type":"code","execution_count":20,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:19:53.874413Z","iopub.status.busy":"2024-04-01T07:19:53.873686Z","iopub.status.idle":"2024-04-01T07:19:54.244996Z","shell.execute_reply":"2024-04-01T07:19:54.243521Z","shell.execute_reply.started":"2024-04-01T07:19:53.874351Z"},"trusted":true},"outputs":[{"data":{"image/png":"iVBORw0KGgoAAAANSUhEUgAAAq8AAAIcCAYAAADYP0dGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAABDWUlEQVR4nO3deVxUhR738e8IAoO4gmhabhmuiIiipqbyWFlqi0vdzGx5CkttuVkuZbaYmVm2aRYterv2ZNe1zLLbnrkmpmKigktQlDHmEs4AMpznD2OuCCgOzHLg8369eCVnzpzf7/wc6OuZc+ZYDMMwBAAAAJhADV83AAAAAJQX4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwA4gz/cu8UfegAAf0R4BWAqt9xyi9q0aeP6atu2rWJjYzV06FC9++67KigoKLZ+QkKCJk+eXO7tf/nll5o0adI515s8ebISEhLcrlOW48ePa+LEidqyZYtr2S233KJbbrmlwtuuLAUFBZo8ebJiY2PVpUsXbdy4scx1c3JyFBMTow4dOig7O9uLXQKoqgJ93QAAnK/27dvr8ccflyQ5nU4dO3ZM3333nWbOnKktW7bopZdeUo0ap/5tPnfuXIWFhZV72wsXLizXemPHjtXo0aPPu/dzSU1N1Ycffqhhw4a5lhXtq79Yu3atVqxYobFjx+rSSy9V+/bty1z3448/Vu3ateV0OrV06VLdc889XuwUQFVEeAVgOmFhYercuXOxZQkJCWrVqpVmzJihjz/+WNdcc40knTVYVUSzZs08st3StG7d2mu1yuPo0aOSpKFDh+qiiy4667rLly9Xnz59VLNmTS1ZskRjxoxx/cMCANzBbxAAVcaoUaPUqFEjLV682LXszLfzi4Jtp06d1KNHDz300EM6dOiQpFNvz2/evFmbN29WmzZttGnTJm3atElt2rTR4sWL1b9/f3Xp0kXr1q0rcdqAJJ08eVJPP/20unXrpq5du2rSpEn6888/XY+X9vZ/0faLahUdzR09erRr3TOfl5eXp3nz5mngwIGKjo7WFVdcoaSkJBUWFhar9eijjyopKUn9+vVTdHS0/vGPf2jHjh1nnaHT6dR7772nIUOGqFOnTurXr5+ef/555eXlSTp1ukTRPAcMGHDW0xnS09O1fft29evXT9dcc41+/fVXrV27tsR6f/zxh/75z38qPj5e3bp107Rp0/Tiiy+WmO+SJUs0aNAgdezYUf369dOrr74qp9N5
1v0BUPVw5BVAlVGjRg317NlTq1evVkFBgQIDi/+KS05O1sSJEzV27Fh169ZNv//+u2bPnq0JEyZo0aJFevzxx/Xwww9LOvVWfevWrfXTTz9JOnX6wdSpU5Wbm6vY2FitWrWqRP1PP/1UMTExevbZZ/Xnn3/q+eefV3p6uv7zn/8oICDgnP136NBB06ZN01NPPaVp06ape/fuJdYxDEN33323tm3bpvHjx6tt27batGmTXnrpJWVmZmr69OmudT/77DNdfPHFmjp1qgzD0KxZs3Tvvffqq6++KrOfadOm6cMPP9Rdd92lrl27ateuXZo3b55SU1P11ltvaezYsWrcuLHmz5+vuXPnqmXLlmXuz7Jly1SvXj31799fQUFBat68ud5//3317dvXtU5+fr5uvfVW2e12PfLIIwoLC1NSUpJSU1PVsGFD13pvvPGGXnzxRY0aNUpTpkxRamqqXn31Vf3222965plnzjlbAFUH4RVAlRIREaGTJ0/q6NGjioiIKPZYcnKyQkJClJiYqKCgIElSvXr1lJKSIsMw1Lp1a9f5sWeeljBy5EgNHDjwrLXr16+vt99+W6Ghoa7vx40bp++++079+/c/Z+9hYWGuUwRat25d6ukC3333ndavX685c+Zo0KBBkqRevXopJCREL7/8skaPHq1LLrlE0qkLq95++23XPp04cUKTJk1SamqqOnbsWGLb6enpWrp0qSZMmKDExETXtiMjIzVx4kR999136tu3r+uUiXbt2unCCy8sdV8KCgr00UcfafDgwa5ZX3/99a7AecEFF0iSPvroI+3fv1/Lli1z9dSjRw8NGDDAta2//vpLr732mm688UZNnTpVktS7d2/Vq1dPU6dO1e233+7aZwBVH6cNAKhSij5iymKxlHisW7ducjgcGjx4sF544QVt2bJFvXv31vjx40td/3Tt2rU7Z+2+ffu6gqt06pSFwMBA/fDDD+e5F2XbvHmzAgMDSwTponN8N2/e7Fp2ehiXpEaNGkmSHA5HmduW5ArFRQYNGqSAgABt2rSp3H1+8803stlsGjBggI4fP67jx48rISFBhYWFWrJkiWu9jRs36qKLLioWpsPCwoqF/R9//FG5ublKSEhQQUGB66votIJ169aVuy8A5seRVwBVyqFDhxQSEqJ69eqVeCw2NlZJSUlauHChFixYoKSkJEVEROjuu+8+50dRnR5Ky3L629zSqdMY6tevr+PHj5/XPpzNsWPHVL9+/RJv+xfV/uuvv1zLrFZriX4kFTs39sxtn76tIoGBgapfv36xbZ/LsmXLJEm33XZbiceWLl2qsWPHKjAwUEeOHFF4eHiJdU5fVnSBWNHR4DP98ccf5e4LgPkRXgFUGQUFBdq0aZO6dOlS5jmdffr0UZ8+feRwOLRx40a9++67evrppxUTE6NOnTpVqH5RyCridDpLhLMzLzCy2+3nVaNu3bo6cuSInE5nsX0sCnD169c/z66Lb1uSsrOz1bRpU9fykydP6siRI+Xets1m03fffVfqqRbbtm3TnDlz9PXXX+vyyy9Xo0aNdPDgwRLbOHz4sOvPderUkSQ9//zzatGiRYl1zzw9BEDVxmkDAKqMDz74QNnZ2brppptKfXzWrFkaNmyYDMOQ1WpV//79XTckyMrKkqQKfYzTunXrit0k4bPPPlNBQYHrwquwsDD9/vvvxZ6TnJxc7PtzXdgVHx+vgoICrVmzptjyjz76SJIUFxfndv/x8fGSpNWrVxdbvnr1ajmdznJv+8MPP1RBQYFuvfVWde/evdjXrbfeqrCwMNcnQsTHx+uXX35Ramqq6/m5ubnFPpUgJiZGNWvW1KFDhxQdHe36CgwM1Jw5c/TLL7+4vc8AzIcjrwBMJycnR9u2bZN06i3wI0eO6Pvvv9cHH3yga665RldccUWpz+vRo4cWLFigyZMn65prrtHJkyf11ltvqV69eurRo4ekU0f5fvzxR23YsOG8PyM2Oztb9957r265
5RYdPHhQc+bMUa9evdSzZ09JUv/+/fXVV19p5syZSkhI0JYtW7Ry5cpi26hdu7akU+eM1q1bV23bti32+GWXXabu3btr6tSpOnTokNq2bavNmzfrzTff1PXXX1+hz4Rt3bq1rr/+er3yyityOBzq1q2bUlNTNXfuXHXv3l19+vQp13aWL1+uDh06lHqUNCQkRFdeeaWWL1+uzMxMDR48WElJSRo3bpzuv/9+1alTRwsWLNDhw4fVpEkTSaeOJt955516+eWXlZOTo+7du+vQoUN6+eWXZbFYSswIQNVGeAVgOrt27dKNN94o6dSFWbVq1VJUVJSeeOIJjRgxoszn9e3bV88//7zeeecd10VacXFxevfdd13nyN58883auXOn7rrrLs2cOVORkZHl7mvkyJH666+/NG7cOAUFBWnIkCF6+OGHXReDDRs2TBkZGVqxYoUWL16sbt266ZVXXil2pPiSSy7R4MGD9d5772nt2rX6+OOPi9WwWCx644039Morr2jhwoX6888/deGFF+rBBx/U7bffXu5eyzJjxgw1b95cy5Yt05tvvqnIyEiNHj1aY8eOLddR6e3btys9PV0TJ04sc53rrrtOy5Yt0wcffKCHHnpIb7/9tmbMmKEnnnhCgYGBuuaaa1SvXj0dOHDA9ZwHHnhADRs21P/7f/9Pb731lurWrauePXvqwQcfdAV+ANWDxSi6NBcAAC9LS0vT/v37dcUVVxT7xIfhw4ercePGmjt3rg+7A+CPOPIKAPAZu92u+++/XyNHjtTll18up9OpTz75RDt37tRDDz3k6/YA+CGOvAIAfGrNmjV6++23tW/fPhmGofbt2+uee+5R7969fd0aAD9EeAUAAIBp8FFZAAAAMA3CKwAAAEyD8AoAAADTqBafNlBQUKBjx44pODi4QnfPAQAAgGcUFhYqLy9PdevWVWBg2RG1WoTXY8eOlXrvbAAAAPiXFi1aKDw8vMzHq0V4DQ4OlnRqGFar1SM1nE6n9u7dq6ioqHPem7y6YTZlYzalYy5lYzZlYzZlYzZlYzal88VcHA6HDh486MptZakW4bXoVAGr1arQ0FCP1HA6nZKk0NBQXvxnYDZlYzalYy5lYzZlYzZlYzZlYzal8+VcznWKJyeAAgAAwDQIrwAAADANwisAAABMg/AKAAAA0yC8AgAAwDT8Irzm5+dr8ODB2rRpk2tZZmambrvtNnXu3FlXX321vv/++2LPWb9+vQYPHqyYmBiNHj1amZmZ3m4bAAAAXubz8JqXl6cHH3xQaWlprmWGYWjcuHGKiIjQsmXLdO2112r8+PHKysqSJGVlZWncuHEaOnSoli5dqgYNGmjs2LEyDMNXuwEAAAAv8Gl4TU9P1w033KCMjIxiyzdu3KjMzEw99dRTuvjiizVmzBh17txZy5YtkyQtWbJEHTt21B133KFLLrlEM2fO1K+//qrNmzf7YjcAAADgJT4Nr5s3b1b37t31wQcfFFu+fft2tW/fvtgNBeLi4rRt2zbX4127dnU9ZrVa1aFDB9fjAAAAqJp8eoetkSNHlro8OztbkZGRxZaFh4fr999/L9fjZXE6na47RlS2ou16avtmxmzKxmxKx1zKxmzKxmzKxmzKxmxK54u5lLeWX94e1uFwKCgoqNiyoKAg5efnl+vxsuzdu7dyGy1FSkqKx2uYFbMpG7MpHXMpG7MpG7MpG7MpG7MpnT/OxS/Da3BwsI4ePVpsWX5+vkJCQlyPnxlU8/PzVadOnbNuNyoqqtipCJXJ6XQqJSVF0dHR3Bv5DMymbMymdMylbMymbMymbMymbMymdL6Yi91uL9eBRr8Mr40aNVJ6enqxZTabzXWqQKNGjWSz2Uo83q5du7NuNyAgwON/Ad6oYVbMpmzMpnTMpWzMpmzMpmzMpmzMpnTenEt56/j8o7JKExMTo59++km5ubmuZcnJyYqJiXE9npyc7HrM4XBo165drscBAABQNflleI2Pj9cFF1yg
KVOmKC0tTUlJSdqxY4eGDx8uSRo2bJi2bt2qpKQkpaWlacqUKbrwwgvVvXt3H3cOAAAAT/LL8BoQEKDXXntN2dnZGjp0qD766CPNmzdPTZo0kSRdeOGFevXVV7Vs2TINHz5cR48e1bx582SxWHzcOQAAADzJb8553bNnT7HvmzdvrkWLFpW5ft++fdW3b19PtwUAAAA/4pdHXgEAAIDS+M2RV6C6y8jIKPEpGp4UERGhZs2aea0eAACVgfAK+IGMjAy179BeuY7cc69cSUKsIdqzew8BFgBgKoRXwA/YbDblOnLVLrGdQpt45kYap7Nn2ZWalCqbzUZ4BQCYCuEV8COhTUJVu0VtX7cBAIDf4oItAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKYR6OsGAPhOamqq12pFRESoWbNmXqsHAKiaCK9ANZR/NF+ySKNGjfJazRBriPbs3kOABQBUCOEVqIYK7AWSIbUY3ULhrcI9Xs+eZVdqUqpsNhvhFQBQIYRXoBqzNraqdovavm4DAIByI7wC8JrynmPrdDq1d+9eFRYWKiAgwK1anGMLAFUT4RWAx3GOLQCgshBeAXjc+Z5jaxiGHA6HrFarLBbLedfjHFsAqLoIrwC8przn2BqGIUuORWFhYW6FVwBA1cVNCgAAAGAahFcAAACYBuEVAAAApkF4BQAAgGkQXgEAAGAahFcAAACYBuEVAAAApkF4BQAAgGkQXgEAAGAahFcAAACYBuEVAAAApuHX4fW3337TmDFj1KVLFyUkJGjhwoWux3bt2qURI0YoJiZGw4YN086dO33XKAAAALzCr8PrAw88oNDQUC1fvlyPPPKIXnrpJX3++eey2+1KTExU165dtXz5csXGxmrMmDGy2+2+bhkAAAAe5Lfh9dixY9q2bZvuuecetWjRQgMGDFCfPn20YcMGffLJJwoODtbEiRN18cUX69FHH1WtWrW0Zs0aX7cNAAAADwr0dQNlCQkJkdVq1fLlyzVhwgRlZmZq69ateuCBB7R9+3bFxcXJYrFIkiwWi7p06aJt27Zp6NChZW7T6XTK6XR6pN+i7Xpq+2bGbMpWNJPCwkJJkmEYMgzD43UNGUV/8M96xv/+63ru+dT7u4Ynf+Z9hZ+nsjGbsjGbsjGb0vliLuWt5bfhNTg4WNOmTdP06dP17rvvyul0aujQoRoxYoS+/PJLtW7dutj64eHhSktLO+s29+7d68mWJUkpKSker2FWzKZs6enpkiSHwyFLjsXj9XJzc0/Vy3MoJyfHb+vlnHCvN4fDIenUz3yNGn77BlOF8PNUNmZTNmZTNmZTOn+ci9+GV0nat2+f+vfvr9tvv11paWmaPn26evbsKYfDoaCgoGLrBgUFKT8//6zbi4qKUmhoqEd6dTqdSklJUXR0tAICAjxSw6yYTdmKZlP0jzGr1aqwsDCP1z0RcuJUvWA/rWecCq5htcIkN7K8YT115DUqKkqdO3c+/w34MX6eysZsysZsysZsSueLudjt9nIdaPTb8LphwwYtXbpU3377rUJCQhQdHa1D
hw5p/vz5uuiii0oE1fz8fIWEhJx1mwEBAR7/C/BGDbNiNmUrOjposVhcp8N4kqUoEVrkl/Vcpwq42V/Rc6rya64q71tFMZuyMZuyMZvSeXMu5a3jt++n7dy5U82bNy8WSNu3b6+srCw1atRINput2Po2m02RkZHebhMAAABe5LdHXiMjI/Xzzz8rPz/fdYrA/v37deGFFyomJkZvvvmmDMOQxWKRYRjaunWr7r77bh93jaokIyOjxD+SKpvT6dTevXtdF2wBAICz89vwmpCQoNmzZ2vq1Km65557dODAAb3++uv65z//qYEDB+qFF17QjBkz9I9//EOLFy+Ww+HQVVdd5eu2UUVkZGSoTds2ynXkerXuuc7bBgCguvPb8Fq7dm0tXLhQM2bM0PDhw9WgQQPdc889uvHGG2WxWPTGG2/o8ccf13/+8x+1adNGSUlJHrsYC9WPzWZTriNX7RLbKbSJ515XhmHI4XDIke7QweUHVVBQ4LFaAABUBX4bXiWpdevWWrBgQamPderUSStWrPByR6huQpuEqnaL2h7bvmEYsuRYZBz1/GetAgBQFfjtBVsAAADAmQivAAAAMA3CKwAAAEyD8AoAAADTILwCAADANAivAAAAMA3CKwAAAEyD8AoAAADTILwCAADANAivAAAAMA3CKwAAAEyD8AoAAADTILwCAADANAivAAAAMA3CKwAAAEyD8AoAAADTILwCAADANAivAAAAMA3CKwAAAEyD8AoAAADTILwCAADANAivAAAAMA3CKwAAAEwj0NcNAICnpKameq1WRESEmjVr5rV6AFBdEV4BVDn5R/MlizRq1Civ1QyxhmjP7j0EWADwMMIrgCqnwF4gGVKL0S0U3irc4/XsWXalJqXKZrMRXgHAwwivAKosa2Orareo7es2AACViAu2AAAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACmQXgFAACAaRBeAQAAYBqEVwAAAJgG4RUAAACm4dfhNT8/X08++aS6deumSy+9VHPmzJFhGJKkXbt2acSIEYqJidGwYcO0c+dOH3cLAAAAT/Pr8Pr0009r/fr1evvtt/XCCy/oP//5jz744APZ7XYlJiaqa9euWr58uWJjYzVmzBjZ7XZftwwAAAAPCvR1A2U5evSoli1bpgULFqhTp06SpDvuuEPbt29XYGCggoODNXHiRFksFj366KP67rvvtGbNGg0dOtTHnQMAAMBT/Da8JicnKywsTPHx8a5liYmJkqTHHntMcXFxslgskiSLxaIuXbpo27ZtZw2vTqdTTqfTI/0WbddT2zczM86mqFfDMFynqnjEmZs25Nl6rjKGf9cz/vdfo8SQPFCvgopqePJ3TBEz/jx5C7MpG7MpG7MpnS/mUt5afhteMzMz1bRpU61cuVKvv/66Tp48qaFDh+qee+5Rdna2WrduXWz98PBwpaWlnXWbe/fu9WTLkqSUlBSP1zArM82m6LXicDhkybF4vF5ubu6penkO5eTkUO9vOSfc683b++dwOCSdet3UqOGds7HM9PPkbcymbMymbMymdP44F78Nr3a7XT///LMWL16smTNnKjs7W9OmTZPVapXD4VBQUFCx9YOCgpSf
n3/WbUZFRSk0NNQj/TqdTqWkpCg6OloBAQEeqWFWZpxNYWGhJMlqtSosLMxzhYxTAS0kJORUvWAP1/vbiZAT/l3v77mE1QqT3Pi3g7f3z7CeOvIaFRWlzp07e7SWGX+evIXZlI3ZlI3ZlM4Xc7Hb7eU60Oi34TUwMFA5OTl64YUX1LRpU0lSVlaW3n//fTVv3rxEUM3Pz3cFgLIEBAR4/C/AGzXMykyzKerTYrG4Tk/xhBJviVvk0Xr/K2Px63quubjZn9f37+8a3nyNm+nnyduYTdmYTdmYTem8/XutPPz20wYaNmyo4OBgV3CVpJYtW+q3335To0aNZLPZiq1vs9kUGRnp7TYBAADgRX4bXmNiYpSXl6cDBw64lu3fv19NmzZVTEyMfvzxR9dFEoZhaOvWrYqJifFVuwAAAPACvw2vrVq1Ur9+/TRlyhTt3r1ba9euVVJSkm666SYNHDhQx48f14wZM5Senq4ZM2bI4XDoqquu8nXbAAAA8CC/Da+S9Pzzz6tZs2a66aabNGnSJN1888265ZZbFBYWpjfeeEPJyckaOnSotm/frqSkJI9djAUAAAD/4LcXbElS7dq19dxzz5X6WKdOnbRixQovdwQAAABf8usjrwAAAMDp3AqvGzdu9MpdawAAAIDTuXXawP3336+aNWtq4MCBGjx4sMc/lBsAAACQ3Ayv69at07p167RmzRolJiYqLCxMV111lQYNGqT27dtXdo8AAACAJDfDa2BgoPr27au+ffuqoKBA69ev11dffaWRI0eqUaNGGjJkiIYOHaomTZpUdr8AAACoxip0wVZ+fr6+/fZbrV69Wp9++qnq16+vhIQEHTx4UIMGDdKiRYsqq08AAADAvSOvX3zxhdasWaNvvvlGNWvW1JVXXql58+apa9eurnXee+89zZkzR6NGjaq0ZgEAAFC9uRVeJ02apAEDBmjOnDnq1auXAgICSqzTsWNH3X777RVuEAAAACjiVnhdv369cnJydPz4cVdw/eSTT9StWzc1bNhQkhQTE6OYmJjK6xQAAADVnlvnvG7dulWXX365Vq1a5Vr27rvv6uqrr1ZycnKlNQcAAACczq3wOmvWLN1999267777XMsWL16sO++8U88880ylNQcAAACczq3wevDgQQ0cOLDE8quuukrp6ekVbgoAAAAojVvhtVWrVvr0009LLP/qq6/UrFmzCjcFAAAAlMatC7YeeOABjR07VuvWrVOHDh0kSXv27NGWLVv06quvVmqDAAAAQBG3jrxedtllWrFihdq3b6/9+/crIyNDbdu21erVq9W3b9/K7hEAAACQ5OaRV0m65JJLNHny5MrsBQAAADgrt8Lr8ePH9c477yglJUUFBQUyDKPY4++++26lNAcAAACczq3wOnHiRKWkpGjIkCEKCwur7J4AAACAUrl9h61FixapU6dOld0PAAAAUCa3Lthq1KiRatRw66kAAACA29w+beCJJ57Qfffdp+bNm6tmzZrFHm/SpEmlNAcAAACczq3weu+990qSEhMTJUkWi0WSZBiGLBaLUlNTK6k9AAAA4H/cCq9ffvllZfcBAAAAnJNbJ642bdpUTZs2ld1u165du1S/fn0VFhaqSZMmatq0aWX3CAAAAEhy88jrsWPHdP/992vz5s2SpM8++0wzZsxQZmamkpKSCLAAAADwCLeOvD799NOyWq3auHGjgoODJUnPPPOMGjdurKeffrpSGwQAAACKuBVe165dqwcffFB16tRxLWvQoIGmTJmiH374odKaAwAAAE7n9oe15uXllVj2559/KjDQrTMRAAAAgHNyK7wOHjxYM2bMUFpamiwWi+x2uzZu3KjHHntMV199dWX3CAAAAEiqwE0K5syZo6FDh+rkyZO69tprFRAQoBEjRmjixImV3SMAAAAgyc3wGhQUpMmTJ+uBBx5QZmamnE6nLrroItWqVauy+wMAAABc3AqvpV2UtWvXLtefu3Xr5n5HAAAAQBnc
Cq+33HJLqcuDgoLUsGFD7sAFAAAAj3ArvO7evbvY906nUxkZGZo+fbqGDBlSKY0BAAAAZ3L7o7JOFxAQoJYtW2ry5Ml6+eWXK2OTAAAAQAmVEl6LHD58WMePH6/MTQIAAAAubp02MGXKlBLLTpw4ofXr12vgwIEVbgoAAAAoTaXdDqtevXqaNGmSrr322sraJAAAAFCMW+F15syZld0HAAAAcE5uhde5c+eWe93x48e7UwIAAAAowa3w+vPPP2vNmjWqV6+eOnbsqKCgIO3evVsZGRnq3LmzAgNPbdZisVRqswAAAKje3L497JAhQ/Tkk0+qZs2aruWzZs3SsWPH9Mwzz1RagwAAAEARtz4q65NPPtGdd95ZLLhK0g033KBPPvmkUhoDAAAAzuRWeG3UqJHWrl1bYvlnn32miy66qMJNAQAAAKVx67SBCRMm6IEHHtA333yjtm3bSpJSUlK0a9cuvf7665XaIAAAAFDErSOvl19+uZYvX66oqCjt27dPv/76q+Lj4/XZZ58pPj6+snsEAAAAJFXgJgVt2rTRlClTdOzYMYWFhalGjRp8ugAAAAA8yq0jr4ZhaP78+erevbt69uyprKwsPfzww5o2bZry8/Mru0cAAABAkpvhdd68efroo4/07LPPKigoSJJ0/fXXa926dXruuecqtUEAAACgiFvhdcWKFXrqqafUv39/16kCvXr10qxZs/Tpp59WaoMAAABAEbfC6+HDhxUZGVlieZ06dWS32yvcFAAAAFAat8Jrjx499PbbbxdblpOTozlz5qh79+6V0hgAAABwJrfC6xNPPKFdu3apV69eysvL09ixY9W3b1/9+uuvmjp1amX3CAAAAEhy86Oy6tSpo6VLl2rDhg3av3+/CgoK1LJlS/Xu3Vs1ariVhwEAAIBzciu8Dh48WHPnzlXPnj3Vs2fPyu4JAAAAKJVbh0lr1KihkydPVnYvAAAAwFm5deS1X79+uv3229W/f381bdrU9VmvRcaPH18pzQEAAACncyu87tmzRx06dNAff/yhP/74o9hj3CIWAAAAnlLu8HrzzTdr/vz5qlOnjv79739LknJzcxUSEuKx5gAAAIDTlTu8JicnlzjP9dJLL9WHH36oiy66qNIbAwCzSU1N9XgNp9OpvXv3qrCwUI0aNVKzZs08XhMA/Ilbpw0UMQyjsvoAANPKP5ovWaRRo0Z5tW6INUR7du8hwAKoVioUXgEAUoG9QDKkFqNbKLxVuEdrGYYhh8Mh46ih3Um7ZbPZCK8AqhXCKwBUEmtjq2q3qO3RGoZhyJJjkWHlnS8A1dN5hddPP/1UYWFhru8LCwv1+eefq0GDBsXWu+666yqlOQAAAOB05Q6vTZo00TvvvFNsWXh4uBYtWlRsmcViIbwCAADAI8odXr/66itP9gEAAACck1u3hwUAAAB8gfAKAAAA0yC8AgAAwDQIrwAAADANwisAAABMg/AKAAAA0yC8AgAAwDQIrwAAADANwisAAABMg/AKAAAA0yC8AgAAwDRME14TExM1efJk1/e7du3SiBEjFBMTo2HDhmnnzp0+7A4AAADeYIrwunr1an377beu7+12uxITE9W1a1ctX75csbGxGjNmjOx2uw+7BAAAgKf5fXg9evSonnvuOUVHR7uWffLJJwoODtbEiRN18cUX69FHH1WtWrW0Zs0aH3YKAAAAT/P78Dpr1ixde+21at26tWvZ9u3bFRcXJ4vFIkmyWCzq0qWLtm3b5qMuAQAA4A2Bvm7gbDZs2KAtW7Zo1apVeuKJJ1zLs7Ozi4VZSQoPD1daWtpZt+d0OuV0Oj3Rqmu7Rf/NyMiQzWbzSK3SREREqFmzZl6rdz7OnI0ZFPVqGIYMw/BcoTM3bciz9VxlDP+uZ/zvv0aJIXmgXgV5td4Zm/fk7zWzMePvGm9hNmVjNqXzxVzKW8tvw2teXp4ef/xxTZs2TSEhIcUeczgcCgoKKrYsKChI+fn5Z93m3r17K73P
M6WkpOj333/XsOHDlJeb5/F6RYJDgrVs6TI1btzYazXPV0pKiq9bKLei14rD4ZAlx+Lxerm5uafq5TmUk5NDvb/lnHCvN7PsX0U4HA5Jp16rNWr4/ZtoXmWm3zXexmzKxmxK549z8dvwOnfuXHXs2FF9+vQp8VhwcHCJoJqfn18i5J4pKipKoaGhldpnEafTqZSUFEVHR6uwsFB5uXlqm9hWoRd4pt7p7L/ZtTtptyIjI9W5c2eP1ztfp88mICDA1+2US2FhoSTJarUqLCzMc4WMUwGt6LVrDfZwvb+dCDnh3/X+nktYrTDJjX87+P3+VcTfs7FarZJO/V7zx597XzDj7xpvYTZlYzal88Vc7HZ7uQ40+m14Xb16tWw2m2JjYyXJFVY/++wzDR48uMRb8jabTZGRkWfdZkBAgMf/Ak6vUatJLdVuUduj9SS5zv31xv5VhL/3d7qiPi0Wi2u+nlDiLXGLPFrvf2Usfl3PNRc3+/P3/auIM18zZvq58hZmUjZmUzZmUzpvzqW8dfw2vP773/9WQUGB6/vnn39ekvTQQw/phx9+0JtvvinDMGSxWGQYhrZu3aq7777bV+0CAADAC/w2vDZt2rTY97Vq1ZIkNW/eXOHh4XrhhRc0Y8YM/eMf/9DixYvlcDh01VVX+aJVAAAAeIkpz/IPCwvTG2+8oeTkZA0dOlTbt29XUlKSx85nBQAAgH/w2yOvZ3r22WeLfd+pUyetWLHCR90AAADAF0x55BUAAADVE+EVAAAApkF4BQAAgGkQXgEAAGAahFcAAACYBuEVAAAApkF4BQAAgGkQXgEAAGAahFcAAACYBuEVAAAApkF4BQAAgGkQXgEAAGAahFcAAACYBuEVAAAAphHo6wYAAO5LTU31Wq28vDwFBwd7rV5ERISaNWvmtXoAzIHwCgAmlH8sX7JIo0aN8l5RiyTDe+VCrCHas3sPARZAMYRXADChAnuBZEgtRrdQeKtwj9c7vOOwDi4/6LV69iy7UpNSZbPZCK8AiiG8AoCJWRtbVbtFbY/XsWfZvVoPAMrCBVsAAAAwDcIrAAAATIPwCgAAANMgvAIAAMA0CK8AAAAwDcIrAAAATIPwCgAAANMgvAIAAMA0CK8AAAAwDcIrAAAATIPwCgAAANMgvAIAAMA0CK8AAAAwDcIrAAAATCPQ1w0A5ZGRkSGbzea1eqmpqV6rBQAAyo/wCr+XkZGhNm3bKNeR6/Xa+fn5Xq8JAADKRniF37PZbMp15KpdYjuFNgn1Ss3DOw7r4PKDKigo8Eo9AABQPoRXmEZok1DVblHbK7XsWXav1AEAAOeHC7YAAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGnzaAADAb53PDUOcTqf27t2rwsJCBQQEnHetiIgINWvW7LyfB8C7CK8AAL+TfzRfskijRo3yWs0Qa4j27N5DgAX8HOEVAOB3CuwFkiG1GN1C4a3Cy/UcwzDkcDhktVplsVjOq549y67UpFTZbDbCK+DnCK8AAL9lbWwt981JDMOQJceisLCw8w6vAMyDC7YAAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGoRXAAAAmAbhFQAAAKZBeAUAAIBpEF4BAABgGn4dXg8dOqT77rtP8fHx6tOnj2bOnKm8vDxJUmZmpm677TZ17txZV199tb7//nsfdwsAAABP89vwahiG7rvvPjkcDr333nt68cUX9fXXX+ull16SYRgaN26cIiIitGzZMl177bUaP368srKyfN02
AAAAPCjQ1w2UZf/+/dq2bZvWrVuniIgISdJ9992nWbNm6bLLLlNmZqYWL16s0NBQXXzxxdqwYYOWLVume++918edAwAAwFP89shrw4YN9dZbb7mCa5GcnBxt375d7du3V2hoqGt5XFyctm3b5uUuAQAA4E1+e+S1Tp066tOnj+v7wsJCLVq0SD169FB2drYiIyOLrR8eHq7ff//9rNt0Op1yOp0e6bdou6fXMAxDhmF4pN7pimp4cv8q4vTZVOT53pqnJBkyiv7g2ZpnbtrT9VxlvLR/7tYz/vdfo8SQPFCvgrxaj9fM2Z70v+ec5+vG33+PVlRFfw9XZcymdL6YS3lr+W14PdPs2bO1a9cuLV26VAsXLlRQUFCxx4OCgpSfn3/Wbezdu9eTLUqSUlJSXHUcDocsORaP13Q4HJJO7V+NGn57MF0pKSluPc/b85Sk3NzcUzXzHMrJyaGej+rlnHCvN7Psn5lqmqmeO68bs/werSh3fw9XB8ymdP44F1OE19mzZ+tf//qXXnzxRUVFRSk4OFhHjx4ttk5+fr5CQkLOup2oqKhipxpUJqfTqZSUFEVHR6uwsFCSZLVaFRYW5pF6pzOsp44YREVFqXPnzh6vd75On01AQMB5P9/b85SkEyEnTtUM9nBN49T/aIteux6v9zev7Z+79f6eS1itMMmNf6/4/f5VBK+ZslXgdePvv0crqqK/h6syZlM6X8zFbreX60Cj34fX6dOn6/3339fs2bN15ZVXSpIaNWqk9PT0YuvZbLYSpxKcKSAgwON/AafXsFgsslg8f6SwqIY39q8i3O3P2/OUJEvR//ks8mjNEm9terje/8p4Z//creeai5v9+fv+VQSvmbJV5HVjlt+jFVXV968imE3pvDmX8tbx6/dG5s6dq8WLF2vOnDkaNGiQa3lMTIx++ukn19tKkpScnKyYmBhftAkAAAAv8dvwum/fPr322mu66667FBcXp+zsbNdXfHy8LrjgAk2ZMkVpaWlKSkrSjh07NHz4cF+3DQAAAA/y29MGvvzySzmdTs2fP1/z588v9tiePXv02muv6dFHH9XQoUPVvHlzzZs3T02aNPFRtwAAAPAGvw2viYmJSkxMLPPx5s2ba9GiRV7sCAAAAL7mt6cNAAAAAGcivAIAAMA0CK8AAAAwDcIrAAAATIPwCgAAANMgvAIAAMA0CK8AAAAwDcIrAAAATIPwCgAAANMgvAIAAMA0/Pb2sAAAeFtqaqrXakVERKhZs2ZeqwdUFYRXAEC1l380X7JIo0aN8lrNEGuI9uzeQ4AFzhPhFQBQ7RXYCyRDajG6hcJbhXu8nj3LrtSkVNlsNsIrcJ4IrwAA/M3a2KraLWr7ug0AZ0F4hVsyMjJks9nKta7T6dTevXtVWFiogICA867lzXPQAACAfyO84rxlZGSoTds2ynXkerVufn6+V+sBAAD/Q3jFebPZbMp15KpdYjuFNgk95/qGYcjhcMhqtcpisZx3vcM7Duvg8oMqKChwp10AAFCFEF7httAmoeU6N8wwDFlyLAoLC3MrvNqz7O60BwAAqiBuUgAAAADT4MhrFeKtC5u4gAoAAPgK4bUK8MWHa0tcQAUAALyP8FoFePvDtbmACgAA+ArhtQrx1odrcwEVAADwFcIrAAA+4q1rCIpuFtOgQQO1bNnSKzUBTyG8AgDgZb66ViHEGqI9u/eoWbNmXq0LVCbCKwAAXubtaxUMw9CfB/7UwXcPymazEV5haoRXAAB8xFvXKhTd6RCoCrhJAQAAAEyD8AoAAADTILwCAADANAivAAAAMA3CKwAAAEyDTxsAAKAa8daNESQpIiKCj+VCpSO8AgBQDZw8ftLrN0bgpgjwBMIrAADVgLdvjGDPsis1KZWbIqDSEV4BAKhGvHVjBMBTuGALAAAApkF4BQAAgGkQXgEAAGAahFcAAACYBuEVAAAApkF4BQAAgGkQXgEAAGAahFcAAACYBuEVAAAApkF4BQAAgGkQ
XgEAAGAahFcAAACYBuEVAAAApkF4BQAAgGkQXgEAAGAahFcAAACYBuEVAAAApkF4BQAAgGkQXgEAAGAahFcAAACYBuEVAAAApkF4BQAAgGkQXgEAAGAagb5uAAAAoDJkZGTIZrOd13OcTqf27t2rwsJCBQQEnNdzIyIi1KxZs/N6DiqO8AoAAEwvIyNDbdq2Ua4j12s1Q6wh2rN7DwHWywivAADA9Gw2m3IduWqX2E6hTULL/TzDMORwOGS1WmWxWMr9PHuWXalJqbLZbIRXLyO8AgCAKiO0Sahqt6hd7vUNw5Alx6KwsLDzCq/wHS7YAgAAgGlw5BUAAHhMampqlaoD3yO8AgCASpd/NF+ySKNGjfJu3fx8r9aD9xFeAQBApSuwF0iG1GJ0C4W3Cvd4vcM7Duvg8oMqKCjweC34FuEVAAB4jLWx9bwuoHKXPcvu8RrwD1ywBQAAANMgvAIAAMA0CK8AAAAwDcIrAAAATIPwCgAAANMgvAIAAMA0CK8AAAAwDcIrAAAATIPwCgAAANPgDlsAAABuSk1N9VqtvLw8BQcHe6WW0+nUH3/84ZVa58vU4TUvL09PPvmk/vvf/yokJER33HGH7rjjDl+3BQAAqrj8o/mSRRo1apT3ilokGd4rFxwSrNRdqWrZsqX3ipaDqcPrc889p507d+pf//qXsrKyNGnSJDVp0kQDBw70dWsAAKAKK7AXSIbUYnQLhbcK93i9wzsO6+Dyg16rdyLrhHYn7ZbNZiO8Vha73a4lS5bozTffVIcOHdShQwelpaXpvffeI7wCAACvsDa2qnaL2h6vY8+ye7WeYXjxEO95Mu0FW7t371ZBQYFiY2Ndy+Li4rR9+3YVFhb6sDMAAAB4immPvGZnZ6t+/foKCgpyLYuIiFBeXp6OHj2qBg0auJYXhdkTJ07I6XR6pJ+iGjk5OXI6nWrTpo0a5jVUyJ8hHql3OkuARYVtCtW4oLHq/lnXL+tZc60KOeneLLy9f96uac21qk5AHRltDL/+O/R2PTO9Zrxdj9dM2dx93Zhl/yrCGeRUcJvgKruPFannzuvGTPvnjpD8ELVp00aFhYX666+/PF5PknJzcyXpnAchLYY/Hxc+i5UrV+rll1/W119/7VqWmZmpAQMG6Ntvv1Xjxo1dyw8fPqyDBw/6oEsAAACcjxYtWig8vOzzek175DU4OFj5+fnFlhV9HxJS/F9PdevWVYsWLRQcHKwaNUx7pgQAAECVVVhYqLy8PNWte/Yjy6YNr40aNdKRI0dUUFCgwMBTu5Gdna2QkBDVqVOn2LqBgYFnTfAAAADwvbCwsHOuY9rDkO3atVNgYKC2bdvmWpacnKzo6GiOrgIAAFRRpk15VqtV1113nZ544gnt2LFDX3zxhd555x2NHj3a160BAADAQ0wbXiVpypQp6tChg2699VY9+eSTuvfee3XFFVd4vY+8vDw98sgj6tq1q3r37q133nnH6z34m/z8fA0ePFibNm1yLcvMzNRtt92mzp076+qrr9b333/vww6969ChQ7rvvvsUHx+vPn36aObMmcrLy5NUveciST///LP+7//9v4qNjVW/fv301ltvuR6r7rM5XWJioiZPnuz6fteuXRoxYoRiYmI0bNgw7dy504fded/nn3+uNm3aFPu67777JDGb/Px8Pfnkk+rWrZsuvfRSzZkzx/WZndV5NsuXLy/xmmnTpo3atm0rqXrP5rffftOYMWPUpUsXJSQkaOHCha7H/HIuBirsqaeeMoYMGWLs3LnT+O9//2vExsYan376qa/b8pnc3Fxj3LhxRlRUlLFx40bDMAyjsLDQGDJkiDFhwgQjPT3deP31142YmBjj119/9XG3nldYWGjccMMNxp133mns3bvX+OGHH4zLL7/cePbZZ6v1XAzDMJxOp3HFFVcYEyZMMA4cOGB88803RpcuXYyPPvqo2s/mdB9//LERFRVlTJo0yTAMwzhx4oTRq1cv49lnnzXS09ON
6dOnG5deeqlx4sQJH3fqPa+99poxZswY448//nB9HTt2jNkYhvHYY48ZV1xxhbF9+3Zj/fr1Rvfu3Y3333+/2s/G4XAUe71kZWUZl19+uTFjxoxqP5sbbrjBeOCBB4wDBw4Yn3/+uRETE2P897//9du5EF4r6MSJE0Z0dLQrpBmGYcybN88YNWqUD7vynbS0NOOaa64xhgwZUiy8rl+/3ujcuXOxF/ytt95qvPLKK75q1WvS09ONqKgoIzs727Vs1apVRu/evav1XAzDMA4dOmTcf//9xl9//eVaNm7cOOPxxx+v9rMpcuTIEeOyyy4zhg0b5gqvS5YsMRISEozCwkLDME79A+nyyy83li1b5stWvWrChAnGCy+8UGJ5dZ/NkSNHjPbt2xubNm1yLXvjjTeMyZMnV/vZnOn11183BgwYYOTl5VXr2Rw9etSIiooy9uzZ41o2fvx448knn/TbuZj6tAF/wJ2+itu8ebO6d++uDz74oNjy7du3q3379goNDXUti4uLK3bBXVXVsGFDvfXWW4qIiCi2PCcnp1rPRZIiIyP10ksvKSwsTIZhKDk5WT/88IPi4+Or/WyKzJo1S9dee61at27tWrZ9+3bFxcXJYrFIkiwWi7p06VKtZrNv3z61aNGixPLqPpvk5GSFhYUpPj7etSwxMVEzZ86s9rM53dGjR/Xmm29qwoQJCgoKqtazCQkJkdVq1fLly3Xy5Ent379fW7duVbt27fx2LoTXCjrXnb6qm5EjR+qRRx6R1Wottjw7O1uRkZHFloWHh+v333/3Zns+UadOHfXp08f1fWFhoRYtWqQePXpU67mcKSEhQSNHjlRsbKyuvPJKZiNpw4YN2rJli8aOHVtseXWfjWEYOnDggL7//ntdeeWVGjBggJ5//nnl5+dX+9lkZmaqadOmWrlypQYOHKj/83/+j+bNm6fCwsJqP5vTvf/++4qMjNTAgQMlVe+fqeDgYE2bNk0ffPCBYmJidNVVV+myyy7TiBEj/HYupv2cV3/hcDiKBVdJru/PvIlCdVbWnKrjjGbPnq1du3Zp6dKlWrhwIXP52yuvvCKbzaYnnnhCM2fOrPavmby8PD3++OOaNm1aiRuvVPfZZGVluWbw0ksv6ZdfftHTTz+t3Nzcaj8bu92un3/+WYsXL9bMmTOVnZ2tadOmyWq1VvvZFDEMQ0uWLNGdd97pWlbdZ7Nv3z71799ft99+u9LS0jR9+nT17NnTb+dCeK2g87nTV3UWHBxc4kh0fn5+tZvR7Nmz9a9//UsvvviioqKimMtpoqOjJZ0KbQ899JCGDRsmh8NRbJ3qNJu5c+eqY8eOxY7aFynr9051mU3Tpk21adMm1a1bVxaLRe3atVNhYaEefvhhxcfHV+vZBAYGKicnRy+88IKaNm0q6VTYf//999W8efNqPZsiKSkpOnTokAYNGuRaVp1/pjZs2KClS5fq22+/VUhIiKKjo3Xo0CHNnz9fF110kV/OhdMGKuj0O30VKetOX9VZo0aNZLPZii2z2Wwl3o6oyqZPn64FCxZo9uzZuvLKKyUxF5vNpi+++KLYstatW+vkyZNq2LBhtZ7N6tWr9cUXXyg2NlaxsbFatWqVVq1apdjY2Gr/upGkevXquc7Dk6SLL75YeXl51f5107BhQwUHB7uCqyS1bNlSv/32G6+bv61du1Zdu3YtdgvS6jybnTt3qnnz5sUCafv27ZWVleW3cyG8VhB3+iqfmJgY/fTTT8rNzXUtS05OVkxMjA+78p65c+dq8eLFmjNnTrF/7Vf3ufzyyy8aP368Dh065Fq2c+dONWjQQHFxcdV6Nv/+97+1atUqrVy5UitXrlRCQoISEhK0cuVKxcTE6Mcff3R9dqdhGNq6dWu1mc3atWvVvXv3YkfmU1NTVa9ePcXFxVXr2cTExCgvL08HDhxwLdu/f7+aNm1a7V83RXbs2KEuXboUW1adZxMZGamff/652BHW/fv368ILL/TbuZCu
Kog7fZVPfHy8LrjgAk2ZMkVpaWlKSkrSjh07NHz4cF+35nH79u3Ta6+9prvuuktxcXHKzs52fVXnuUinThXo0KGDHnnkEaWnp+vbb7/V7Nmzdffdd1f72TRt2lTNmzd3fdWqVUu1atVS8+bNNXDgQB0/flwzZsxQenq6ZsyYIYfDoauuusrXbXtFbGysgoODNXXqVO3fv1/ffvutnnvuOd15553VfjatWrVSv379NGXKFO3evVtr165VUlKSbrrppmo/myJpaWnFPr1DUrWeTUJCgmrWrKmpU6fqwIED+uqrr/T666/rlltu8d+5+OQDuqoYu91uTJw40ejcubPRu3dvY8GCBb5uyS+c/jmvhmEYBw8eNG6++WajY8eOxqBBg4x169b5sDvveeONN4yoqKhSvwyj+s6lyO+//26MGzfO6NKli9GrVy9j/vz5rs8UrO6zOd2kSZNcn/NqGIaxfft247rrrjOio6ON4cOHGz/99JMPu/O+vXv3GrfddpvRuXNno1evXsarr77qet1U99kcP37cePjhh43OnTsbPXv2ZDZniI6ONr777rsSy6vzbNLS0ozbbrvN6NKlizFgwABjwYIFfv2asRjG38eCAQAAAD/HaQMAAAAwDcIrAAAATIPwCgAAANMgvAIAAMA0CK8AAAAwDcIrAAAATIPwCgAAANMgvAIAAMA0CK8A4IeWL1+uNm3aaMmSJb5uBQD8CuEVAPzQ6tWr1axZM3344Ye+bgUA/ArhFQD8zOHDh7VhwwaNGzdOW7ZsUWZmpq9bAgC/QXgFAD+zZs0a1a5dW9dcc40iIyOLHX3Nzc3Vo48+qri4OPXp00dLlixR+/bt9csvv0iSfvvtN919992KiYlRQkKC5s6dK6fT6atdAYBKF+jrBgAAxa1evVr9+vVTjRo1lJCQoJUrV2rcuHGyWCx6+umn9eOPP+rtt99WQUGBHn30UVc4NQxD48ePV9u2bbVixQplZ2dr2rRpslgsGjdunI/3CgAqB0deAcCP/Pbbb9q6dasGDBggSbriiiuUmZmp5ORknThxQitXrtRjjz2mzp07q2vXrpo6darruRs3blRWVpamT5+uVq1aqXv37po0aZLeffddX+0OAFQ6jrwCgB9ZvXq1goOD1bt3b0lSfHy86tatqxUrVig4OFgnT55UdHS0a/3Y2FjXn/ft26ejR48qLi7OtaywsFC5ubk6cuSI6tev770dAQAPIbwCgB9ZvXq1cnNziwVQp9OpNWvWaPjw4SXWNwzD9eeCggK1atVKr732Won1ateu7ZmGAcDLCK8A4CcOHDigXbt2aerUqerevbtreXp6uv75z3/q559/Vs2aNbVz50716NFDkrRz507Xei1btlRWVpYaNGjgCqvr1q3T8uXL9dxzz3l3ZwDAQzjnFQD8xOrVq1WvXj3deOONioqKcn1dffXVat26tVatWqWhQ4dqxowZ2r59u7Zt26YZM2ZIkiwWi3r37q2mTZvq4Ycf1p49e7RlyxY99thjslqtCggI8PHeAUDlILwCgJ9YvXq1hgwZoqCgoBKP3XTTTVq/fr3GjBmjNm3a6LbbbtO9996rwYMHS5Jq1qypgIAAzZ8/X4WFhbrhhht07733qm/fvsUu6gIAs7MYp58wBQDwa1988YV69uypWrVqSZJ27NihkSNH6scff1TNmjV93B0AeB7nvAKAicydO1dff/21EhMTdeLECc2ePVsJCQkEVwDVBkdeAcBE0tPTNX36dO3YsUNBQUFKSEjQI488wqcJAKg2CK8AAAAwDS7YAgAAgGkQXgEAAGAahFcAAACYBuEVAAAApkF4BQAAgGkQXgEAAGAahFcAAACYBuEVAAAApvH/AYw8l4m0zwU5AAAAAElFTkSuQmCC","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["# Create the plot below\n","plt.figure(figsize=(8, 6))\n","sns.histplot(train_df['Age'], bins=20, color='green')\n","plt.title('Distribution of Age')\n","plt.xlabel('Age')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Fill Missing: Age Feature"]},{"cell_type":"code","execution_count":21,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:50.370496Z","iopub.status.busy":"2024-04-01T06:27:50.369419Z","iopub.status.idle":"2024-04-01T06:27:50.427731Z","shell.execute_reply":"2024-04-01T06:27:50.426655Z","shell.execute_reply.started":"2024-04-01T06:27:50.370387Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Survived
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked
\n","
\n"," \n"," \n","
\n","
5
\n","
6
\n","
0
\n","
3
\n","
Moran, Mr. James
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
330877
\n","
8.4583
\n","
NaN
\n","
Q
\n","
\n","
\n","
17
\n","
18
\n","
1
\n","
2
\n","
Williams, Mr. Charles Eugene
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
244373
\n","
13.0000
\n","
NaN
\n","
S
\n","
\n","
\n","
19
\n","
20
\n","
1
\n","
3
\n","
Masselmani, Mrs. Fatima
\n","
female
\n","
NaN
\n","
0
\n","
0
\n","
2649
\n","
7.2250
\n","
NaN
\n","
C
\n","
\n","
\n","
26
\n","
27
\n","
0
\n","
3
\n","
Emir, Mr. Farred Chehab
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
2631
\n","
7.2250
\n","
NaN
\n","
C
\n","
\n","
\n","
28
\n","
29
\n","
1
\n","
3
\n","
O'Dwyer, Miss. Ellen \"Nellie\"
\n","
female
\n","
NaN
\n","
0
\n","
0
\n","
330959
\n","
7.8792
\n","
NaN
\n","
Q
\n","
\n","
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
\n","
\n","
859
\n","
860
\n","
0
\n","
3
\n","
Razi, Mr. Raihed
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
2629
\n","
7.2292
\n","
NaN
\n","
C
\n","
\n","
\n","
863
\n","
864
\n","
0
\n","
3
\n","
Sage, Miss. Dorothy Edith \"Dolly\"
\n","
female
\n","
NaN
\n","
8
\n","
2
\n","
CA. 2343
\n","
69.5500
\n","
NaN
\n","
S
\n","
\n","
\n","
868
\n","
869
\n","
0
\n","
3
\n","
van Melkebeke, Mr. Philemon
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
345777
\n","
9.5000
\n","
NaN
\n","
S
\n","
\n","
\n","
878
\n","
879
\n","
0
\n","
3
\n","
Laleff, Mr. Kristo
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
349217
\n","
7.8958
\n","
NaN
\n","
S
\n","
\n","
\n","
888
\n","
889
\n","
0
\n","
3
\n","
Johnston, Miss. Catherine Helen \"Carrie\"
\n","
female
\n","
NaN
\n","
1
\n","
2
\n","
W./C. 6607
\n","
23.4500
\n","
NaN
\n","
S
\n","
\n"," \n","
\n","
177 rows × 12 columns
\n","
"],"text/plain":[" PassengerId Survived Pclass Name \\\n","5 6 0 3 Moran, Mr. James \n","17 18 1 2 Williams, Mr. Charles Eugene \n","19 20 1 3 Masselmani, Mrs. Fatima \n","26 27 0 3 Emir, Mr. Farred Chehab \n","28 29 1 3 O'Dwyer, Miss. Ellen \"Nellie\" \n",".. ... ... ... ... \n","859 860 0 3 Razi, Mr. Raihed \n","863 864 0 3 Sage, Miss. Dorothy Edith \"Dolly\" \n","868 869 0 3 van Melkebeke, Mr. Philemon \n","878 879 0 3 Laleff, Mr. Kristo \n","888 889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n","\n"," Sex Age SibSp Parch Ticket Fare Cabin Embarked \n","5 male NaN 0 0 330877 8.4583 NaN Q \n","17 male NaN 0 0 244373 13.0000 NaN S \n","19 female NaN 0 0 2649 7.2250 NaN C \n","26 male NaN 0 0 2631 7.2250 NaN C \n","28 female NaN 0 0 330959 7.8792 NaN Q \n",".. ... ... ... ... ... ... ... ... \n","859 male NaN 0 0 2629 7.2292 NaN C \n","863 female NaN 8 2 CA. 2343 69.5500 NaN S \n","868 male NaN 0 0 345777 9.5000 NaN S \n","878 male NaN 0 0 349217 7.8958 NaN S \n","888 female NaN 1 2 W./C. 
6607 23.4500 NaN S \n","\n","[177 rows x 12 columns]"]},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":["train_df[train_df[\"Age\"].isnull()]"]},{"cell_type":"markdown","metadata":{},"source":["### Try Checking for Null Values in Test Df"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[{"data":{"text/plain":["PassengerId 0\n","Pclass 0\n","Name 0\n","Sex 0\n","Age 86\n","SibSp 0\n","Parch 0\n","Ticket 0\n","Fare 1\n","Cabin 327\n","Embarked 0\n","dtype: int64"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["test_df.isnull().sum()"]},{"cell_type":"markdown","metadata":{},"source":["Run this to fix the Null Values"]},
{"cell_type":"code","execution_count":23,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:21:48.194895Z","iopub.status.busy":"2024-04-01T07:21:48.194020Z","iopub.status.idle":"2024-04-01T07:21:49.449282Z","shell.execute_reply":"2024-04-01T07:21:49.447918Z","shell.execute_reply.started":"2024-04-01T07:21:48.194825Z"},"trusted":true},"outputs":[],"source":["# Fill each missing Age with the median Age of passengers sharing the same\n","# SibSp, Parch and Pclass; if that group has no known Age, fall back to the\n","# overall median Age of the frame. The same procedure runs on both frames.\n","for frame in (train_df, test_df):\n","    # Index labels of the rows whose Age is missing.\n","    index_nan_age = list(frame.index[frame[\"Age\"].isnull()])\n","    for i in index_nan_age:\n","        same_group = (\n","            (frame[\"SibSp\"] == frame.loc[i, \"SibSp\"])\n","            & (frame[\"Parch\"] == frame.loc[i, \"Parch\"])\n","            & (frame[\"Pclass\"] == frame.loc[i, \"Pclass\"])\n","        )\n","        # median() skips NaN, so the row being filled does not affect the estimate.\n","        age_pred = frame.loc[same_group, \"Age\"].median()\n","        age_med = frame[\"Age\"].median()\n","        # Assign through .loc[row, col]: the chained form frame[\"Age\"].iloc[i] = ...\n","        # writes into an intermediate Series, which pandas warns about and which\n","        # may leave the frame unchanged. .loc also keeps the lookup label-based,\n","        # matching the labels collected in index_nan_age.\n","        frame.loc[i, \"Age\"] = age_pred if not np.isnan(age_pred) else age_med"]},{"cell_type":"markdown","metadata":{},"source":["## Analysing the correlation between the different columns"]},{"cell_type":"code","execution_count":24,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:24:33.644174Z","iopub.status.busy":"2024-04-01T07:24:33.643621Z","iopub.status.idle":"2024-04-01T07:24:34.404306Z","shell.execute_reply":"2024-04-01T07:24:34.402938Z","shell.execute_reply.started":"2024-04-01T07:24:33.643935Z"},"trusted":true},"outputs":[{"data":{"text/plain":[""]},"execution_count":24,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["numerical_columns = train_df.select_dtypes(include=[np.number]).columns\n","sns.heatmap(train_df[numerical_columns].corr(), annot=True)"]},{"cell_type":"markdown","metadata":{},"source":["We see that Fare and Parch are positively correlated with Survived. Similarly, Fare and Class are negatively correlated, in the sense that the higher the Fare, the lower the Class number (remember that Class 1 < Class 2 < Class 3 at face value)."]},{"cell_type":"markdown","metadata":{},"source":["## Embarked"]},{"cell_type":"code","execution_count":25,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.378151Z","iopub.status.busy":"2024-04-01T06:27:55.377756Z","iopub.status.idle":"2024-04-01T06:27:55.384785Z","shell.execute_reply":"2024-04-01T06:27:55.384101Z","shell.execute_reply.started":"2024-04-01T06:27:55.378107Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 S\n","1 C\n","2 S\n","3 S\n","4 S\n","Name: Embarked, dtype: object"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["train_df[\"Embarked\"].head()"]},{"cell_type":"code","execution_count":26,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.386265Z","iopub.status.busy":"2024-04-01T06:27:55.385875Z","iopub.status.idle":"2024-04-01T06:27:55.635178Z","shell.execute_reply":"2024-04-01T06:27:55.633609Z","shell.execute_reply.started":"2024-04-01T06:27:55.386223Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["sns.countplot(x = \"Embarked\", data = train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":27,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.638242Z","iopub.status.busy":"2024-04-01T06:27:55.637447Z","iopub.status.idle":"2024-04-01T06:27:55.699106Z","shell.execute_reply":"2024-04-01T06:27:55.698208Z","shell.execute_reply.started":"2024-04-01T06:27:55.638150Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Survived
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked_C
\n","
Embarked_Q
\n","
Embarked_S
\n","
\n"," \n"," \n","
\n","
0
\n","
1
\n","
0
\n","
3
\n","
Braund, Mr. Owen Harris
\n","
male
\n","
22.0
\n","
1
\n","
0
\n","
A/5 21171
\n","
7.2500
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
1
\n","
2
\n","
1
\n","
1
\n","
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n","
female
\n","
38.0
\n","
1
\n","
0
\n","
PC 17599
\n","
71.2833
\n","
C85
\n","
True
\n","
False
\n","
False
\n","
\n","
\n","
2
\n","
3
\n","
1
\n","
3
\n","
Heikkinen, Miss. Laina
\n","
female
\n","
26.0
\n","
0
\n","
0
\n","
STON/O2. 3101282
\n","
7.9250
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
3
\n","
4
\n","
1
\n","
1
\n","
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n","
female
\n","
35.0
\n","
1
\n","
0
\n","
113803
\n","
53.1000
\n","
C123
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
4
\n","
5
\n","
0
\n","
3
\n","
Allen, Mr. William Henry
\n","
male
\n","
35.0
\n","
0
\n","
0
\n","
373450
\n","
8.0500
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked_C Embarked_Q Embarked_S \n","0 0 A/5 21171 7.2500 NaN False False True \n","1 0 PC 17599 71.2833 C85 True False False \n","2 0 STON/O2. 3101282 7.9250 NaN False False True \n","3 0 113803 53.1000 C123 False False True \n","4 0 373450 8.0500 NaN False False True "]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["train_df = pd.get_dummies(train_df, columns=[\"Embarked\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":28,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked_C
\n","
Embarked_Q
\n","
Embarked_S
\n","
\n"," \n"," \n","
\n","
0
\n","
892
\n","
3
\n","
Kelly, Mr. James
\n","
male
\n","
34.5
\n","
0
\n","
0
\n","
330911
\n","
7.8292
\n","
NaN
\n","
False
\n","
True
\n","
False
\n","
\n","
\n","
1
\n","
893
\n","
3
\n","
Wilkes, Mrs. James (Ellen Needs)
\n","
female
\n","
47.0
\n","
1
\n","
0
\n","
363272
\n","
7.0000
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
2
\n","
894
\n","
2
\n","
Myles, Mr. Thomas Francis
\n","
male
\n","
62.0
\n","
0
\n","
0
\n","
240276
\n","
9.6875
\n","
NaN
\n","
False
\n","
True
\n","
False
\n","
\n","
\n","
3
\n","
895
\n","
3
\n","
Wirz, Mr. Albert
\n","
male
\n","
27.0
\n","
0
\n","
0
\n","
315154
\n","
8.6625
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
4
\n","
896
\n","
3
\n","
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
\n","
female
\n","
22.0
\n","
1
\n","
1
\n","
3101298
\n","
12.2875
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Pclass Name Sex \\\n","0 892 3 Kelly, Mr. James male \n","1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n","2 894 2 Myles, Mr. Thomas Francis male \n","3 895 3 Wirz, Mr. Albert male \n","4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n","\n"," Age SibSp Parch Ticket Fare Cabin Embarked_C Embarked_Q \\\n","0 34.5 0 0 330911 7.8292 NaN False True \n","1 47.0 1 0 363272 7.0000 NaN False False \n","2 62.0 0 0 240276 9.6875 NaN False True \n","3 27.0 0 0 315154 8.6625 NaN False False \n","4 22.0 1 1 3101298 12.2875 NaN False False \n","\n"," Embarked_S \n","0 False \n","1 True \n","2 False \n","3 True \n","4 True "]},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":["test_df = pd.get_dummies(test_df, columns=[\"Embarked\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Ticket (Assignment)"]},{"cell_type":"code","execution_count":29,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.700760Z","iopub.status.busy":"2024-04-01T06:27:55.700330Z","iopub.status.idle":"2024-04-01T06:27:55.708542Z","shell.execute_reply":"2024-04-01T06:27:55.707466Z","shell.execute_reply.started":"2024-04-01T06:27:55.700715Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 A/5 21171\n","1 PC 17599\n","2 STON/O2. 3101282\n","3 113803\n","4 373450\n","5 330877\n","6 17463\n","7 349909\n","8 347742\n","9 237736\n","10 PP 9549\n","11 113783\n","12 A/5. 
2151\n","13 347082\n","14 350406\n","15 248706\n","16 382652\n","17 244373\n","18 345763\n","19 2649\n","Name: Ticket, dtype: object"]},"execution_count":29,"metadata":{},"output_type":"execute_result"}],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":30,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.710291Z","iopub.status.busy":"2024-04-01T06:27:55.709980Z","iopub.status.idle":"2024-04-01T06:27:55.722810Z","shell.execute_reply":"2024-04-01T06:27:55.721839Z","shell.execute_reply.started":"2024-04-01T06:27:55.710231Z"},"trusted":true},"outputs":[{"data":{"text/plain":["'A5'"]},"execution_count":30,"metadata":{},"output_type":"execute_result"}],"source":["example_ticket = \"A/5. 2151\"\n","example_ticket.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0]"]},{"cell_type":"code","execution_count":31,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.726116Z","iopub.status.busy":"2024-04-01T06:27:55.725689Z","iopub.status.idle":"2024-04-01T06:27:55.738095Z","shell.execute_reply":"2024-04-01T06:27:55.737043Z","shell.execute_reply.started":"2024-04-01T06:27:55.726039Z"},"trusted":true},"outputs":[],"source":["tickets = []\n","for i in list(train_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","train_df[\"Ticket\"] = tickets\n","\n","# Do the same for the test set\n","tickets = []\n","for i in list(test_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","test_df[\"Ticket\"] = 
tickets"]},{"cell_type":"code","execution_count":32,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.740389Z","iopub.status.busy":"2024-04-01T06:27:55.739797Z","iopub.status.idle":"2024-04-01T06:27:55.755416Z","shell.execute_reply":"2024-04-01T06:27:55.754317Z","shell.execute_reply.started":"2024-04-01T06:27:55.740333Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 A5\n","1 PC\n","2 STONO2\n","3 x\n","4 x\n","5 x\n","6 x\n","7 x\n","8 x\n","9 x\n","10 PP\n","11 x\n","12 A5\n","13 x\n","14 x\n","15 x\n","16 x\n","17 x\n","18 x\n","19 x\n","Name: Ticket, dtype: object"]},"execution_count":32,"metadata":{},"output_type":"execute_result"}],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":33,"metadata":{},"outputs":[{"data":{"text/plain":["0 x\n","1 x\n","2 x\n","3 x\n","4 x\n","5 x\n","6 x\n","7 x\n","8 x\n","9 A4\n","10 x\n","11 x\n","12 x\n","13 x\n","14 WEP\n","15 SCPARIS\n","16 x\n","17 x\n","18 STONO2\n","19 x\n","Name: Ticket, dtype: object"]},"execution_count":33,"metadata":{},"output_type":"execute_result"}],"source":["test_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":34,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.790832Z","iopub.status.busy":"2024-04-01T06:27:55.790500Z","iopub.status.idle":"2024-04-01T06:27:55.841011Z","shell.execute_reply":"2024-04-01T06:27:55.839862Z","shell.execute_reply.started":"2024-04-01T06:27:55.790770Z"},"trusted":true},"outputs":[{"data":{"text/html":["
"],"text/plain":[" PassengerId Name Age SibSp \\\n","0 892 Kelly, Mr. James 34.5 0 \n","1 893 Wilkes, Mrs. James (Ellen Needs) 47.0 1 \n","2 894 Myles, Mr. Thomas Francis 62.0 0 \n","3 895 Wirz, Mr. Albert 27.0 0 \n","4 896 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 22.0 1 \n","\n"," Parch Fare Cabin Embarked_C Embarked_Q Embarked_S ... \\\n","0 0 7.8292 NaN False True False ... \n","1 0 7.0000 NaN False False True ... \n","2 0 9.6875 NaN False True False ... \n","3 0 8.6625 NaN False False True ... \n","4 1 12.2875 NaN False False True ... \n","\n"," TcktName_STONO2 TcktName_STONOQ TcktName_WC TcktName_WEP TcktName_x \\\n","0 False False False False True \n","1 False False False False True \n","2 False False False False True \n","3 False False False False True \n","4 False False False False True \n","\n"," Pclass_1 Pclass_2 Pclass_3 Sex_female Sex_male \n","0 False False True False True \n","1 False False True True False \n","2 False True False False True \n","3 False False True False True \n","4 False False True True False \n","\n","[5 rows x 43 columns]"]},"execution_count":40,"metadata":{},"output_type":"execute_result"}],"source":["test_df[\"Sex\"] = test_df[\"Sex\"].astype(\"category\")\n","test_df = pd.get_dummies(test_df, columns=[\"Sex\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Drop Passenger ID and Cabin (Assignment)"]},{"cell_type":"code","execution_count":41,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.207602Z","iopub.status.busy":"2024-04-01T06:27:56.207299Z","iopub.status.idle":"2024-04-01T06:27:56.215886Z","shell.execute_reply":"2024-04-01T06:27:56.214401Z","shell.execute_reply.started":"2024-04-01T06:27:56.207550Z"},"trusted":true},"outputs":[],"source":["train_df.drop(labels = [\"PassengerId\", \"Cabin\"], axis = 1, inplace = 
True)"]},{"cell_type":"code","execution_count":42,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.217917Z","iopub.status.busy":"2024-04-01T06:27:56.217536Z","iopub.status.idle":"2024-04-01T06:27:56.228150Z","shell.execute_reply":"2024-04-01T06:27:56.227230Z","shell.execute_reply.started":"2024-04-01T06:27:56.217854Z"},"trusted":true},"outputs":[{"data":{"text/plain":["Index(['Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C',\n"," 'Embarked_Q', 'Embarked_S', 'TcktName_A4', 'TcktName_A5', 'TcktName_AS',\n"," 'TcktName_C', 'TcktName_CA', 'TcktName_CASOTON', 'TcktName_FC',\n"," 'TcktName_FCC', 'TcktName_Fa', 'TcktName_LINE', 'TcktName_PC',\n"," 'TcktName_PP', 'TcktName_PPP', 'TcktName_SC', 'TcktName_SCA4',\n"," 'TcktName_SCAH', 'TcktName_SCOW', 'TcktName_SCPARIS',\n"," 'TcktName_SCParis', 'TcktName_SOC', 'TcktName_SOP', 'TcktName_SOPP',\n"," 'TcktName_SOTONO2', 'TcktName_SOTONOQ', 'TcktName_SP', 'TcktName_STONO',\n"," 'TcktName_STONO2', 'TcktName_SWPP', 'TcktName_WC', 'TcktName_WEP',\n"," 'TcktName_x', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',\n"," 'Sex_male'],\n"," dtype='object')"]},"execution_count":42,"metadata":{},"output_type":"execute_result"}],"source":["train_df.columns"]},{"cell_type":"code","execution_count":43,"metadata":{},"outputs":[],"source":["# Drop the PassengerId and Cabin columns from the test set\n","test_df.drop(labels=[\"PassengerId\", \"Cabin\"], axis=1, inplace=True)"]},{"cell_type":"code","execution_count":44,"metadata":{},"outputs":[{"data":{"text/plain":["Index(['Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q',\n"," 'Embarked_S', 'TcktName_A', 'TcktName_A4', 'TcktName_A5',\n"," 'TcktName_AQ3', 'TcktName_AQ4', 'TcktName_C', 'TcktName_CA',\n"," 'TcktName_FC', 'TcktName_FCC', 'TcktName_LP', 'TcktName_PC',\n"," 'TcktName_PP', 'TcktName_SC', 'TcktName_SCA3', 'TcktName_SCA4',\n"," 'TcktName_SCAH', 'TcktName_SCPARIS', 'TcktName_SCParis', 'TcktName_SOC',\n"," 'TcktName_SOPP', 
'TcktName_SOTONO2', 'TcktName_SOTONOQ',\n"," 'TcktName_STONO', 'TcktName_STONO2', 'TcktName_STONOQ', 'TcktName_WC',\n"," 'TcktName_WEP', 'TcktName_x', 'Pclass_1', 'Pclass_2', 'Pclass_3',\n"," 'Sex_female', 'Sex_male'],\n"," dtype='object')"]},"execution_count":44,"metadata":{},"output_type":"execute_result"}],"source":["# Print the columns of the test set\n","test_df.columns"]},{"cell_type":"markdown","metadata":{},"source":[" \n","# Modeling"]},{"cell_type":"code","execution_count":45,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.230086Z","iopub.status.busy":"2024-04-01T06:27:56.229809Z","iopub.status.idle":"2024-04-01T06:27:56.238557Z","shell.execute_reply":"2024-04-01T06:27:56.237679Z","shell.execute_reply.started":"2024-04-01T06:27:56.230040Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.svm import SVC\n","from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.metrics import accuracy_score"]},{"cell_type":"markdown","metadata":{},"source":["## Train - Test Split (Assignment)"]},{"cell_type":"code","execution_count":46,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.240103Z","iopub.status.busy":"2024-04-01T06:27:56.239830Z","iopub.status.idle":"2024-04-01T06:27:56.256809Z","shell.execute_reply":"2024-04-01T06:27:56.255463Z","shell.execute_reply.started":"2024-04-01T06:27:56.240056Z"},"trusted":true},"outputs":[{"data":{"text/plain":["891"]},"execution_count":46,"metadata":{},"output_type":"execute_result"}],"source":["train_df_len = 
len(train_df)\n","train_df_len"]},{"cell_type":"code","execution_count":48,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.314730Z","iopub.status.busy":"2024-04-01T06:27:56.313986Z","iopub.status.idle":"2024-04-01T06:27:56.333564Z","shell.execute_reply":"2024-04-01T06:27:56.332507Z","shell.execute_reply.started":"2024-04-01T06:27:56.314635Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["X_train 596\n","X_test 295\n","y_train 596\n","y_test 295\n","test 418\n"]}],"source":["\n","train = train_df[:train_df_len]\n","test = test_df\n","\n","# Select all numerical values from train and test\n","numeric_train = train.select_dtypes(include=[np.number])\n","numeric_test = test.select_dtypes(include=[np.number]) \n","\n","\n","X_train = numeric_train.drop(labels=[\"Survived\",], axis=1)\n","y_train = numeric_train[\"Survived\"]\n","\n","# Split the train data into train and test sets with a 1/3 ratio\n","X_train, X_test, y_train, y_test = train_test_split(numeric_train.drop(labels=[\"Survived\"], axis=1), numeric_train[\"Survived\"], test_size=0.33, random_state=42)\n","\n","\n","print(\"X_train\", len(X_train))\n","print(\"X_test\", len(X_test))\n","print(\"y_train\", len(y_train))\n","print(\"y_test\", len(y_test))\n","print(\"test\", len(numeric_test))\n"]},{"cell_type":"markdown","metadata":{},"source":["## Simple Logistic Regression (Assignment)"]},{"cell_type":"code","execution_count":49,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.335970Z","iopub.status.busy":"2024-04-01T06:27:56.335281Z","iopub.status.idle":"2024-04-01T06:27:56.368083Z","shell.execute_reply":"2024-04-01T06:27:56.366489Z","shell.execute_reply.started":"2024-04-01T06:27:56.335561Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Accuracy on the training set: 68.62%\n","Accuracy on the test set: 68.81%\n"]}],"source":["logreg = LogisticRegression()\n","logreg.fit(X_train, y_train)\n","acc_log_train = 
round(logreg.score(X_train, y_train)*100,2) \n","acc_log_test = round(logreg.score(X_test,y_test)*100,2)\n","# Print the accuracy on the training and test set\n","print(f\"Accuracy on the training set: {acc_log_train}%\")\n","print(f\"Accuracy on the test set: {acc_log_test}%\")"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Hyperparameter Tuning -- Grid Search -- Cross Validation\n","We will compare 5 ml classifier and evaluate mean accuracy of each of them by stratified cross validation.\n","\n","* Decision Tree\n","* SVM\n","* Random Forest\n","* KNN\n","* Logistic Regression"]},{"cell_type":"code","execution_count":50,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.371066Z","iopub.status.busy":"2024-04-01T06:27:56.370400Z","iopub.status.idle":"2024-04-01T06:27:56.401742Z","shell.execute_reply":"2024-04-01T06:27:56.396867Z","shell.execute_reply.started":"2024-04-01T06:27:56.370802Z"},"trusted":true},"outputs":[],"source":["random_state = 42\n","classifier = [DecisionTreeClassifier(random_state = random_state),\n"," SVC(random_state = random_state),\n"," RandomForestClassifier(random_state = random_state),\n"," # liblinear supports both the l1 and l2 penalties searched in logreg_param_grid;\n"," # the default lbfgs solver rejects l1, so half of that grid would fail to fit.\n"," LogisticRegression(random_state = random_state, solver = \"liblinear\"),\n"," KNeighborsClassifier()]\n","\n","dt_param_grid = {\"min_samples_split\" : range(10,500,20),\n"," \"max_depth\": range(1,20,2)}\n","\n","svc_param_grid = {\"kernel\" : [\"rbf\"],\n"," \"gamma\": [0.001, 0.01, 0.1, 1],\n"," \"C\": [1,10,50,100,200,300,1000]}\n","\n","rf_param_grid = {\"max_features\": [1,3,10],\n"," \"min_samples_split\":[2,3,10],\n"," \"min_samples_leaf\":[1,3,10],\n"," \"bootstrap\":[False],\n"," \"n_estimators\":[100,300],\n"," \"criterion\":[\"gini\"]}\n","\n","logreg_param_grid = {\"C\":np.logspace(-3,3,7),\n"," \"penalty\": [\"l1\",\"l2\"]}\n","\n","knn_param_grid = {\"n_neighbors\": np.linspace(1,19,10, dtype = int).tolist(),\n"," \"weights\": [\"uniform\",\"distance\"],\n"," \"metric\":[\"euclidean\",\"manhattan\"]}\n","classifier_param = 
[dt_param_grid,\n"," svc_param_grid,\n"," rf_param_grid,\n"," logreg_param_grid,\n"," knn_param_grid]"]},{"cell_type":"code","execution_count":51,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.413811Z","iopub.status.busy":"2024-04-01T06:27:56.404322Z","iopub.status.idle":"2024-04-01T06:29:38.718970Z","shell.execute_reply":"2024-04-01T06:29:38.717807Z","shell.execute_reply.started":"2024-04-01T06:27:56.413658Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 10 folds for each of 250 candidates, totalling 2500 fits\n"]},{"name":"stderr","output_type":"stream","text":["/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import 
(\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n","/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n"]},{"name":"stdout","output_type":"stream","text":["0.6996045197740112\n","Fitting 10 folds for each of 28 candidates, totalling 280 fits\n","0.7130508474576271\n","Fitting 10 folds for each of 54 candidates, totalling 540 fits\n","0.7081073446327684\n","Fitting 10 folds for each of 14 candidates, totalling 140 fits\n","0.6777683615819209\n","Fitting 10 folds for each of 40 candidates, totalling 400 fits\n","0.6979943502824858\n"]}],"source":["cv_result = []\n","best_estimators = []\n","for i in range(len(classifier)):\n"," clf = GridSearchCV(classifier[i], param_grid=classifier_param[i], cv = StratifiedKFold(n_splits = 10), scoring = \"accuracy\", n_jobs = -1,verbose = 1)\n"," clf.fit(X_train,y_train)\n"," cv_result.append(clf.best_score_)\n"," best_estimators.append(clf.best_estimator_)\n"," 
print(cv_result[i])"]},{"cell_type":"code","execution_count":52,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:38.722928Z","iopub.status.busy":"2024-04-01T06:29:38.722207Z","iopub.status.idle":"2024-04-01T06:29:39.075423Z","shell.execute_reply":"2024-04-01T06:29:39.073987Z","shell.execute_reply.started":"2024-04-01T06:29:38.722582Z"},"trusted":true},"outputs":[{"data":{"text/plain":["Text(0.5, 1.0, 'Cross Validation Scores')"]},"execution_count":52,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["cv_results = pd.DataFrame({\"Cross Validation Means\":cv_result, \"ML Models\":[\"DecisionTreeClassifier\", \"SVM\",\"RandomForestClassifier\",\n"," \"LogisticRegression\",\n"," \"KNeighborsClassifier\"]})\n","\n","g = sns.barplot(x=\"Cross Validation Means\",y= \"ML Models\", data=cv_results)\n","g.set_xlabel(\"Mean Accuracy\")\n","g.set_title(\"Cross Validation Scores\")"]},{"cell_type":"markdown","metadata":{},"source":["## Ensemble Modeling (Assignment)"]},{"cell_type":"code","execution_count":53,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.078654Z","iopub.status.busy":"2024-04-01T06:29:39.077840Z","iopub.status.idle":"2024-04-01T06:29:39.862871Z","shell.execute_reply":"2024-04-01T06:29:39.860937Z","shell.execute_reply.started":"2024-04-01T06:29:39.078554Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["Accuracy of the voting classifier on the test set: 70.85%\n"]}],"source":["votingC = VotingClassifier(estimators = [(\"dt\",best_estimators[0]),\n"," (\"rfc\",best_estimators[2]),\n"," (\"lr\",best_estimators[3])],\n"," voting = \"soft\", n_jobs = -1)\n","votingC = votingC.fit(X_train, y_train)\n","\n","# Print the accuracy score of the voting classifier\n","acc_votingC = round(votingC.score(X_test, y_test) * 100, 2)\n","print(f\"Accuracy of the voting classifier on the test set: {acc_votingC}%\")"]},{"cell_type":"code","execution_count":56,"metadata":{},"outputs":[],"source":["# Drop the null values which are going to cause you an error in the next cell\n","# Drop rows with missing values in numeric_test\n","numeric_test_dropna = numeric_test.dropna()"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Prediction and 
Submission"]},{"cell_type":"code","execution_count":57,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.865981Z","iopub.status.busy":"2024-04-01T06:29:39.865330Z","iopub.status.idle":"2024-04-01T06:29:39.977357Z","shell.execute_reply":"2024-04-01T06:29:39.973301Z","shell.execute_reply.started":"2024-04-01T06:29:39.865906Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":[" PassengerId Survived\n","0 892 0.0\n","1 893 0.0\n","2 894 0.0\n","3 895 0.0\n","4 896 0.0\n",".. ... ...\n","413 1305 1.0\n","414 1306 0.0\n","415 1307 0.0\n","416 1308 0.0\n","417 1309 NaN\n","\n","[418 rows x 2 columns]\n"]}],"source":["# Fill the remaining missing numeric values with the training medians instead of dropping rows:\n","# dropping a row shifts every later prediction onto the wrong PassengerId and leaves one passenger without a prediction.\n","numeric_test_filled = numeric_test.fillna(X_train.median())\n","test_survived = pd.Series(votingC.predict(numeric_test_filled), index=numeric_test.index, name=\"Survived\").astype(int)\n","results = pd.concat([test_PassengerId, test_survived], axis=1)\n","results.to_csv(\"titanic.csv\", index=False)\n","print(results)"]},{"cell_type":"markdown","metadata":{},"source":["# Congratulations on finishing the assignment!!\n","\n","### The submission is the titanic.csv which was just created, and this file which you have modified."]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"databundleVersionId":26502,"sourceId":3136,"sourceType":"competition"}],"dockerImageVersionId":29852,"isGpuEnabled":false,"isInternetEnabled":false,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.4"}},"nbformat":4,"nbformat_minor":4}
+{"cells":[{"cell_type":"markdown","metadata":{},"source":[" \n","# Ignore this"]},{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/fahee/anaconda3/lib/python3.11/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).\n"," from pandas.core import (\n"]}],"source":["import numpy as np # linear algebra\n","import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n","import matplotlib.pyplot as plt\n","plt.style.use(\"seaborn-v0_8-whitegrid\")\n","\n","import seaborn as sns\n","\n","from collections import Counter\n","\n","import warnings\n","warnings.filterwarnings(\"ignore\")"]},{"cell_type":"markdown","metadata":{},"source":[" \n","# Load and Check Data"]},{"cell_type":"markdown","metadata":{},"source":["DataFrames hold the dataset in a tabular format for easy manipulation and analysis. \n","CSV data is read into 'df' using Pandas' read_csv() function."]},{"cell_type":"code","execution_count":2,"metadata":{"_kg_hide-input":true,"execution":{"iopub.execute_input":"2024-04-01T06:45:27.416192Z","iopub.status.busy":"2024-04-01T06:45:27.415763Z","iopub.status.idle":"2024-04-01T06:45:27.433162Z","shell.execute_reply":"2024-04-01T06:45:27.431944Z","shell.execute_reply.started":"2024-04-01T06:45:27.416105Z"},"trusted":true},"outputs":[],"source":["train_df = pd.read_csv(\"./data/train.csv\")"]},{"cell_type":"markdown","metadata":{},"source":["### 1. 
Try to read the test .csv file into test_df"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.436611Z","iopub.status.busy":"2024-04-01T06:45:27.435916Z","iopub.status.idle":"2024-04-01T06:45:27.449974Z","shell.execute_reply":"2024-04-01T06:45:27.448230Z","shell.execute_reply.started":"2024-04-01T06:45:27.436517Z"},"trusted":true},"outputs":[],"source":["test_df = pd.read_csv(\"./data/test.csv\")\n","test_PassengerId = test_df[\"PassengerId\"]"]},{"cell_type":"code","execution_count":4,"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","execution":{"iopub.execute_input":"2024-04-01T06:45:27.452397Z","iopub.status.busy":"2024-04-01T06:45:27.451949Z","iopub.status.idle":"2024-04-01T06:45:27.462622Z","shell.execute_reply":"2024-04-01T06:45:27.461859Z","shell.execute_reply.started":"2024-04-01T06:45:27.452348Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["The Columns of train_df are: \n"]},{"data":{"text/plain":["Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n"," 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],\n"," dtype='object')"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["print(\"The Columns of train_df are: \")\n","train_df.columns"]},{"cell_type":"markdown","metadata":{},"source":["### We can use head() to see the first few rows in the dataframe"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.464289Z","iopub.status.busy":"2024-04-01T06:45:27.463866Z","iopub.status.idle":"2024-04-01T06:45:27.491984Z","shell.execute_reply":"2024-04-01T06:45:27.491110Z","shell.execute_reply.started":"2024-04-01T06:45:27.464242Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Survived
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked
\n","
\n"," \n"," \n","
\n","
0
\n","
1
\n","
0
\n","
3
\n","
Braund, Mr. Owen Harris
\n","
male
\n","
22.0
\n","
1
\n","
0
\n","
A/5 21171
\n","
7.2500
\n","
NaN
\n","
S
\n","
\n","
\n","
1
\n","
2
\n","
1
\n","
1
\n","
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n","
female
\n","
38.0
\n","
1
\n","
0
\n","
PC 17599
\n","
71.2833
\n","
C85
\n","
C
\n","
\n","
\n","
2
\n","
3
\n","
1
\n","
3
\n","
Heikkinen, Miss. Laina
\n","
female
\n","
26.0
\n","
0
\n","
0
\n","
STON/O2. 3101282
\n","
7.9250
\n","
NaN
\n","
S
\n","
\n","
\n","
3
\n","
4
\n","
1
\n","
1
\n","
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n","
female
\n","
35.0
\n","
1
\n","
0
\n","
113803
\n","
53.1000
\n","
C123
\n","
S
\n","
\n","
\n","
4
\n","
5
\n","
0
\n","
3
\n","
Allen, Mr. William Henry
\n","
male
\n","
35.0
\n","
0
\n","
0
\n","
373450
\n","
8.0500
\n","
NaN
\n","
S
\n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked \n","0 0 A/5 21171 7.2500 NaN S \n","1 0 PC 17599 71.2833 C85 C \n","2 0 STON/O2. 3101282 7.9250 NaN S \n","3 0 113803 53.1000 C123 S \n","4 0 373450 8.0500 NaN S "]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["train_df.head()"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.494326Z","iopub.status.busy":"2024-04-01T06:45:27.493637Z","iopub.status.idle":"2024-04-01T06:45:27.541999Z","shell.execute_reply":"2024-04-01T06:45:27.541210Z","shell.execute_reply.started":"2024-04-01T06:45:27.494251Z"},"jupyter":{"source_hidden":true},"trusted":true},"outputs":[{"data":{"text/html":["
"],"text/plain":[" PassengerId Pclass Name Sex \\\n","0 892 3 Kelly, Mr. James male \n","1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n","2 894 2 Myles, Mr. Thomas Francis male \n","3 895 3 Wirz, Mr. Albert male \n","4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n","\n"," Age SibSp Parch Ticket Fare Cabin Embarked \n","0 34.5 0 0 330911 7.8292 NaN Q \n","1 47.0 1 0 363272 7.0000 NaN S \n","2 62.0 0 0 240276 9.6875 NaN Q \n","3 27.0 0 0 315154 8.6625 NaN S \n","4 22.0 1 1 3101298 12.2875 NaN S "]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["### 3. Now try checking for a description of test_df's data"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"data":{"text/html":["
Embarked: port where passenger embarked ( C = Cherbourg, Q = Queenstown, S = Southampton )
\n","\n"]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.544377Z","iopub.status.busy":"2024-04-01T06:45:27.543901Z","iopub.status.idle":"2024-04-01T06:45:27.557229Z","shell.execute_reply":"2024-04-01T06:45:27.555972Z","shell.execute_reply.started":"2024-04-01T06:45:27.544320Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","RangeIndex: 891 entries, 0 to 890\n","Data columns (total 12 columns):\n"," # Column Non-Null Count Dtype \n","--- ------ -------------- ----- \n"," 0 PassengerId 891 non-null int64 \n"," 1 Survived 891 non-null int64 \n"," 2 Pclass 891 non-null int64 \n"," 3 Name 891 non-null object \n"," 4 Sex 891 non-null object \n"," 5 Age 714 non-null float64\n"," 6 SibSp 891 non-null int64 \n"," 7 Parch 891 non-null int64 \n"," 8 Ticket 891 non-null object \n"," 9 Fare 891 non-null float64\n"," 10 Cabin 204 non-null object \n"," 11 Embarked 889 non-null object \n","dtypes: float64(2), int64(5), object(5)\n","memory usage: 83.7+ KB\n"]}],"source":["train_df.info()"]},{"cell_type":"markdown","metadata":{},"source":["### Slice Rows and Columns of DF (Assignment)"]},{"cell_type":"code","execution_count":10,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:53:12.214069Z","iopub.status.busy":"2024-04-01T06:53:12.213708Z","iopub.status.idle":"2024-04-01T06:53:12.223150Z","shell.execute_reply":"2024-04-01T06:53:12.222195Z","shell.execute_reply.started":"2024-04-01T06:53:12.214014Z"},"trusted":true},"outputs":[{"data":{"text/plain":["PassengerId 3\n","Survived 1\n","Pclass 3\n","Name Heikkinen, Miss. Laina\n","Sex female\n","Age 26.0\n","SibSp 0\n","Parch 0\n","Ticket STON/O2. 
3101282\n","Fare 7.925\n","Cabin NaN\n","Embarked S\n","Name: 2, dtype: object"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}],"source":["# Printing the third row (iloc positions are zero-based, so position 2)\n","train_df.iloc[2]"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"data":{"text/plain":["PassengerId 6\n","Survived 0\n","Pclass 3\n","Name Moran, Mr. James\n","Sex male\n","Age NaN\n","SibSp 0\n","Parch 0\n","Ticket 330877\n","Fare 8.4583\n","Cabin NaN\n","Embarked Q\n","Name: 5, dtype: object"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["# Print the row at position 5 (the sixth row; the 5th row would be iloc[4])\n","train_df.iloc[5]"]},{"cell_type":"code","execution_count":12,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:14.398373Z","iopub.status.busy":"2024-04-01T06:54:14.398006Z","iopub.status.idle":"2024-04-01T06:54:14.407886Z","shell.execute_reply":"2024-04-01T06:54:14.406590Z","shell.execute_reply.started":"2024-04-01T06:54:14.398326Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 male\n","1 female\n","2 female\n","3 female\n","4 male\n"," ... \n","886 male\n","887 female\n","888 female\n","889 male\n","890 male\n","Name: Sex, Length: 891, dtype: object"]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["# Print the Sex Column\n","train_df['Sex']"]},{"cell_type":"code","execution_count":13,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:24.550687Z","iopub.status.busy":"2024-04-01T06:54:24.550286Z","iopub.status.idle":"2024-04-01T06:54:24.555255Z","shell.execute_reply":"2024-04-01T06:54:24.553923Z","shell.execute_reply.started":"2024-04-01T06:54:24.550616Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 Braund, Mr. Owen Harris\n","1 Cumings, Mrs. John Bradley (Florence Briggs Th...\n","2 Heikkinen, Miss. Laina\n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel)\n","4 Allen, Mr. William Henry\n"," ... \n","886 Montvila, Rev. Juozas\n","887 Graham, Miss. Margaret Edith\n","888 Johnston, Miss. 
Catherine Helen \"Carrie\"\n","889 Behr, Mr. Karl Howell\n","890 Dooley, Mr. Patrick\n","Name: Name, Length: 891, dtype: object"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["# Print the Name Column\n","train_df['Name']"]},{"cell_type":"markdown","metadata":{},"source":["## Visualization (Assignment)"]},{"cell_type":"markdown","metadata":{},"source":["### Age -- Survived"]},{"cell_type":"code","execution_count":14,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:13:34.450088Z","iopub.status.busy":"2024-04-01T07:13:34.449302Z","iopub.status.idle":"2024-04-01T07:13:34.932717Z","shell.execute_reply":"2024-04-01T07:13:34.930449Z","shell.execute_reply.started":"2024-04-01T07:13:34.450021Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","# Plot 1: Survivors vs Non Survivors\n","\n","# Creating a plot for the Survived Column\n","sns.countplot(x='Survived', data=train_df)\n","\n","plt.title('Survivors vs Non Survivors')\n","plt.xlabel('Survived')\n","plt.ylabel('Count')\n","plt.xticks([0, 1], ['No', 'Yes']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Plotting Passenger Class"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:14:31.768779Z","iopub.status.busy":"2024-04-01T07:14:31.768341Z","iopub.status.idle":"2024-04-01T07:14:32.062495Z","shell.execute_reply":"2024-04-01T07:14:32.060660Z","shell.execute_reply.started":"2024-04-01T07:14:31.768690Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","\n","# Make the plot for Pclass here:\n","sns.countplot(x='Pclass', data=train_df)\n","\n","plt.title('Count of Passengers In each Passenger Class')\n","plt.xlabel('Passenger Class')\n","plt.ylabel('Count')\n","plt.xticks([0, 1, 2], ['1st', '2nd', '3rd']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try it for \"Embarked\""]},{"cell_type":"code","execution_count":16,"metadata":{"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","sns.countplot(x='Embarked', data=train_df)\n","plt.title('Count of Passengers by Embarkation Point')\n","plt.xlabel('Embarkation Point')\n","plt.ylabel('Count')\n","plt.xticks([0, 1, 2], ['C', 'Q', 'S'])\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Making a histogram for \"Fare\""]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["plt.figure(figsize=(8, 6))\n","sns.histplot(train_df['Fare'], bins=20, color='orange')\n","plt.title('Distribution of Passenger Fares')\n","plt.xlabel('Fare')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Here is the distplot for \"Fare\", refer to it after you tried it yourself:"]},{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:18:24.402882Z","iopub.status.busy":"2024-04-01T07:18:24.402274Z","iopub.status.idle":"2024-04-01T07:18:24.798062Z","shell.execute_reply":"2024-04-01T07:18:24.796669Z","shell.execute_reply.started":"2024-04-01T07:18:24.402828Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["sns.histplot(train_df['Fare'], bins=20, color='orange')\n","plt.title('Distribution of Passenger Fares')\n","plt.xlabel('Fare')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Make a histogram for \"Age\" (Assignment)"]},{"cell_type":"code","execution_count":19,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:19:53.874413Z","iopub.status.busy":"2024-04-01T07:19:53.873686Z","iopub.status.idle":"2024-04-01T07:19:54.244996Z","shell.execute_reply":"2024-04-01T07:19:54.243521Z","shell.execute_reply.started":"2024-04-01T07:19:53.874351Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["# Create the plot below\n","plt.figure(figsize=(8, 6))\n","sns.histplot(train_df['Age'], bins=20, color='green')\n","plt.title('Distribution of Age')\n","plt.xlabel('Age')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":[" \n","## Fill Missing: Age Feature"]},{"cell_type":"code","execution_count":20,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:50.370496Z","iopub.status.busy":"2024-04-01T06:27:50.369419Z","iopub.status.idle":"2024-04-01T06:27:50.427731Z","shell.execute_reply":"2024-04-01T06:27:50.426655Z","shell.execute_reply.started":"2024-04-01T06:27:50.370387Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Survived
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked
\n","
\n"," \n"," \n","
\n","
5
\n","
6
\n","
0
\n","
3
\n","
Moran, Mr. James
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
330877
\n","
8.4583
\n","
NaN
\n","
Q
\n","
\n","
\n","
17
\n","
18
\n","
1
\n","
2
\n","
Williams, Mr. Charles Eugene
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
244373
\n","
13.0000
\n","
NaN
\n","
S
\n","
\n","
\n","
19
\n","
20
\n","
1
\n","
3
\n","
Masselmani, Mrs. Fatima
\n","
female
\n","
NaN
\n","
0
\n","
0
\n","
2649
\n","
7.2250
\n","
NaN
\n","
C
\n","
\n","
\n","
26
\n","
27
\n","
0
\n","
3
\n","
Emir, Mr. Farred Chehab
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
2631
\n","
7.2250
\n","
NaN
\n","
C
\n","
\n","
\n","
28
\n","
29
\n","
1
\n","
3
\n","
O'Dwyer, Miss. Ellen \"Nellie\"
\n","
female
\n","
NaN
\n","
0
\n","
0
\n","
330959
\n","
7.8792
\n","
NaN
\n","
Q
\n","
\n","
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
...
\n","
\n","
\n","
859
\n","
860
\n","
0
\n","
3
\n","
Razi, Mr. Raihed
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
2629
\n","
7.2292
\n","
NaN
\n","
C
\n","
\n","
\n","
863
\n","
864
\n","
0
\n","
3
\n","
Sage, Miss. Dorothy Edith \"Dolly\"
\n","
female
\n","
NaN
\n","
8
\n","
2
\n","
CA. 2343
\n","
69.5500
\n","
NaN
\n","
S
\n","
\n","
\n","
868
\n","
869
\n","
0
\n","
3
\n","
van Melkebeke, Mr. Philemon
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
345777
\n","
9.5000
\n","
NaN
\n","
S
\n","
\n","
\n","
878
\n","
879
\n","
0
\n","
3
\n","
Laleff, Mr. Kristo
\n","
male
\n","
NaN
\n","
0
\n","
0
\n","
349217
\n","
7.8958
\n","
NaN
\n","
S
\n","
\n","
\n","
888
\n","
889
\n","
0
\n","
3
\n","
Johnston, Miss. Catherine Helen \"Carrie\"
\n","
female
\n","
NaN
\n","
1
\n","
2
\n","
W./C. 6607
\n","
23.4500
\n","
NaN
\n","
S
\n","
\n"," \n","
\n","
177 rows × 12 columns
\n","
"],"text/plain":[" PassengerId Survived Pclass Name \\\n","5 6 0 3 Moran, Mr. James \n","17 18 1 2 Williams, Mr. Charles Eugene \n","19 20 1 3 Masselmani, Mrs. Fatima \n","26 27 0 3 Emir, Mr. Farred Chehab \n","28 29 1 3 O'Dwyer, Miss. Ellen \"Nellie\" \n",".. ... ... ... ... \n","859 860 0 3 Razi, Mr. Raihed \n","863 864 0 3 Sage, Miss. Dorothy Edith \"Dolly\" \n","868 869 0 3 van Melkebeke, Mr. Philemon \n","878 879 0 3 Laleff, Mr. Kristo \n","888 889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n","\n"," Sex Age SibSp Parch Ticket Fare Cabin Embarked \n","5 male NaN 0 0 330877 8.4583 NaN Q \n","17 male NaN 0 0 244373 13.0000 NaN S \n","19 female NaN 0 0 2649 7.2250 NaN C \n","26 male NaN 0 0 2631 7.2250 NaN C \n","28 female NaN 0 0 330959 7.8792 NaN Q \n",".. ... ... ... ... ... ... ... ... \n","859 male NaN 0 0 2629 7.2292 NaN C \n","863 female NaN 8 2 CA. 2343 69.5500 NaN S \n","868 male NaN 0 0 345777 9.5000 NaN S \n","878 male NaN 0 0 349217 7.8958 NaN S \n","888 female NaN 1 2 W./C. 
6607 23.4500 NaN S \n","\n","[177 rows x 12 columns]"]},"execution_count":20,"metadata":{},"output_type":"execute_result"}],"source":["train_df[train_df[\"Age\"].isnull()]"]},{"cell_type":"markdown","metadata":{},"source":["### Try Checking for Null Values in Test Df"]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[{"data":{"text/plain":["PassengerId 0\n","Pclass 0\n","Name 0\n","Sex 0\n","Age 86\n","SibSp 0\n","Parch 0\n","Ticket 0\n","Fare 1\n","Cabin 327\n","Embarked 0\n","dtype: int64"]},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":["test_df.isnull().sum()"]},{"cell_type":"markdown","metadata":{},"source":["Run this to fix the Null Values"]},{"cell_type":"code","execution_count":22,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:21:48.194895Z","iopub.status.busy":"2024-04-01T07:21:48.194020Z","iopub.status.idle":"2024-04-01T07:21:49.449282Z","shell.execute_reply":"2024-04-01T07:21:49.447918Z","shell.execute_reply.started":"2024-04-01T07:21:48.194825Z"},"trusted":true},"outputs":[],"source":["# Fill missing ages in both frames: use the median Age of passengers with the\n","# same SibSp/Parch/Pclass, falling back to that frame's overall median Age\n","# when no passenger in the group has a known age.\n","for frame in (train_df, test_df):\n","    index_nan_age = list(frame.index[frame[\"Age\"].isnull()])\n","    for i in index_nan_age:\n","        # i is an index label, so look the row up with .loc rather than .iloc.\n","        same_group = (\n","            (frame[\"SibSp\"] == frame.loc[i, \"SibSp\"])\n","            & (frame[\"Parch\"] == frame.loc[i, \"Parch\"])\n","            & (frame[\"Pclass\"] == frame.loc[i, \"Pclass\"])\n","        )\n","        age_pred = frame.loc[same_group, \"Age\"].median()\n","        age_med = frame[\"Age\"].median()\n","        # Write through frame.loc: the chained frame[\"Age\"].iloc[i] = ... form\n","        # assigns to an intermediate Series and does not update the frame when\n","        # pandas copy-on-write is enabled.\n",
"        frame.loc[i, \"Age\"] = age_med if np.isnan(age_pred) else age_pred"]},{"cell_type":"markdown","metadata":{},"source":["## Analysing the correlation between the different columns"]},{"cell_type":"code","execution_count":23,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:24:33.644174Z","iopub.status.busy":"2024-04-01T07:24:33.643621Z","iopub.status.idle":"2024-04-01T07:24:34.404306Z","shell.execute_reply":"2024-04-01T07:24:34.402938Z","shell.execute_reply.started":"2024-04-01T07:24:33.643935Z"},"trusted":true},"outputs":[{"data":{"text/plain":[""]},"execution_count":23,"metadata":{},"output_type":"execute_result"},{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["numerical_columns = train_df.select_dtypes(include=[np.number]).columns\n","sns.heatmap(train_df[numerical_columns].corr(), annot=True)"]},{"cell_type":"markdown","metadata":{},"source":["We see that Fare and Parch are positively correlated with Survived. Similarly, Fare and Class are negatively correlated, in the sense that the higher the higher the Fare, the lower the Class number (Remember that Class 1 < Class 2 < Class 3 in face value)."]},{"cell_type":"markdown","metadata":{},"source":["## Embarked"]},{"cell_type":"code","execution_count":24,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.378151Z","iopub.status.busy":"2024-04-01T06:27:55.377756Z","iopub.status.idle":"2024-04-01T06:27:55.384785Z","shell.execute_reply":"2024-04-01T06:27:55.384101Z","shell.execute_reply.started":"2024-04-01T06:27:55.378107Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 S\n","1 C\n","2 S\n","3 S\n","4 S\n","Name: Embarked, dtype: object"]},"execution_count":24,"metadata":{},"output_type":"execute_result"}],"source":["train_df[\"Embarked\"].head()"]},{"cell_type":"code","execution_count":25,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.386265Z","iopub.status.busy":"2024-04-01T06:27:55.385875Z","iopub.status.idle":"2024-04-01T06:27:55.635178Z","shell.execute_reply":"2024-04-01T06:27:55.633609Z","shell.execute_reply.started":"2024-04-01T06:27:55.386223Z"},"trusted":true},"outputs":[{"data":{"image/png":"","text/plain":["
"]},"metadata":{},"output_type":"display_data"}],"source":["sns.countplot(x = \"Embarked\", data = train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":26,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.638242Z","iopub.status.busy":"2024-04-01T06:27:55.637447Z","iopub.status.idle":"2024-04-01T06:27:55.699106Z","shell.execute_reply":"2024-04-01T06:27:55.698208Z","shell.execute_reply.started":"2024-04-01T06:27:55.638150Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Survived
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked_C
\n","
Embarked_Q
\n","
Embarked_S
\n","
\n"," \n"," \n","
\n","
0
\n","
1
\n","
0
\n","
3
\n","
Braund, Mr. Owen Harris
\n","
male
\n","
22.0
\n","
1
\n","
0
\n","
A/5 21171
\n","
7.2500
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
1
\n","
2
\n","
1
\n","
1
\n","
Cumings, Mrs. John Bradley (Florence Briggs Th...
\n","
female
\n","
38.0
\n","
1
\n","
0
\n","
PC 17599
\n","
71.2833
\n","
C85
\n","
True
\n","
False
\n","
False
\n","
\n","
\n","
2
\n","
3
\n","
1
\n","
3
\n","
Heikkinen, Miss. Laina
\n","
female
\n","
26.0
\n","
0
\n","
0
\n","
STON/O2. 3101282
\n","
7.9250
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
3
\n","
4
\n","
1
\n","
1
\n","
Futrelle, Mrs. Jacques Heath (Lily May Peel)
\n","
female
\n","
35.0
\n","
1
\n","
0
\n","
113803
\n","
53.1000
\n","
C123
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
4
\n","
5
\n","
0
\n","
3
\n","
Allen, Mr. William Henry
\n","
male
\n","
35.0
\n","
0
\n","
0
\n","
373450
\n","
8.0500
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Survived Pclass \\\n","0 1 0 3 \n","1 2 1 1 \n","2 3 1 3 \n","3 4 1 1 \n","4 5 0 3 \n","\n"," Name Sex Age SibSp \\\n","0 Braund, Mr. Owen Harris male 22.0 1 \n","1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n","2 Heikkinen, Miss. Laina female 26.0 0 \n","3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n","4 Allen, Mr. William Henry male 35.0 0 \n","\n"," Parch Ticket Fare Cabin Embarked_C Embarked_Q Embarked_S \n","0 0 A/5 21171 7.2500 NaN False False True \n","1 0 PC 17599 71.2833 C85 True False False \n","2 0 STON/O2. 3101282 7.9250 NaN False False True \n","3 0 113803 53.1000 C123 False False True \n","4 0 373450 8.0500 NaN False False True "]},"execution_count":26,"metadata":{},"output_type":"execute_result"}],"source":["train_df = pd.get_dummies(train_df, columns=[\"Embarked\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":27,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n","
\n","
\n","
PassengerId
\n","
Pclass
\n","
Name
\n","
Sex
\n","
Age
\n","
SibSp
\n","
Parch
\n","
Ticket
\n","
Fare
\n","
Cabin
\n","
Embarked_C
\n","
Embarked_Q
\n","
Embarked_S
\n","
\n"," \n"," \n","
\n","
0
\n","
892
\n","
3
\n","
Kelly, Mr. James
\n","
male
\n","
34.5
\n","
0
\n","
0
\n","
330911
\n","
7.8292
\n","
NaN
\n","
False
\n","
True
\n","
False
\n","
\n","
\n","
1
\n","
893
\n","
3
\n","
Wilkes, Mrs. James (Ellen Needs)
\n","
female
\n","
47.0
\n","
1
\n","
0
\n","
363272
\n","
7.0000
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
2
\n","
894
\n","
2
\n","
Myles, Mr. Thomas Francis
\n","
male
\n","
62.0
\n","
0
\n","
0
\n","
240276
\n","
9.6875
\n","
NaN
\n","
False
\n","
True
\n","
False
\n","
\n","
\n","
3
\n","
895
\n","
3
\n","
Wirz, Mr. Albert
\n","
male
\n","
27.0
\n","
0
\n","
0
\n","
315154
\n","
8.6625
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n","
\n","
4
\n","
896
\n","
3
\n","
Hirvonen, Mrs. Alexander (Helga E Lindqvist)
\n","
female
\n","
22.0
\n","
1
\n","
1
\n","
3101298
\n","
12.2875
\n","
NaN
\n","
False
\n","
False
\n","
True
\n","
\n"," \n","
\n","
"],"text/plain":[" PassengerId Pclass Name Sex \\\n","0 892 3 Kelly, Mr. James male \n","1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n","2 894 2 Myles, Mr. Thomas Francis male \n","3 895 3 Wirz, Mr. Albert male \n","4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n","\n"," Age SibSp Parch Ticket Fare Cabin Embarked_C Embarked_Q \\\n","0 34.5 0 0 330911 7.8292 NaN False True \n","1 47.0 1 0 363272 7.0000 NaN False False \n","2 62.0 0 0 240276 9.6875 NaN False True \n","3 27.0 0 0 315154 8.6625 NaN False False \n","4 22.0 1 1 3101298 12.2875 NaN False False \n","\n"," Embarked_S \n","0 False \n","1 True \n","2 False \n","3 True \n","4 True "]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["test_df = pd.get_dummies(test_df, columns=[\"Embarked\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Ticket (Assignment)"]},{"cell_type":"code","execution_count":28,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.700760Z","iopub.status.busy":"2024-04-01T06:27:55.700330Z","iopub.status.idle":"2024-04-01T06:27:55.708542Z","shell.execute_reply":"2024-04-01T06:27:55.707466Z","shell.execute_reply.started":"2024-04-01T06:27:55.700715Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 A/5 21171\n","1 PC 17599\n","2 STON/O2. 3101282\n","3 113803\n","4 373450\n","5 330877\n","6 17463\n","7 349909\n","8 347742\n","9 237736\n","10 PP 9549\n","11 113783\n","12 A/5. 
2151\n","13 347082\n","14 350406\n","15 248706\n","16 382652\n","17 244373\n","18 345763\n","19 2649\n","Name: Ticket, dtype: object"]},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":29,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.710291Z","iopub.status.busy":"2024-04-01T06:27:55.709980Z","iopub.status.idle":"2024-04-01T06:27:55.722810Z","shell.execute_reply":"2024-04-01T06:27:55.721839Z","shell.execute_reply.started":"2024-04-01T06:27:55.710231Z"},"trusted":true},"outputs":[{"data":{"text/plain":["'A5'"]},"execution_count":29,"metadata":{},"output_type":"execute_result"}],"source":["example_ticket = \"A/5. 2151\"\n","example_ticket.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0]"]},{"cell_type":"code","execution_count":30,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.726116Z","iopub.status.busy":"2024-04-01T06:27:55.725689Z","iopub.status.idle":"2024-04-01T06:27:55.738095Z","shell.execute_reply":"2024-04-01T06:27:55.737043Z","shell.execute_reply.started":"2024-04-01T06:27:55.726039Z"},"trusted":true},"outputs":[],"source":["tickets = []\n","for i in list(train_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","train_df[\"Ticket\"] = tickets\n","\n","# Do the same for the test set\n","tickets = []\n","for i in list(test_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","test_df[\"Ticket\"] = 
tickets"]},{"cell_type":"code","execution_count":31,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.740389Z","iopub.status.busy":"2024-04-01T06:27:55.739797Z","iopub.status.idle":"2024-04-01T06:27:55.755416Z","shell.execute_reply":"2024-04-01T06:27:55.754317Z","shell.execute_reply.started":"2024-04-01T06:27:55.740333Z"},"trusted":true},"outputs":[{"data":{"text/plain":["0 A5\n","1 PC\n","2 STONO2\n","3 x\n","4 x\n","5 x\n","6 x\n","7 x\n","8 x\n","9 x\n","10 PP\n","11 x\n","12 A5\n","13 x\n","14 x\n","15 x\n","16 x\n","17 x\n","18 x\n","19 x\n","Name: Ticket, dtype: object"]},"execution_count":31,"metadata":{},"output_type":"execute_result"}],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":32,"metadata":{},"outputs":[{"data":{"text/plain":["0 x\n","1 x\n","2 x\n","3 x\n","4 x\n","5 x\n","6 x\n","7 x\n","8 x\n","9 A4\n","10 x\n","11 x\n","12 x\n","13 x\n","14 WEP\n","15 SCPARIS\n","16 x\n","17 x\n","18 STONO2\n","19 x\n","Name: Ticket, dtype: object"]},"execution_count":32,"metadata":{},"output_type":"execute_result"}],"source":["test_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":33,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.790832Z","iopub.status.busy":"2024-04-01T06:27:55.790500Z","iopub.status.idle":"2024-04-01T06:27:55.841011Z","shell.execute_reply":"2024-04-01T06:27:55.839862Z","shell.execute_reply.started":"2024-04-01T06:27:55.790770Z"},"trusted":true},"outputs":[{"data":{"text/html":["