In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml

# Download the 'Titanic' data set (version 1) from OpenML, then place its
# 'data' and 'target' entries side by side in a single DataFrame.
titanic = fetch_openml(name='Titanic', version=1)
titanic = pd.concat([titanic.data, titanic.target], axis='columns')

  warn(


**Problem 1.⭐⭐**
The goal of this and the following problems is to eventually build an algorithm that can *predict* whether a person with given characteristics would have survived the sinking of the Titanic or not.

This question requires you to complete the function ``function1`` so that:

* only the data of **pclass**, **sex**, **age**, **survived** columns remain from the total **titanic** data set,
* the **sex** column is renamed to **male** and contains numeric data (female -> 0, male -> 1). See the [OrdinalEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html#sklearn.preprocessing.OrdinalEncoder) class.
* the resulting data is shuffled and split into train (80%) and test (20%) parts using the sklearn library [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) function with the ``random_state=0`` parameter. The function should return the following 4 data sets:
* X_train (pandas.DataFrame)
* X_test (pandas.DataFrame)
* y_train (pandas.Series)
* y_test (pandas.Series)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

def function1(data):
  """
  Prepare the Titanic data and split it into train and test parts.

  Keeps only the 'pclass', 'sex', 'age' and 'survived' columns, replaces
  'sex' with an ordinal-encoded column named 'male', and splits the result
  with train_test_split (80% train / 20% test, shuffled, random_state=0).
  The DataFrame passed in by the caller is not modified.

  :param pandas.DataFrame data: data set containing at least the columns
      'pclass', 'sex', 'age' and 'survived'
  :returns: (X_train, X_test, y_train, y_test)
  :rtype: tuple
  """
  columns_to_keep = ['pclass', 'sex', 'age', 'survived']
  # Selecting columns builds a new frame, so the caller's data is left intact
  # and the function can be re-run on the same input.
  selected = data[columns_to_keep]
  # The encoder expects a 2-D input and returns a 2-D output; flatten it back
  # to one value per row before storing it as a column.
  sex_values = selected['sex'].to_numpy().reshape(-1, 1)
  male = OrdinalEncoder().fit_transform(sex_values).ravel()
  prepared = selected.drop(columns='sex').assign(male=male)
  X_train, X_test, y_train, y_test = train_test_split(
      prepared[['pclass', 'male', 'age']],
      prepared['survived'],
      test_size=0.20,
      random_state=0,  # value required by the problem statement
      shuffle=True,
  )
  return (X_train, X_test, y_train, y_test)

In [None]:
X_train, X_test, y_train, y_test = function1(data=titanic)

# Print the shape of every split, one per line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep='\n')

(1047, 3)
(262, 3)
(1047,)
(262,)


**Problem 2.⭐⭐** If you noticed (if not, notice 👀) there are missing values in the data obtained in the previous question.

This question asks you to write a function named ``function2`` that fills in the missing data using the [SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer) class; the imputation strategy is passed to ``function2`` through its ``strategy`` argument.

The function should return copies of the data sets from the previous problem in which the missing values have been filled in.

In [None]:
from sklearn.impute import SimpleImputer

def function2(X_train, X_test, y_train, y_test, strategy):
  """
  Return imputed copies of the feature sets and plain copies of the targets.

  A SimpleImputer with the given strategy is fitted on X_train only and then
  applied to both X_train and X_test.

  :param pandas.DataFrame X_train: training features
  :param pandas.DataFrame X_test: test features
  :param pandas.Series y_train: training labels
  :param pandas.Series y_test: test labels
  :param str strategy: strategy forwarded to SimpleImputer
  :returns: (X_train_, X_test_, y_train_, y_test_)
  :rtype: tuple
  """
  imputer = SimpleImputer(missing_values=np.nan, strategy=strategy).fit(X_train)

  def _filled(features):
    # transform() returns a bare array; restore the column names and index.
    return pd.DataFrame(imputer.transform(features),
                        columns=features.columns,
                        index=features.index)

  return (_filled(X_train), _filled(X_test), y_train.copy(), y_test.copy())

In [None]:
X_train_, X_test_, y_train_, y_test_ = function2(
    X_train, X_test, y_train, y_test,
    strategy='mean',
)
# Per-column count of remaining NaN values in each imputed feature set.
print(*(np.isnan(part).sum() for part in (X_train_, X_test_)), sep='\n')

pclass    0
male      0
age       0
dtype: int64
pclass    0
male      0
age       0
dtype: int64


**Problem 3.⭐⭐⭐** This problem asks you to create a multi-class classification model (as a ``class``) named **BasicLinearClassifier** that works as follows:

* during training, a **representative-vector** is calculated for each class (label) from the training data belonging to that class

* during prediction (inference), the distance from each test sample to the representative-vectors of all classes is calculated, and the class whose representative-vector is closest to the sample is selected as the prediction

* the algorithm must have 2 hyper-parameters
   * **class_representative**: accepts *mean* or *median* and determines how the representative-vector is calculated; with *mean*, the representative-vector of each class is the average of the data belonging to that class, and with *median*, it is the median vector of that data.
   * **distance**: accepts *L1* or *L2* and determines how the distance between an arbitrary data point and a representative-vector is measured: according to the L1 norm for *L1*, and according to the L2 norm for *L2*.

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

class BasicLinearClassifier(ClassifierMixin, BaseEstimator):
  """
  Basic Linear Classifier
  for multi-class classification case.

  Every class is summarised by one representative vector (the per-feature
  mean or median of its training samples). A sample is assigned to the class
  whose representative vector is nearest under the chosen norm.

  :param str class_representative: 'mean' or 'median'
  :param str distance: 'L1' or 'L2'
  """
  def __init__(self, class_representative='mean', distance='L2'):
    self.class_representative = class_representative
    self.distance = distance

  def fit(self, X, y):
    """
    Fits the model on given data, which in this case means
    calculating the class means/medians and storing them
    for later usage in 'predict' method.

    :param numpy.ndarray X: input data of size (nr_samples, nr_features)
    :param numpy.ndarray y: labels of size (nr_samples,)
    :returns: the fitted estimator
    :raises ValueError: if class_representative is not 'mean' or 'median'
    """
    X, y = check_X_y(X, y)
    if self.class_representative == 'mean':
      summarise = np.mean
    elif self.class_representative == 'median':
      summarise = np.median
    else:
      raise ValueError('Invalid value for class representative')
    self.classes_ = unique_labels(y)
    self.n_features_in_ = X.shape[1]
    # One representative vector per class, reduced over the sample axis.
    self.class_reprs_ = {
        class_: summarise(X[y == class_], axis=0) for class_ in self.classes_
    }
    return self

  def predict(self, X):
    """
    Performs predictions on the given data points
    using the estimated class means/medians.

    :param numpy.ndarray X: input data for testing (nr_samples, nr_features)
    :returns: labels of size (nr_samples,), with the same dtype as classes_
    :rtype: numpy.ndarray
    :raises ValueError: if the number of features differs from the training
        data, or if distance is not 'L1' or 'L2'
    """
    check_is_fitted(self)
    X = check_array(X)
    # Stack the representatives in the order of classes_, so that a column
    # index of the distance matrix maps directly onto a class label.
    reprs = np.vstack([self.class_reprs_[class_] for class_ in self.classes_])
    if X.shape[1] != reprs.shape[1]:
      raise ValueError(
          'X has %d features, but the model was fitted with %d features'
          % (X.shape[1], reprs.shape[1]))
    # Broadcast to (nr_samples, nr_classes, nr_features), reduce over features.
    dists = self.calculate_distance(X[:, np.newaxis, :],
                                    reprs[np.newaxis, :, :],
                                    axis=-1)
    # Indexing classes_ keeps the original label dtype; a fixed-width string
    # buffer would truncate or stringify the labels.
    return self.classes_[np.argmin(dists, axis=1)]

  def calculate_distance(self, a, b, axis=None):
    """
    Distance between a and b under the configured norm.

    :param numpy.ndarray a: first operand
    :param numpy.ndarray b: second operand, broadcastable against a
    :param axis: axis to reduce over; the default None reduces over all
        elements and returns a single number
    :returns: L1 or L2 norm of (a - b)
    :raises ValueError: if distance is not 'L1' or 'L2'
    """
    if self.distance == "L1":
      return np.sum(np.abs(a - b), axis=axis)
    elif self.distance == "L2":
      return np.linalg.norm(a - b, axis=axis)
    else:
      raise ValueError('Invalid distance type')

  def score(self, X, y, sample_weight=None):
    """
    Accuracy of the predictions for X with respect to y.

    :param numpy.ndarray X: input data of size (nr_samples, nr_features)
    :param y: true labels of size (nr_samples,)
    :param sample_weight: optional per-sample weights
    :returns: (weighted) fraction of correctly predicted samples
    """
    # Compare plain arrays so the result does not depend on the container
    # type (or index) that y arrives in.
    y_true = np.asarray(y)
    y_pred = self.predict(X)
    return np.average(y_pred == y_true, weights=sample_weight)

In [None]:
from sklearn.utils.estimator_checks import check_estimator
# Fit the classifier with its default hyper-parameters on the imputed
# training data and report its score on the imputed test data.
model = BasicLinearClassifier()
# NOTE(review): the estimator compliance check below is disabled - confirm
# whether it should be re-enabled.
# check_estimator(model)
model.fit(X=X_train_, y=y_train_)
model.score(X=X_test_, y=y_test_)

0.5687022900763359

**Problem 4.⭐⭐⭐** This problem asks you to build a function named ``function4`` that constructs a sklearn Pipeline consisting of SimpleImputer and BasicLinearClassifier and performs a GridSearch (cv=5) over the following parameters:

* SimpleImputer:
   * strategy = ['mean', 'median', 'most_frequent']
* BasicLinearClassifier
   * class_representative = ['mean', 'median']
   * distance = ['L1', 'L2'].

The function should return the cv_results_ attribute of the object created from GridSearchCV as a pandas.DataFrame. This function should be given the training data obtained as a result of Problem 1.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

def function4(X_train, y_train, cv=5):
  """
  Grid-search a SimpleImputer + BasicLinearClassifier pipeline.

  Searches over the imputation strategy and over the classifier's
  class_representative and distance hyper-parameters.

  :param pandas.DataFrame X_train: training features (may contain NaN)
  :param pandas.Series y_train: training labels
  :param cv: cross-validation setting forwarded to GridSearchCV
  :returns: the cv_results_ of the fitted search
  :rtype: pandas.DataFrame
  """
  pipe = make_pipeline(SimpleImputer(), BasicLinearClassifier())
  # make_pipeline names each step after its lower-cased class name.
  params = {'simpleimputer__strategy': ['mean', 'median', 'most_frequent'],
            'basiclinearclassifier__class_representative': ['mean', 'median'],
            'basiclinearclassifier__distance': ['L1', 'L2']}
  # Pass cv explicitly instead of relying on the library default.
  grid = GridSearchCV(pipe, param_grid=params, cv=cv)
  grid.fit(X_train, y_train)
  return pd.DataFrame(grid.cv_results_)

In [None]:
# Show the first five rows of the grid-search results.
function4(X_train=X_train, y_train=y_train).head(n=5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_basiclinearclassifier__class_representative,param_basiclinearclassifier__distance,param_simpleimputer__strategy,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005904,0.000961,0.007196,0.001058,mean,L1,mean,{'basiclinearclassifier__class_representative'...,0.766667,0.504762,0.511962,0.535885,0.755981,0.615051,0.119922,7
1,0.005176,0.00052,0.005758,0.00013,mean,L1,median,{'basiclinearclassifier__class_representative'...,0.757143,0.390476,0.45933,0.406699,0.727273,0.548184,0.160326,8
2,0.005095,6.4e-05,0.005956,0.000513,mean,L1,most_frequent,{'basiclinearclassifier__class_representative'...,0.647619,0.409524,0.454545,0.430622,0.736842,0.53583,0.131554,10
3,0.004561,0.000109,0.00557,8e-05,mean,L2,mean,{'basiclinearclassifier__class_representative'...,0.590476,0.504762,0.507177,0.535885,0.593301,0.54632,0.038795,9
4,0.00524,0.000276,0.006527,0.001537,mean,L2,median,{'basiclinearclassifier__class_representative'...,0.590476,0.390476,0.45933,0.411483,0.497608,0.469875,0.070907,11
