In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer

In [2]:
'''
可调参数
'''

# KNNImputer
n_neighbors = 5

# 数据预处理

In [3]:
train_DataFrame = pd.read_csv(filepath_or_buffer="Data/Titanic/train.csv", 
                    names=["PassengerId", "Survived", "Pclass", "Name", "Sex", "Age", 
                    "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"], skiprows=1)
test_DataFrame = pd.read_csv(filepath_or_buffer="Data/Titanic/test.csv", 
                    names=["PassengerId", "Pclass", "Name", "Sex", "Age", 
                    "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"], skiprows=1)
test_AnswerDataFrame = pd.read_csv(filepath_or_buffer="Data/Titanic/gender_submission.csv",
                    names=["PassengerId", "Survived"], skiprows=1)

print(train_DataFrame)
print(test_DataFrame)
print(test_AnswerDataFrame)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [4]:
def EncodeData(x: pd.DataFrame) -> list:
    """
    对特征向量进行编码
    将pandas的DataFrame结构转换为list
    同时去除无关列
    :param x: pandas.DataFrame
    :return: 特征向量
    """
    input_data = [[x["Pclass"][i], x["Sex"][i], x["Age"][i], x["SibSp"][i], x["Parch"][i], x["Fare"][i], x["Embarked"][i]] for i in range(len(x))]
    
    return input_data

class PreEncodeData:
    def __init__(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        对输入数据进行预编码
        :param x: pandas.DataFrame 类型数据，为读取的数据
        :return: pandas.DataFrame 编码后的数据
        """
        for i in range(len(x["Sex"])):
            if x["Sex"][i] == 'male':
                x.loc[i, "Sex"] = 1
            elif x["Sex"][i] == 'female':
                x.loc[i, "Sex"] = 0
        
        for i in range(len(x["Embarked"])):
            if x["Embarked"][i] == 'C':
                x.loc[i, "Embarked"] = 0
            elif x["Embarked"][i] == 'Q':
                x.loc[i, "Embarked"] = 1
            elif x["Embarked"][i] == 'S':
                x.loc[i, "Embarked"] = 2
        x = x.drop(columns=["Name", "Ticket", "Cabin"])
        self.MinMax = MinMaxScalerStrategy(x)

    
    def PreEncode(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        对再次输入数据进行预编码
        如将测试集数据进行与训练集相同的编码方式

        :param x: pandas.DataFrame 类型数据，为读取的数据
        :return: pandas.DataFrame 编码后的数据
        """
        for i in range(len(x["Sex"])):
            if x["Sex"][i] == 'male':
                x.loc[i, "Sex"] = 1
            elif x["Sex"][i] == 'female':
                x.loc[i, "Sex"] = 0
        
        for i in range(len(x["Embarked"])):
            if x["Embarked"][i] == 'C':
                x.loc[i, "Embarked"] = 0
            elif x["Embarked"][i] == 'Q':
                x.loc[i, "Embarked"] = 1
            elif x["Embarked"][i] == 'S':
                x.loc[i, "Embarked"] = 2
        x = x.drop(columns=["Name", "Ticket", "Cabin"])
        x = self.MinMax.transform(x)
        return x

def EncodeTrainData(x: pd.DataFrame) -> tuple[list, list]:
    """
    对测试集进行编码处理
    将pandas的DataFrame结构转换为list
    同时去除无关列
    :param x: pandas.DataFrame
    :return: 特征向量，标签向量
    """
    input_data = EncodeData(x)
    label = [x["Survived"][i] for i in range(len(x))]

    return input_data, label

In [5]:
class MinMaxScalerStrategy:
    def __init__(self, data: pd.DataFrame):
        self.data = data
        self.min = {
            "Pclass": data["Pclass"].min(),
            "Sex": data["Sex"].min(),
            "Age": data["Age"].min(),
            "SibSp": data["SibSp"].min(),
            "Parch": data["Parch"].min(),
            "Fare": data["Fare"].min(),
            "Embarked": data["Embarked"].min()
        }
        self.max = {
            "Pclass": data["Pclass"].max(),
            "Sex": data["Sex"].max(),
            "Age": data["Age"].max(),
            "SibSp": data["SibSp"].max(),
            "Parch": data["Parch"].max(),
            "Fare": data["Fare"].max(),
            "Embarked": data["Embarked"].max()
        }
        def inner_transform(data: pd.DataFrame) -> pd.DataFrame:
            colunm_names = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
            for name in colunm_names:
                for i in range(len(data[name])):
                    data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])
            return data
        self.data = inner_transform(self.data)

    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        colunm_names = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
        for name in colunm_names:
            for i in range(len(data[name])):
                data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])
        return data

## 缺省值处理

In [6]:
imputer = KNNImputer(n_neighbors=n_neighbors, weights="distance", copy=False)

# 训练集编码
EncodeModel = PreEncodeData(train_DataFrame)

print("train_DataFrame:")
train_DataFrame = EncodeModel.PreEncode(train_DataFrame)
print(train_DataFrame)

# 测试集编码
print("test_DataFrame:")
test_DataFrame = EncodeModel.PreEncode(test_DataFrame)
print(test_DataFrame)

# 缺失值填充
print("train_DataFrame_filled:")
train_DataFrame_filled = pd.DataFrame(imputer.fit_transform(train_DataFrame), columns=train_DataFrame.columns)
print(train_DataFrame_filled)

print("test_DataFrame_filled:")
test_DataFrame_filled = pd.DataFrame(imputer.fit_transform(test_DataFrame), columns=test_DataFrame.columns)
print(test_DataFrame_filled)

  data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])
  data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])
  data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])
  data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])


train_DataFrame:


  data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])
  data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])
  data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])


     PassengerId  Survived  Pclass  Sex       Age  SibSp     Parch      Fare  \
0              1         0     1.0  1.0  0.271174  0.125  0.000000  0.014151   
1              2         1     0.0  0.0  0.472229  0.125  0.000000  0.139136   
2              3         1     1.0  0.0  0.321438  0.000  0.000000  0.015469   
3              4         1     0.0  0.0  0.434531  0.125  0.000000  0.103644   
4              5         0     1.0  1.0  0.434531  0.000  0.000000  0.015713   
..           ...       ...     ...  ...       ...    ...       ...       ...   
886          887         0     0.5  1.0  0.334004  0.000  0.000000  0.025374   
887          888         1     0.0  0.0  0.233476  0.000  0.000000  0.058556   
888          889         0     1.0  0.0       NaN  0.125  0.333333  0.045771   
889          890         1     0.0  1.0  0.321438  0.000  0.000000  0.058556   
890          891         0     1.0  1.0  0.396833  0.000  0.000000  0.015127   

    Embarked  
0        1.0  
1        

  data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])
  data.loc[i, name] = (data[name][i] - self.min[name]) / (self.max[name] - self.min[name])


## 训练数据集

In [8]:
print("train_dataset:")
train_dataset = EncodeData(train_DataFrame_filled)
print(train_dataset)

print("test_dataset:")
test_dataset = EncodeData(test_DataFrame_filled)
print(test_dataset)

train_label = [train_DataFrame_filled["Survived"][i] for i in range(len(train_DataFrame_filled))]
test_label = [test_AnswerDataFrame["Survived"][i] for i in range(len(test_AnswerDataFrame))]

train_dataset:
[[1.0, 1.0, 0.2711736617240513, 0.125, 0.0, 0.014151057562208049, 1.0], [0.0, 0.0, 0.4722292033174164, 0.125, 0.0, 0.13913573538264068, 0.0], [1.0, 0.0, 0.32143754712239253, 0.0, 0.0, 0.015468569817999833, 1.0], [0.0, 0.0, 0.43453128926866047, 0.125, 0.0, 0.10364429745562033, 1.0], [1.0, 1.0, 0.43453128926866047, 0.0, 0.0, 0.015712553569072387, 1.0], [1.0, 1.0, 0.4078862630501446, 0.0, 0.0, 0.01650950209357577, 0.5], [0.0, 1.0, 0.6732847449107816, 0.0, 0.0, 0.10122885832000206, 1.0], [1.0, 1.0, 0.01985423473234481, 0.375, 0.16666666666666666, 0.04113566043083236, 1.0], [1.0, 0.0, 0.33400351847197784, 0.0, 0.3333333333333333, 0.021730754366528396, 1.0], [0.5, 0.0, 0.17064589092736868, 0.125, 0.0, 0.058694292654020104, 0.0], [1.0, 0.0, 0.04498617743151546, 0.125, 0.16666666666666666, 0.03259622914329302, 1.0], [0.0, 0.0, 0.7235486303091229, 0.0, 0.0, 0.051822148727810165, 1.0], [1.0, 1.0, 0.24604171902488062, 0.0, 0.0, 0.015712553569072387, 1.0], [1.0, 1.0, 0.4847951746670

# Support Vector Machine

## 降维分析