In [218]:
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_columns', None)

## 在这里定义一些必要的函数:heart_eyes:

In [219]:
# Defining some important function which will be used for the analysis of data
def inspect_columns(df, random_state=None):
    """Build a one-row-per-column health report for a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    random_state : int, optional
        Seed used to pick ``random_row``. When omitted, the row is drawn
        from NumPy's global random state via ``np.random.randint``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with these columns:
        ``total_rows``, ``rows_with_missing_values``, ``unique`` (every value
        distinct), ``cardinality`` (number of distinct non-null values),
        ``with_null``, ``null_pct``, ``1st_row``, ``random_row``, ``last_row``
        and ``dtype``.

    Notes
    -----
    A frame with zero rows is reported too: the three sample-row columns are
    filled with NaN and ``null_pct`` is 0.0.
    """
    total_rows = len(df)
    missing_counts = df.isnull().sum()
    distinct_counts = df.nunique()

    if total_rows:
        if random_state is None:
            random_position = np.random.randint(low=0, high=total_rows)
        else:
            random_position = np.random.default_rng(random_state).integers(total_rows)
        first_row = df.iloc[0]
        random_row = df.iloc[random_position]
        last_row = df.iloc[-1]
        null_pct = (missing_counts / total_rows * 100).round(2)
    else:
        # No rows to sample from: indexing with iloc or drawing from an empty
        # range would raise, so fall back to placeholders.
        first_row = random_row = last_row = pd.Series(np.nan, index=df.columns, dtype=object)
        null_pct = missing_counts.astype(float)

    result = pd.DataFrame({
        'total_rows': [total_rows] * df.shape[1],
        'rows_with_missing_values': missing_counts,
        'unique': distinct_counts == total_rows,
        'cardinality': distinct_counts,
        'with_null': missing_counts > 0,
        'null_pct': null_pct,
        '1st_row': first_row,
        'random_row': random_row,
        'last_row': last_row,
        'dtype': df.dtypes,
    })

    return result

def categorize_columns(train, target_variable):
    """Split the feature columns of ``train`` into numerical and categorical lists.

    A column counts as numerical when its dtype is ``int64``, ``float64`` or
    ``datetime64[ns]`` and as categorical when its dtype is ``object``; columns
    of any other dtype appear in neither list. The target column is excluded
    from both lists, whatever its dtype, so the result contains features only.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose columns are classified.
    target_variable : str
        Name of the label column to leave out of the result.

    Returns
    -------
    tuple[list, list]
        ``(numerical_columns, categorical_columns)`` in the frame's column order.
        Both lists are also printed together with their sizes.
    """
    numerical_data_types = ['int64', 'float64', 'datetime64[ns]']  # adjusted data types
    categorical_data_types = ['object']

    def feature_columns_of(data_types):
        # Skip the target here so a numeric label cannot slip into the features.
        return [
            column for column in train.columns
            if column != target_variable and train[column].dtype in data_types
        ]

    numerical_columns = feature_columns_of(numerical_data_types)
    print(f"Numerical variables ({len(numerical_columns)}): {numerical_columns}")

    categorical_columns = feature_columns_of(categorical_data_types)
    print(f"Categorical variables ({len(categorical_columns)}): {categorical_columns}")

    return numerical_columns, categorical_columns

# 读取数据

In [220]:
# Folder holding the data files, relative to the notebook's working directory.
path = '../../data/titanic'
print(os.path.abspath(path))

# Walk everything below `path` and print each file found, one per line.
for data_file in (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(path)
    for filename in filenames
):
    print(data_file)

e:\量化\MyKaggle\data\titanic
../../data/titanic\gender_submission.csv
../../data/titanic\test.csv
../../data/titanic\train.csv


In [221]:
# Resolve both CSV locations under `path`, then load each into its own frame.
train_data_path, test_data_path = (
    os.path.join(path, csv_name) for csv_name in ('train.csv', 'test.csv')
)
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
train_data


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## 读完数据后看看整体的数据信息:kiss:

In [222]:
# Per-column health report (missing values, cardinality, sample rows) for the training frame.
print('checking the data health of the train data')
train_health = inspect_columns(train_data)
display(train_health)

checking the data health of the train data


Unnamed: 0,total_rows,rows_with_missing_values,unique,cardinality,with_null,null_pct,1st_row,random_row,last_row,dtype
PassengerId,891,0,True,891,False,0.0,1,781,891,int64
Survived,891,0,False,2,False,0.0,0,1,0,int64
Pclass,891,0,False,3,False,0.0,3,3,3,int64
Name,891,0,True,891,False,0.0,"Braund, Mr. Owen Harris","Ayoub, Miss. Banoura","Dooley, Mr. Patrick",object
Sex,891,0,False,2,False,0.0,male,female,male,object
Age,891,177,False,88,True,19.87,22.0,13.0,32.0,float64
SibSp,891,0,False,7,False,0.0,1,0,0,int64
Parch,891,0,False,7,False,0.0,0,0,0,int64
Ticket,891,0,False,681,False,0.0,A/5 21171,2687,370376,object
Fare,891,0,False,248,False,0.0,7.25,7.2292,7.75,float64


In [223]:
# Per-column health report (missing values, cardinality, sample rows) for the test frame.
print('checking the data health of the test data')
test_health = inspect_columns(test_data)
display(test_health)

checking the data health of the test data


Unnamed: 0,total_rows,rows_with_missing_values,unique,cardinality,with_null,null_pct,1st_row,random_row,last_row,dtype
PassengerId,418,0,True,418,False,0.0,892,1221,1309,int64
Pclass,418,0,False,3,False,0.0,3,2,3,int64
Name,418,0,True,418,False,0.0,"Kelly, Mr. James","Enander, Mr. Ingvar","Peter, Master. Michael J",object
Sex,418,0,False,2,False,0.0,male,male,male,object
Age,418,86,False,79,True,20.57,34.5,21.0,,float64
SibSp,418,0,False,7,False,0.0,0,0,1,int64
Parch,418,0,False,8,False,0.0,0,0,1,int64
Ticket,418,0,False,363,False,0.0,330911,236854,2668,object
Fare,418,1,False,169,True,0.24,7.8292,13.0,22.3583,float64
Cabin,418,327,False,76,True,78.23,,,,object


由于存活与否是 0/1 哑变量，所以"求和 ÷ 计数"就是取值为 1 的比率（即存活率）:smile:

In [224]:
# Survived is a 0/1 column, so its mean over the female rows is their survival rate.
women = train_data.loc[train_data.Sex == 'female', 'Survived']
women_rate = women.mean()
print('proportion of women who survived:', women_rate)
print(women.size)

proportion of women who survived: 0.7420382165605095
314


In [225]:
# Survived is a 0/1 column, so its mean over the male rows is their survival rate.
men = train_data.loc[train_data.Sex == 'male', 'Survived']
men_rate = men.mean()
print('proportion of men who survived:', men_rate)
print(men.size)

proportion of men who survived: 0.18890814558058924
577


In [226]:
# Distinct SibSp values present in the training frame, in order of first appearance.
train_data['SibSp'].unique()

array([1, 0, 3, 4, 2, 5, 8], dtype=int64)

In [227]:
# Distinct Parch values present in the training frame, in order of first appearance.
train_data['Parch'].unique()

array([0, 1, 2, 5, 3, 4, 6], dtype=int64)

In [228]:
# Look at the 4th distinct Embarked value; in this data it is NaN, i.e. the column has
# missing entries. NOTE(review): index 3 depends on the data's row order and on there
# being at least four distinct values — confirm before reusing on other data.
train_data.Embarked.unique()[3]

nan

In [229]:
# Distinct Pclass values present in the training frame, in order of first appearance.
train_data['Pclass'].unique()

array([3, 1, 2], dtype=int64)

In [230]:
# Group size and survival rate for every passenger class (mean of the 0/1 Survived flag).
for i in train_data.Pclass.unique():
    Pclass = train_data.loc[train_data.Pclass == i, 'Survived']
    Pclass_rate = Pclass.mean()
    print('number of Pclass {} :'.format(i), Pclass.size)
    print(f'proportion of Pclass {i} who survived:', Pclass_rate)
    print('-' * 20)

number of Pclass 3 : 491
proportion of Pclass 3 who survived: 0.24236252545824846
--------------------
number of Pclass 1 : 216
proportion of Pclass 1 who survived: 0.6296296296296297
--------------------
number of Pclass 2 : 184
proportion of Pclass 2 who survived: 0.47282608695652173
--------------------


In [231]:
# Group size and survival rate for every embarkation port.
# Missing Embarked values are dropped up front: comparing against NaN matches no rows,
# which would make the rate a division by zero. Filtering them explicitly also removes
# the need to assume NaN is the last distinct value in the column.
for i in train_data.Embarked.dropna().unique():
    Embarked = train_data.loc[train_data.Embarked == i, 'Survived']
    Embarked_rate = Embarked.mean()
    print('number of people who embarked at',i,':',len(Embarked))
    print(f'proportion of people who embarked at {i} and survived:',Embarked_rate)
    print('-'*20)

number of people who embarked at S : 644
proportion of people who embarked at S and survived: 0.33695652173913043
--------------------
number of people who embarked at C : 168
proportion of people who embarked at C and survived: 0.5535714285714286
--------------------
number of people who embarked at Q : 77
proportion of people who embarked at Q and survived: 0.38961038961038963
--------------------


In [232]:
# Group size and survival rate for every Parch value (mean of the 0/1 Survived flag).
for i in train_data.Parch.unique():
    parch = train_data.loc[train_data.Parch == i, 'Survived']
    parch_rate = parch.mean()
    print('number of parch {} :'.format(i), parch.size)
    print('proportion of parch {} who survived:'.format(i), parch_rate)
    print('-' * 20)

number of parch 0 : 678
proportion of parch 0 who survived: 0.34365781710914456
--------------------
number of parch 1 : 118
proportion of parch 1 who survived: 0.5508474576271186
--------------------
number of parch 2 : 80
proportion of parch 2 who survived: 0.5
--------------------
number of parch 5 : 5
proportion of parch 5 who survived: 0.2
--------------------
number of parch 3 : 5
proportion of parch 3 who survived: 0.6
--------------------
number of parch 4 : 4
proportion of parch 4 who survived: 0.0
--------------------
number of parch 6 : 1
proportion of parch 6 who survived: 0.0
--------------------


In [233]:
# Group size and survival rate for every SibSp value (mean of the 0/1 Survived flag).
for i in train_data.SibSp.unique():
    sibsp = train_data.loc[train_data.SibSp == i, 'Survived']
    sibsp_rate = sibsp.mean()
    print('number of sibsp {} :'.format(i), sibsp.size)
    print('proportion of sibsp {} who survived:'.format(i), sibsp_rate)

number of sibsp 1 : 209
proportion of sibsp 1 who survived: 0.5358851674641149
number of sibsp 0 : 608
proportion of sibsp 0 who survived: 0.34539473684210525
number of sibsp 3 : 16
proportion of sibsp 3 who survived: 0.25
number of sibsp 4 : 18
proportion of sibsp 4 who survived: 0.16666666666666666
number of sibsp 2 : 28
proportion of sibsp 2 who survived: 0.4642857142857143
number of sibsp 5 : 5
proportion of sibsp 5 who survived: 0.0
number of sibsp 8 : 7
proportion of sibsp 8 who survived: 0.0


## 考虑到数量和显著性，发现SibSp和Parch两个变量都只有在值等于0时有效:police_car:
为几个显著影响存活的变量构造 0/1 标签

In [234]:
# Add the same 0/1 indicator columns to both frames.
# Each comparison yields a boolean Series; casting to int64 gives the 0/1 encoding.
# Rows whose source value is missing compare as False and therefore get 0.
for frame in (train_data, test_data):
    frame['is_Parch_0'] = (frame.Parch == 0).astype('int64')
    frame['is_SibSp_0'] = (frame.SibSp == 0).astype('int64')
    frame['is_Embarked_S'] = (frame.Embarked == 'S').astype('int64')
    frame['is_Embarked_Q'] = (frame.Embarked == 'Q').astype('int64')
    frame['gender'] = (frame.Sex == 'female').astype('int64')  # 1 = female, 0 = anything else
    frame['is_Pclass_1'] = (frame.Pclass == 1).astype('int64')
    frame['is_Pclass_3'] = (frame.Pclass == 3).astype('int64')

    # Flag rows with no recorded cabin.
    frame['is_cabin_null'] = frame.Cabin.isnull().astype('int64')

In [235]:
# Survival rate among passengers whose cabin is missing (mean of the 0/1 Survived flag).
is_cabin_null = train_data.loc[train_data.is_cabin_null == 1, 'Survived']
is_cabin_null_rate = is_cabin_null.mean()
print('number of people with null cabin:', is_cabin_null.size)
print('proportion of people with null cabin who survived:', is_cabin_null_rate)

number of people with null cabin: 687
proportion of people with null cabin who survived: 0.29985443959243085


In [236]:
# Age-band indicators for both frames: under 18, 18 up to (not including) 50, and 50 or over.
# A missing Age fails every comparison, so such rows get 0 in all three columns.
for frame in (train_data, test_data):
    frame['is_teens'] = (frame.Age < 18).astype('int64')
    frame['is_adults'] = ((frame.Age >= 18) & (frame.Age < 50)).astype('int64')
    frame['is_elderly'] = (frame.Age >= 50).astype('int64')

## 由于要回归填补年龄，剩下的数量足够的变量也进行标签:kiss::fire::heart_eyes:

In [237]:
# Further 0/1 indicators, added to both frames in the same column order.
for frame in (train_data, test_data):
    frame['is_SibSp_1'] = (frame.SibSp == 1).astype('int64')
    frame['is_Pclass_2'] = (frame.Pclass == 2).astype('int64')
    frame['is_Parch_1'] = (frame.Parch == 1).astype('int64')

## age存在缺失值，但极为重要，因此对缺失值进行回归填补
使用 MLP（MLPRegressor）、GradientBoostingRegressor 和 RandomForestRegressor

In [238]:
# Show the test rows whose Fare is missing.
test_data.loc[test_data['Fare'].isna()].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_Parch_0,is_SibSp_0,is_Embarked_S,is_Embarked_Q,gender,is_Pclass_1,is_Pclass_3,is_cabin_null,is_teens,is_adults,is_elderly,is_SibSp_1,is_Pclass_2,is_Parch_1
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,1,1,1,0,0,0,1,1,0,0,1,0,0,0


In [239]:
# Training rows that match the profile of the test row with the missing Fare:
# SibSp 0, Parch 0, Pclass 3, gender 0 (not female) and embarked at 'S'.
fare = train_data.loc[
    (train_data.SibSp == 0)
    & (train_data.Parch == 0)
    & (train_data.Pclass == 3)
    & (train_data.gender == 0)
    & (train_data.Embarked == 'S')
]
fare['Fare'].mean()

9.861667821782177

In [240]:
# Fill every missing test Fare with the mean Fare of the matching training subset,
# then show the row at position 152 to check the result.
fare_is_missing = test_data['Fare'].isna()
test_data.loc[fare_is_missing, 'Fare'] = fare['Fare'].mean()
test_data.iloc[152]

PassengerId                    1044
Pclass                            3
Name             Storey, Mr. Thomas
Sex                            male
Age                            60.5
SibSp                             0
Parch                             0
Ticket                         3701
Fare                       9.861668
Cabin                           NaN
Embarked                          S
is_Parch_0                        1
is_SibSp_0                        1
is_Embarked_S                     1
is_Embarked_Q                     0
gender                            0
is_Pclass_1                       0
is_Pclass_3                       1
is_cabin_null                     1
is_teens                          0
is_adults                         0
is_elderly                        1
is_SibSp_1                        0
is_Pclass_2                       0
is_Parch_1                        0
Name: 152, dtype: object

In [241]:
# Columns used for the Age model: Age itself (the value to predict), Fare, and the
# engineered 0/1 indicators. 'Age' must be listed here because the following cells
# split `merged` on whether Age is null.
age_features = ['Age', 'gender', 'Fare', 'is_Pclass_1', 'is_Pclass_2', 'is_Pclass_3', 'is_SibSp_0', 'is_SibSp_1', 'is_Parch_0', 'is_Parch_1', 'is_Embarked_S', 'is_Embarked_Q']
X_train = train_data[age_features]
X_test = test_data[age_features]
# Stack the train and test rows into one frame (each keeps its original row labels).
merged = pd.concat([X_train, X_test])
merged

Unnamed: 0,Age,gender,Fare,is_Pclass_1,is_Pclass_2,is_Pclass_3,is_SibSp_0,is_SibSp_1,is_Parch_0,is_Parch_1,is_Embarked_S,is_Embarked_Q
0,22.0,0,7.2500,0,0,1,0,1,1,0,1,0
1,38.0,1,71.2833,1,0,0,0,1,1,0,0,0
2,26.0,1,7.9250,0,0,1,1,0,1,0,1,0
3,35.0,1,53.1000,1,0,0,0,1,1,0,1,0
4,35.0,0,8.0500,0,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,,0,8.0500,0,0,1,1,0,1,0,1,0
414,39.0,1,108.9000,1,0,0,1,0,1,0,0,0
415,38.5,0,7.2500,0,0,1,1,0,1,0,1,0
416,,0,8.0500,0,0,1,1,0,1,0,1,0


In [242]:
# Split the stacked frame by whether Age is known.
# target_data: rows with a missing Age; source_data: rows with a known Age.
# X and Y are required by the train/test split in the next cell, so these
# assignments have to run for the notebook to work on a fresh kernel.
target_data = merged.loc[merged['Age'].isnull()]
source_data = merged.loc[merged['Age'].notnull()]
Y = source_data['Age']
X = source_data.drop('Age', axis=1)


In [243]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows of X / Y for evaluation; the fixed random_state makes the
# split repeatable. Note that this rebinds X_train / X_test, names an earlier cell
# used for the train/test feature frames.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [244]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [245]:
# Three candidate regressors, all seeded so their scores can be reproduced run to run.
gb_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
# random_state added for parity with the other two models; without it the MLP's
# weight initialisation, and therefore its error, changes on every run.
# NOTE(review): per the scikit-learn docs, learning_rate='adaptive' only takes effect
# with solver='sgd'; the default solver is used here — confirm this is intended.
MLP_regressor = MLPRegressor(activation='tanh', learning_rate='adaptive', random_state=42)
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

In [246]:
# Fit gradient boosting on the training split, then predict the held-out split.
y_pred = gb_regressor.fit(X_train, y_train).predict(X_test)

# Score the predictions against the held-out values.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 129.13774039965114


In [247]:
# Fit the MLP on the training split, then predict the held-out split.
y_pred = MLP_regressor.fit(X_train, y_train).predict(X_test)

# Score the predictions against the held-out values.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 147.12833092545685




In [248]:
# Fit the random forest on the training split, then predict the held-out split.
y_pred = rf_regressor.fit(X_train, y_train).predict(X_test)

# Compute the mean squared error against the held-out values.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 141.70209921959795
