In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# Load the training set and the test set (in that order) from CSV files
# located in the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Count the missing values in each column of the training set
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [4]:
# Count the missing values in each column of the test set
test.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [5]:
# Stack the training rows on top of the test rows (with a fresh 0..n-1 index)
# so that missing values can be handled on both sets in one pass.
# Test rows have no Transported value, so that column is NaN for them.
data = pd.concat((train, test), ignore_index=True)
data.isna().sum()

PassengerId        0
HomePlanet       288
CryoSleep        310
Cabin            299
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Name             294
Transported     4277
dtype: int64

In [6]:
# Show dtypes and non-null counts of the combined train+test frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  Transported   8693 non-null   object 
dtypes: float64(6), object(8)
memory usage: 1.4+ MB


In [7]:
# Preview the first rows of the combined frame
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [8]:
# Fill missing values in the continuous columns with the column mean.
# NOTE: the mean is taken over the combined train+test frame.
for numeric_col in ('Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    column_mean = data[numeric_col].mean()
    data[numeric_col] = data[numeric_col].fillna(column_mean)

In [9]:
# Re-count missing values: the numeric columns should now report 0
data.isnull().sum()

PassengerId        0
HomePlanet       288
CryoSleep        310
Cabin            299
Destination      274
Age                0
VIP              296
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Name             294
Transported     4277
dtype: int64

In [10]:
# Fill missing values in the categorical/boolean columns with the most
# frequent value of each column (first entry returned by mode()).
for categorical_col in ('Destination', 'HomePlanet', 'CryoSleep', 'VIP', 'Cabin'):
    most_common = data[categorical_col].mode()[0]
    data[categorical_col] = data[categorical_col].fillna(most_common)

In [11]:
# Re-count missing values: only Name and Transported should still have gaps
data.isnull().sum()

PassengerId        0
HomePlanet         0
CryoSleep          0
Cabin              0
Destination        0
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Name             294
Transported     4277
dtype: int64

名字（Name）对预测是否被传送没有太大帮助，因此不填补其缺失值，直接删除该列。

In [12]:
# Remove the Name column from the combined frame (in place).
# Only Name is dropped here; PassengerId is still present after this cell.
data.drop(columns=['Name'], inplace=True)

对于非数值型的数据进行编码

In [13]:
# Replace the string-valued columns with integer codes.
# A single LabelEncoder is re-fitted for each column in turn, so after this
# cell `le` only remembers the classes of the last column (Cabin).
le = LabelEncoder()
for text_col in ('Destination', 'HomePlanet', 'Cabin'):
    data[text_col] = le.fit_transform(data[text_col])

data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,1,False,208,2,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,0,False,3241,2,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,1,False,1,2,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,1,False,1,2,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,0,False,3243,2,16.0,False,303.0,70.0,151.0,565.0,2.0,True


把取值为 True/False 的 VIP 和 CryoSleep 两列转换为 0/1（第三个布尔列 Transported 在后面拆分出训练集之后再转换）。

In [15]:
# Convert the True/False flag columns to 1/0
bool_to_int = {True: 1, False: 0}
for flag_col in ('VIP', 'CryoSleep'):
    data[flag_col] = data[flag_col].map(bool_to_int)

In [16]:
# Check dtypes after encoding: only PassengerId and Transported remain object
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12970 non-null  int64  
 2   CryoSleep     12970 non-null  int64  
 3   Cabin         12970 non-null  int64  
 4   Destination   12970 non-null  int64  
 5   Age           12970 non-null  float64
 6   VIP           12970 non-null  int64  
 7   RoomService   12970 non-null  float64
 8   FoodCourt     12970 non-null  float64
 9   ShoppingMall  12970 non-null  float64
 10  Spa           12970 non-null  float64
 11  VRDeck        12970 non-null  float64
 12  Transported   8693 non-null   object 
dtypes: float64(6), int64(5), object(2)
memory usage: 1.3+ MB


In [17]:
# Write the processed data back into separate train / test frames.
# The first `n_train` rows of `data` are the original training rows because
# the frames were concatenated as [train, test].
# The row count is captured before `train` is rebound, and explicit copies are
# taken so that later column drops/assignments on `train` and `test` act on
# independent frames rather than on slices of `data`.
n_train = train.shape[0]
train = data.iloc[:n_train].copy()
test = data.iloc[n_train:].copy()

In [18]:
# Preview the processed training rows
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,1,0,208,2,39.0,0,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,0,0,3241,2,24.0,0,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,1,0,1,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,1,0,1,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,0,0,3243,2,16.0,0,303.0,70.0,151.0,565.0,2.0,True


In [19]:
# Show dtypes and non-null counts of the processed training set
train.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   int64  
 2   CryoSleep     8693 non-null   int64  
 3   Cabin         8693 non-null   int64  
 4   Destination   8693 non-null   int64  
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   int64  
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Transported   8693 non-null   object 
dtypes: float64(6), int64(5), object(2)
memory usage: 883.0+ KB


In [20]:
# Count missing values in the processed test set; Transported is entirely
# missing here because the test rows carry no label
test.isnull().sum()

PassengerId        0
HomePlanet         0
CryoSleep          0
Cabin              0
Destination        0
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Transported     4277
dtype: int64

PassengerId 显然对预测也没有什么帮助，同样去掉。

In [21]:
# PassengerId is an identifier rather than a predictive feature, so remove it
# from both frames.
# The frames are reassigned instead of being modified in place, and
# errors='ignore' lets the cell be re-run without raising KeyError once the
# column is already gone.
test = test.drop(columns=['PassengerId'], errors='ignore')
train = train.drop(columns=['PassengerId'], errors='ignore')

In [22]:
# Show dtypes and non-null counts of the test set after dropping PassengerId
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 8693 to 12969
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   int64  
 1   CryoSleep     4277 non-null   int64  
 2   Cabin         4277 non-null   int64  
 3   Destination   4277 non-null   int64  
 4   Age           4277 non-null   float64
 5   VIP           4277 non-null   int64  
 6   RoomService   4277 non-null   float64
 7   FoodCourt     4277 non-null   float64
 8   ShoppingMall  4277 non-null   float64
 9   Spa           4277 non-null   float64
 10  VRDeck        4277 non-null   float64
 11  Transported   0 non-null      object 
dtypes: float64(6), int64(5), object(1)
memory usage: 401.1+ KB


In [23]:
# Show dtypes and non-null counts of the training set after dropping PassengerId
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   int64  
 1   CryoSleep     8693 non-null   int64  
 2   Cabin         8693 non-null   int64  
 3   Destination   8693 non-null   int64  
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   int64  
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Transported   8693 non-null   object 
dtypes: float64(6), int64(5), object(1)
memory usage: 815.1+ KB


把训练集里面的transported改成0 1 

In [24]:
# Encode the target as integer 0/1.
# The mapping is applied to train's own column (fully populated) rather than to
# the combined frame, whose NaN test rows would force a float result. The
# explicit int cast fails loudly if any value was left unmapped.
train['Transported'] = train['Transported'].map({True: 1, False: 0}).astype(int)

In [25]:
# Separate the target from the features, then hold out 30% of the labelled
# rows for validation (fixed random_state for a repeatable split).
y = train['Transported']
X = train.drop(columns=['Transported'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

用 LazyPredict 把所有分类器都跑一遍

In [None]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

# Benchmark every classifier LazyPredict knows about on the hold-out split.
lazy_options = {
    'verbose': 0,
    'ignore_warnings': True,
    'custom_metric': None,
    'predictions': False,
    'random_state': 12,
    'classifiers': 'all',
}
clf = LazyClassifier(**lazy_options)

# `models` holds the per-model results shown in the next cell.
models, predictions = clf.fit(X_train, X_test, y_train, y_test)


In [None]:
# Display the LazyPredict results
models

In [None]:
# Train a CatBoost classifier on the training split and predict the hold-out split.
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    roc_auc_score,
    roc_curve,
)

catboost_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=True,
)
model = CatBoostClassifier(**catboost_params)
model.fit(X_train, y_train)

# Hard class predictions for the hold-out rows; reused by the metric cells below.
y_pred = model.predict(X_test)

In [None]:
# Accuracy of the current y_pred on the hold-out split
accuracy_score(y_test,y_pred)

In [None]:
# Evaluate the fitted classifier on the hold-out split.
# Threshold metrics (confusion matrix, accuracy, F1) use the hard predictions in
# y_pred. Ranking metrics (ROC AUC, ROC curve) need a continuous score, so the
# predicted probability of class 1 is used; feeding 0/1 labels to roc_curve
# collapses the curve to a single operating point.
y_score = model.predict_proba(X_test)[:, 1]

# Every metric is printed explicitly: a notebook cell only auto-displays its
# last expression, so bare metric calls in the middle of the cell show nothing.
print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))
print('ROC AUC :', roc_auc_score(y_test, y_score))

# ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_score)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC')
ax.plot([0, 1], [0, 1], linestyle='--', label='Chance')
ax.set(xlabel='False positive rate', ylabel='True positive rate', title='ROC curve')
ax.legend()
plt.show()

用 LazyPredict 结果中表现最好的 LightGBM 来跑一下

In [None]:
# Train a LightGBM classifier on the training split and predict the hold-out split.
# NOTE: this rebinds `model` and `y_pred`, replacing the CatBoost results.
from lightgbm import LGBMClassifier

lgbm_params = dict(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    verbose=1,
)
model = LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [None]:
# Accuracy on the hold-out split (y_pred was last assigned by the LightGBM cell)
accuracy_score(y_test,y_pred)

In [None]:
# F1 score on the hold-out split (y_pred was last assigned by the LightGBM cell)
f1_score(y_test,y_pred)

把 LazyPredict 得到的各模型准确率可视化

In [None]:
# `px` was used in this cell without ever being imported, which raises
# NameError on a fresh kernel; import Plotly Express before using it.
import plotly.express as px

# Line chart of the Accuracy column of the LazyPredict results, one point per
# row of `models`.
line = px.line(data_frame= models ,y =["Accuracy"] , markers = True)
line.update_xaxes(title="Model",
              rangeslider_visible = False)
line.update_yaxes(title = "Accuracy")
line.update_traces(line_color="red")
line.update_layout(showlegend = True,
    title = {
        'text': 'Accuracy vs Model',
        'y':0.94,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})

line.show()

CatBoost 表现得比 LGBM 好

In [None]:
# Final CatBoost model used to predict the competition test set.
# Same hyper-parameters as the model evaluated earlier, but fitted on all
# labelled rows (X, y) instead of only the 70% training split: the hold-out
# split was needed for evaluation only, and refitting on X_train would merely
# duplicate the evaluated model while discarding 30% of the labels.
model_cat = CatBoostClassifier(iterations=1000,
                            learning_rate=0.1,
                            depth=6,
                            loss_function='Logloss',
                            verbose=True)

model_cat.fit(X, y)

**SUBMISSION**

In [None]:
# Build the submission file.
# NOTE(review): the template is read from, and the result written back to, the
# same 'submission.csv'; confirm this file is the competition's sample
# submission with rows in the same order as test.csv.
submission = pd.read_csv('submission.csv')

# `test` still carries the all-NaN Transported column inherited from the
# combined frame, so select exactly the feature columns used for training (in
# the same order) before predicting.
test_features = test[X_train.columns]

submission['Transported'] = model_cat.predict(test_features)
# Convert the 0/1 predictions back to booleans, matching the original label format
submission['Transported'] = submission['Transported'].map({1:True,0:False})
submission.to_csv('submission.csv',index=False)