In [1]:
import pandas as pd
from ydata_profiling import ProfileReport
import numpy as np
train = pd.read_csv("train.csv")

In [2]:
todrop = ['PassengerId','Ticket','Name','Cabin']
for i in train.columns:
    if i in todrop:
        train = train.drop(i,axis=1)

In [3]:
# profile = ProfileReport(train, title="Profiling Report",html = {'style':{'full_width':True}})
# profile.to_file("output.html")

In [4]:
train = train.dropna(subset=['Embarked'])
train['Age'] = train['Age'].fillna(train['Age'].median())

In [5]:
train = train.drop_duplicates(keep='first')
train.shape

(773, 8)

In [6]:
train['Survived'].value_counts()

Survived
0    455
1    318
Name: count, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lab_enc = LabelEncoder()
train['Sex_encoded'] = lab_enc.fit_transform(train['Sex'])
train['Embarked_encoded'] = lab_enc.fit_transform(train['Embarked'])
train = train.drop(columns=['Embarked','Sex'])

In [8]:
column_to_move = 'Survived'
new_column_order = [col for col in train.columns if col != column_to_move] + [column_to_move]
train = train[new_column_order]
train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_encoded,Embarked_encoded,Survived
0,3,22.0,1,0,7.25,1,2,0
1,1,38.0,1,0,71.2833,0,0,1
2,3,26.0,0,0,7.925,0,2,1
3,1,35.0,1,0,53.1,0,2,1
4,3,35.0,0,0,8.05,1,2,0


In [9]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
X = train.drop("Survived", axis=1)
y = train["Survived"]
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X, y)

In [10]:
from sklearn.preprocessing import StandardScaler

index_column = X_resampled.index


scaler = StandardScaler()
data_standardized = scaler.fit_transform(X_resampled)


data_standardized_df = pd.DataFrame(data_standardized, columns=X_resampled.columns, index=index_column)


print(data_standardized_df)

       Pclass       Age     SibSp     Parch      Fare  Sex_encoded  \
0    0.932161 -0.514085  0.482012 -0.541010 -0.533986     0.850579   
1   -1.388677  0.635988  0.482012 -0.541010  0.588695    -1.175670   
2    0.932161 -0.226567 -0.557397 -0.541010 -0.522152    -1.175670   
3   -1.388677  0.420349  0.482012 -0.541010  0.269892    -1.175670   
4    0.932161  0.420349 -0.557397 -0.541010 -0.519960     0.850579   
..        ...       ...       ...       ...       ...          ...   
905 -1.388677 -0.370326 -0.557397 -0.541010  0.206847    -1.175670   
906  0.932161  0.204711 -0.557397 -0.541010 -0.522152     0.850579   
907  0.932161 -0.154687 -0.557397  1.814583 -0.465901    -1.175670   
908  0.932161  0.492229  0.482012 -0.541010 -0.356029    -1.175670   
909 -1.388677  0.851627 -0.557397 -0.541010  1.697060    -1.175670   

     Embarked_encoded  
0            0.604594  
1           -1.835231  
2            0.604594  
3            0.604594  
4            0.604594  
..             

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_standardized_df, y_resampled, test_size=0.33, random_state=42)


In [12]:
from xgboost import XGBClassifier
model = XGBClassifier()  
model.fit(X_train, y_train)
model.score(X_test,y_test)

0.8338870431893688

In [13]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [15]:
PId = test['PassengerId']
test = test.drop(columns=todrop,axis=1)
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [16]:
PId

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [17]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [18]:
test['Sex_encoded'] = lab_enc.fit_transform(test['Sex'])
test = test.drop(columns=['Sex'])
test['Embarked_encoded'] = lab_enc.fit_transform(test['Embarked'])
test = test.drop(columns=['Embarked'])

In [19]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Pclass            418 non-null    int64  
 1   Age               332 non-null    float64
 2   SibSp             418 non-null    int64  
 3   Parch             418 non-null    int64  
 4   Fare              417 non-null    float64
 5   Sex_encoded       418 non-null    int32  
 6   Embarked_encoded  418 non-null    int32  
dtypes: float64(2), int32(2), int64(3)
memory usage: 19.7 KB


In [20]:
test = test.fillna(test.median())

In [21]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Pclass            418 non-null    int64  
 1   Age               418 non-null    float64
 2   SibSp             418 non-null    int64  
 3   Parch             418 non-null    int64  
 4   Fare              418 non-null    float64
 5   Sex_encoded       418 non-null    int32  
 6   Embarked_encoded  418 non-null    int32  
dtypes: float64(2), int32(2), int64(3)
memory usage: 19.7 KB


In [22]:
test_sd = scaler.fit_transform(test)
pred = model.predict(test_sd)
pred

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [23]:
PId

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [24]:
sur = pd.DataFrame({'Survived': pred})
PId = pd.DataFrame({'PassengerId': PId})

In [25]:
result = PId.merge(sur, left_index=True, right_index=True)
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [26]:
result.to_csv("submission.csv",index=False)