In [1]:
import pandas as pd

In [2]:
# Read the training and test splits from CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Display the column labels of the training frame (read-only; nothing is modified).
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Categorical features; missing values are filled with the 'most_frequent' strategy.
cat_columns = [
    'Pclass',
    'Sex',
    'Embarked',
]
# Numeric features; missing values are filled with the 'median' strategy.
numeric_columns = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

In [5]:
from sklearn.impute import SimpleImputer
# One imputer per column group: median for numeric features, mode for categorical ones.
imputer_numeric = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

# Fit each imputer on the training split only, then apply the learned fill
# values to both splits so the test data never influences the statistics.
for fitted_imputer, group_columns in (
    (imputer_numeric, numeric_columns),
    (imputer_cat, cat_columns),
):
    fitted_imputer.fit(df_train[group_columns])
    for frame in (df_train, df_test):
        frame[group_columns] = fitted_imputer.transform(frame[group_columns])

In [6]:
# Per-column count of missing values remaining in the training frame.
pd.isna(df_train).sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [7]:
# Per-column count of missing values remaining in the test frame.
pd.isna(df_test).sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

# Encoding: One-Hot

In [8]:
from sklearn.preprocessing import OneHotEncoder
# Learn the category vocabulary from the training split only.
ohe = OneHotEncoder().fit(df_train[cat_columns])
# Echo the fitted encoder so the cell still displays its repr.
ohe

OneHotEncoder()

In [9]:
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so calling it raises AttributeError on current releases.
# Rebuild the same legacy "x<feature index>_<category>" names from the fitted
# encoder's categories_ attribute, which exists on old and new versions alike.
# The names are displayed as a plain list rather than an object array.
[
    "x%d_%s" % (feature_idx, category)
    for feature_idx, categories in enumerate(ohe.categories_)
    for category in categories
]



array(['x0_1', 'x0_2', 'x0_3', 'x1_female', 'x1_male', 'x2_C', 'x2_Q',
       'x2_S'], dtype=object)

In [10]:
# Names for the one-hot columns, in the encoder's output order. Built from
# categories_ because OneHotEncoder.get_feature_names() was removed in
# scikit-learn 1.2; the "x<feature index>_<category>" pattern matches what that
# method used to return, so the resulting column names are unchanged.
ohe_columns = [
    "x%d_%s" % (feature_idx, category)
    for feature_idx, categories in enumerate(ohe.categories_)
    for category in categories
]

# transform() returns a sparse matrix; densify it before assigning as columns.
df_train[ohe_columns] = ohe.transform(df_train[cat_columns]).toarray()
df_test[ohe_columns] = ohe.transform(df_test[cat_columns]).toarray()



In [11]:
# Model input columns: numeric features followed by the one-hot columns.
# One-hot names are derived from categories_ because
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2; the
# "x<feature index>_<category>" pattern reproduces the names it returned.
feature_columns = numeric_columns + [
    "x%d_%s" % (feature_idx, category)
    for feature_idx, categories in enumerate(ohe.categories_)
    for category in categories
]
df_train[feature_columns]



Unnamed: 0,Age,SibSp,Parch,Fare,x0_1,x0_2,x0_3,x1_female,x1_male,x2_C,x2_Q,x2_S
0,22.0,1.0,0.0,7.2500,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,26.0,0.0,0.0,7.9250,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,35.0,1.0,0.0,53.1000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,35.0,0.0,0.0,8.0500,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0.0,0.0,13.0000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
887,19.0,0.0,0.0,30.0000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
888,28.0,1.0,2.0,23.4500,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
889,26.0,0.0,0.0,30.0000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


# Scaling

In [12]:
from sklearn.preprocessing import MinMaxScaler
# Learn per-column min/max from the training split only, then rescale the
# numeric columns of both splits with those same bounds.
scaler = MinMaxScaler()
scaler.fit(df_train[numeric_columns])
for frame in (df_train, df_test):
    frame[numeric_columns] = scaler.transform(frame[numeric_columns])

In [13]:
# Summarise dtypes and non-null counts of the training feature matrix.
# One-hot names are derived from categories_ because
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2; the
# "x<feature index>_<category>" pattern reproduces the names it returned.
feature_columns = numeric_columns + [
    "x%d_%s" % (feature_idx, category)
    for feature_idx, categories in enumerate(ohe.categories_)
    for category in categories
]
df_train[feature_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        891 non-null    float64
 1   SibSp      891 non-null    float64
 2   Parch      891 non-null    float64
 3   Fare       891 non-null    float64
 4   x0_1       891 non-null    float64
 5   x0_2       891 non-null    float64
 6   x0_3       891 non-null    float64
 7   x1_female  891 non-null    float64
 8   x1_male    891 non-null    float64
 9   x2_C       891 non-null    float64
 10  x2_Q       891 non-null    float64
 11  x2_S       891 non-null    float64
dtypes: float64(12)
memory usage: 83.7 KB




In [14]:
# Summarise dtypes and non-null counts of the test feature matrix.
# One-hot names are derived from categories_ because
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2; the
# "x<feature index>_<category>" pattern reproduces the names it returned.
feature_columns = numeric_columns + [
    "x%d_%s" % (feature_idx, category)
    for feature_idx, categories in enumerate(ohe.categories_)
    for category in categories
]
df_test[feature_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        418 non-null    float64
 1   SibSp      418 non-null    float64
 2   Parch      418 non-null    float64
 3   Fare       418 non-null    float64
 4   x0_1       418 non-null    float64
 5   x0_2       418 non-null    float64
 6   x0_3       418 non-null    float64
 7   x1_female  418 non-null    float64
 8   x1_male    418 non-null    float64
 9   x2_C       418 non-null    float64
 10  x2_Q       418 non-null    float64
 11  x2_S       418 non-null    float64
dtypes: float64(12)
memory usage: 39.3 KB




# Model Building

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
# Model input columns: numeric features followed by the one-hot columns.
# One-hot names are derived from categories_ because
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2; the
# "x<feature index>_<category>" pattern reproduces the names it returned.
feature_columns = numeric_columns + [
    "x%d_%s" % (feature_idx, category)
    for feature_idx, categories in enumerate(ohe.categories_)
    for category in categories
]
X_train = df_train[feature_columns]
y_train = df_train['Survived']

model = LogisticRegression()
model.fit(X_train, y_train)
# Accuracy measured on the same rows the model was fitted on, so this is a
# training score rather than a held-out estimate.
model.score(X_train, y_train)



0.8002244668911336

In [17]:
# Model input columns, in the same order used for fitting: numeric features
# followed by the one-hot columns. One-hot names are derived from categories_
# because OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2;
# the "x<feature index>_<category>" pattern reproduces the names it returned.
feature_columns = numeric_columns + [
    "x%d_%s" % (feature_idx, category)
    for feature_idx, categories in enumerate(ohe.categories_)
    for category in categories
]
# Predicted class label for every row of the test split.
yp = model.predict(df_test[feature_columns])
yp



array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [18]:
# Store the predicted labels alongside the test rows under the 'Survived' column.
df_test.loc[:, 'Survived'] = yp

In [19]:
# Write the passenger id and predicted label to the submission file,
# omitting the DataFrame index.
submission = df_test[['PassengerId', 'Survived']]
submission.to_csv('sub_std.csv', index=False)