# Preprocessing(Imputation and Encoding)

In [5]:
import torch
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [7]:
raw_train = pd.read_csv("train.csv")
raw_test = pd.read_csv("test.csv")
raw_gen_sub = pd.read_csv("gender_submission.csv")

In [8]:
raw_train.shape

(891, 12)

In [9]:
numeric_cols = list(raw_train.select_dtypes(include=np.number).keys())

In [10]:
object_cols = list(raw_train.select_dtypes(include=np.object_).keys())

In [11]:
numeric_cols,object_cols

(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
 ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'])

In [12]:
age_mean = raw_train["Age"].mean().item()

In [13]:
raw_train["Age"].fillna(age_mean,inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_train["Age"].fillna(age_mean,inplace = True)


In [14]:
raw_train[numeric_cols].isna().sum().sum()

np.int64(0)

In [15]:
raw_train[object_cols].isna().sum()

Name          0
Sex           0
Ticket        0
Cabin       687
Embarked      2
dtype: int64

In [16]:
#Cabin imputing
cabin_mode = raw_train["Cabin"].describe().top

#Embarked imputing
embarked_mode =  raw_train["Embarked"].describe().top

In [17]:
#Imputation for cabin and embarked
raw_train["Cabin"].fillna(cabin_mode,inplace=True)
raw_train["Embarked"].fillna(embarked_mode,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_train["Cabin"].fillna(cabin_mode,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_train["Embarked"].fillna(embarked_mode,inplace=True)


In [18]:
raw_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [19]:
input_new_cols = ['Pclass','female','male','Age',"SibSp",'Parch','C','Q','S']
input_cols = ['Pclass','Sex','Age',"SibSp",'Parch','Embarked']

In [20]:
input_train = raw_train[input_cols]

In [21]:
embark_encoder = OneHotEncoder()

In [22]:
embark_encoder_cols = embark_encoder.fit_transform(input_train[["Embarked"]])

In [23]:
raw_train[['C','Q','S']] = embark_encoder_cols.toarray()

In [24]:
raw_train.drop("Embarked",inplace=True,axis=1)

In [25]:
gen_encoder = OneHotEncoder()

In [26]:
gen_encoder_cols = gen_encoder.fit_transform(input_train[["Sex"]])

In [27]:
raw_train[['female','male']] = gen_encoder_cols.toarray()

In [28]:
raw_train.drop("Sex",axis=1,inplace = True)

# Model training

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
model1 = LogisticRegression()

In [31]:
model1.fit(raw_train[input_new_cols],raw_train["Survived"])

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
model1.coef_

array([[-1.14366078,  1.27487606, -1.39223115, -0.03990982, -0.31690404,
        -0.06331399,  0.15024382,  0.0554748 , -0.32307371]])

In [33]:
raw_test[['Pclass',"Sex",'Age',"SibSp",'Parch',"Embarked"]].isna().sum().sum()

np.int64(86)

# Test set preprocessing

In [34]:
test_gen = raw_test["Sex"].describe().top
test_age = np.round(raw_test["Age"].mean().item()).item()
test_embarked = raw_test["Embarked"].describe().top

In [35]:
raw_test["Sex"].fillna(test_gen,inplace = True)
raw_test["Age"].fillna(test_age,inplace = True)
raw_test["Embarked"].fillna(test_embarked,inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_test["Sex"].fillna(test_gen,inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_test["Age"].fillna(test_age,inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [36]:
gen_cols = gen_encoder.transform(raw_test[["Sex"]])
embarked_cols = embark_encoder.transform(raw_test[["Embarked"]])

In [37]:
raw_test[['female','male']] = gen_cols.toarray()
raw_test[['C','Q','S']] = embarked_cols.toarray()

In [38]:
raw_test.drop(['Sex','Embarked'],axis = 1,inplace=True)

In [39]:
pred1 = model1.predict(raw_test[input_new_cols])

In [40]:
submission1 = pd.DataFrame({
    "PassengerId":raw_test["PassengerId"],
    "Survived":pred1
})

In [41]:
submission1.to_csv("submission1.csv",index=False)


# Improving the model 

In [42]:
from xgboost import XGBClassifier

In [43]:
xgb_class = XGBClassifier()
xgb_class.fit(raw_train[input_new_cols],raw_train["Survived"])

In [44]:
pred2 = xgb_class.predict(raw_test[input_new_cols])

In [45]:
submission2 = pd.DataFrame({
    "PassengerId":raw_test["PassengerId"],
    "Survived":pred2
})

In [46]:
submission2.to_csv("submission2.csv",index=False)

In [48]:
train_new = raw_train.copy()[input_new_cols]
train_new["Survived"] = raw_train.copy()["Survived"]

In [51]:
test_new = raw_test.copy()[input_new_cols]

In [52]:
train_new.to_csv("train_new.csv",index=False)
test_new.to_csv("test_new.csv",index=False)