# Preliminary Data Analysis

In [166]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [167]:
training_set_location = "train.csv"
train_raw = pd.read_csv(training_set_location)

test_set_location = "test.csv"
test_raw = pd.read_csv(test_set_location)

train_raw.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


We explain the columns of the dataset

- Survived if passenger survives
- PClass Ticket class = 1, 2, 3
- Sex = male/female
- Age = age in years, 0.5 if estimate
- Sibsp = siblings/spouses, Parch = parents/children
- ticket = ticket number
- Passenger fare
- Cabin number
- Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

## Missing values
We analyze the missing values in the training data.

In [169]:
train_raw.notna().all() # the following code indicates that only Age, Cabin, and Embarked have missing values

PassengerId     True
Survived        True
Pclass          True
Name            True
Sex             True
Age            False
SibSp           True
Parch           True
Ticket          True
Fare            True
Cabin          False
Embarked       False
dtype: bool

In [170]:
print(f"Number of passengers without known age: {len(train_raw.loc[train_raw["Age"].isna()])} of {len(train_raw)}")
train_raw.loc[train_raw["Age"].isna()]

Number of passengers without known age: 177 of 891


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


The following code deals with the columns with NaNs

We now specify how we deal with missing values:
- We drop the cabin number, temporarily. This column isn't clean, some passengers have multiple.
- We impute the Age with the average age. We add a binary column, AgeKnown, if the age is exactly known.
- We create a new category for Embarked, M = missing

In [171]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

numeric_features = ["Age", "Fare", "SibSp", "Parch", "Fare"]
categorical_features = ["Sex", "Embarked", "AgeKnown"]

train_raw["AgeKnown"] = ((train_raw["Age"] % 1.0) == 0.0)|(train_raw["Age"] < 1.0)
test_raw["AgeKnown"] = ((test_raw["Age"] % 1.0) == 0.0)|(test_raw["Age"] < 1.0)


numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="M")),
    ("encoder", OneHotEncoder(drop="first"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


## Derived features
In this section, we explain certain derived features

- Add .5 category, as this indicates uncertainty in the age

## Logistic Regression Model

In this section, we pass our transformed data through a logistic regression model to predict whether someone survived

In [172]:
from sklearn.linear_model import LogisticRegression

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

X_train = train_raw.drop(columns=["PassengerId", "Survived", "Name", "Ticket", "Cabin"])
y_train = train_raw["Survived"]
X_test = test_raw.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

In [173]:
clf.fit(X_train, y_train)

logreg = clf.named_steps["model"]
feature_names = clf.named_steps["preprocessor"].get_feature_names_out()

coef_df = pd.DataFrame({
    "feature": feature_names,
    "coefficient": logreg.coef_[0]
})
print(coef_df)



              feature  coefficient
0            num__Age    -0.261130
1           num__Fare     0.350246
2          num__SibSp    -0.411786
3          num__Parch    -0.191910
4           num__Fare     0.350246
5       cat__Sex_male    -2.525008
6     cat__Embarked_M     0.172091
7     cat__Embarked_Q    -0.328914
8     cat__Embarked_S    -0.562114
9  cat__AgeKnown_True     0.607702


In [174]:
y_pred = pd.Series(clf.predict(X_test))
pred_df = pd.DataFrame({"PassengerId": test_raw["PassengerId"], "Survived": y_pred})

In [175]:
pred_df.to_csv("submission.csv", index=False)
pred_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [176]:
!ls

ModuleNotFoundError: No module named 'pexpect'