In [249]:
import pandas as pd
from pathlib import Path


In [250]:
from enum import Enum

class Col_name(Enum):
    PCLASS = "pclass"
    SURVIVED = "survived"
    NAME = "name"
    SEX = "sex"
    AGE = "age"
    SIBSP = "sibsp"
    PARCH = "parch"
    TICKET = "ticket"
    FARE = "fare"
    CABIN = "cabin"
    EMBARKED = "embarked"
    BOAT = "boat"
    BODY = "body"
    HOME_DEST = "home.dest"


In [251]:
path = Path('./')
df = pd.read_csv(path/'titanic.csv')
print(df.head())


   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

      age  sibsp  parch  ticket      fare    cabin embarked boat body  \
0      29      0      0   24160  211.3375       B5        S    2    ?   
1  0.9167      1      2  113781    151.55  C22 C26        S   11    ?   
2       2      1      2  113781    151.55  C22 C26        S    ?    ?   
3      30      1      2  113781    151.55  C22 C26        S    ?  135   
4      25      1      2  113781    151.55  C22 C26        S    ?    ?   

                         home.dest  
0                     St Louis, MO  
1  Mon

In [252]:
def get_column(df, c: Col_name):
    return df[c.value]


def clean_numeric_inplace(df, col: str):
    df.loc[:, col] = pd.to_numeric(df[col], errors="coerce")  # non-numeric -> NaN

In [253]:
body = get_column(df, Col_name.BODY)

print("Before cleaning:")
print(df[Col_name.BODY.value].head()[:2])

clean_numeric_inplace(df, Col_name.BODY.value)
print("\nAfter cleaning:")
print(df[Col_name.BODY.value].head()[:2])

Before cleaning:
0    ?
1    ?
Name: body, dtype: object

After cleaning:
0    NaN
1    NaN
Name: body, dtype: object


In [254]:
#clean_numeric_inplace(df, Col_name.AGE.value)
count = df[Col_name.AGE.value].value_counts()
print("Before cleaning AGE:")
print(count.head()[:2])

clean_numeric_inplace(df, Col_name.AGE.value)
count = df[Col_name.AGE.value].value_counts()
print("\nAfter cleaning AGE:")
print(count.head()[:2])

Before cleaning AGE:
age
?     263
24     47
Name: count, dtype: int64

After cleaning AGE:
age
24.0    47
22.0    43
Name: count, dtype: int64


In [255]:
clean_numeric_inplace(df, Col_name.SURVIVED.value)
count = df[Col_name.SURVIVED.value].value_counts()
print("\nSurvived counts:")
print(count)

print("------------")

count = df[Col_name.SEX.value].value_counts()
print("\nSex counts:")
print(count)

print("------------")

print("\nPclass counts:")
count = df[Col_name.PCLASS.value].value_counts()
print(count)

print("------------")

fare = get_column(df, Col_name.FARE)
print("\nFare stats:")
print(fare.describe())


print("------------")

cabin = get_column(df, Col_name.CABIN)
count = df[Col_name.CABIN.value].value_counts()
print("\nCabin stats:") # Mycket ?
print(count)


print("------------")

embarked = get_column(df, Col_name.EMBARKED)
count = df[Col_name.EMBARKED.value].value_counts()
print("\nEmbarked stats:") # 2 ?
print(count)

print("------------")
boat = get_column(df, Col_name.BOAT)
count = df[Col_name.BOAT.value].value_counts()
print("\nBoat stats:") # Many ?
print(count)

print("------------")
home_dest = get_column(df, Col_name.HOME_DEST)
count = df[Col_name.HOME_DEST.value].value_counts()
print("\nHome.dest stats:") # Many ?
print(count)




Survived counts:
survived
0    809
1    500
Name: count, dtype: int64
------------

Sex counts:
sex
male      843
female    466
Name: count, dtype: int64
------------

Pclass counts:
pclass
3    709
1    323
2    277
Name: count, dtype: int64
------------

Fare stats:
count     1309
unique     282
top       8.05
freq        60
Name: fare, dtype: object
------------

Cabin stats:
cabin
?                  1014
C23 C25 C27           6
B57 B59 B63 B66       5
G6                    5
D                     4
                   ... 
F E46                 1
F E57                 1
F E69                 1
E10                   1
F38                   1
Name: count, Length: 187, dtype: int64
------------

Embarked stats:
embarked
S    914
C    270
Q    123
?      2
Name: count, dtype: int64
------------

Boat stats:
boat
?          823
13          39
C           38
15          37
14          33
4           31
10          29
5           27
3           26
9           25
11          25
8          