# Cluster Titanic Data (w/o labels) into two groups

### 1. Imports:

In [1]:
import pandas as pd
import numpy as np

### 2. Data: 

In [2]:
df = pd.read_excel('titanic.xls')
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### 3. Little needed preprocessing:

In [3]:
df.drop(['body','name'], 1, inplace = True)
df.fillna(0, inplace = True)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,female,29.0,0,0,24160,211.3375,B5,S,2,"St Louis, MO"
1,1,1,male,0.9167,1,2,113781,151.55,C22 C26,S,11,"Montreal, PQ / Chesterville, ON"
2,1,0,female,2.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"
3,1,0,male,30.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"
4,1,0,female,25.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"


In [4]:
def convert_non_numeric(dataframe):
    columns = dataframe.columns.values
    for column in columns:
        digits_assigned = {}
        def convert_to_int(word):
            return digits_assigned[word]
        if dataframe[column].dtype != np.int64 and dataframe[column].dtype !=np.float64:
            column_contents = dataframe[column].values.tolist()
            unique_elements = set(column_contents)
            x=0
            for unique in unique_elements:
                if unique not in digits_assigned:
                    digits_assigned[unique] = x
                    x+=1
            dataframe[column] = list(map(convert_to_int,dataframe[column])) #maps a vector using a function  
    return dataframe

In [5]:
df = convert_non_numeric(df)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,1,29.0,0,0,764,211.3375,63,3,2,27
1,1,1,0,0.9167,1,2,523,151.55,181,3,10,11
2,1,0,1,2.0,1,2,523,151.55,181,3,0,11
3,1,0,0,30.0,1,2,523,151.55,181,3,0,11
4,1,0,1,25.0,1,2,523,151.55,181,3,0,11


### 4. Extract variable and labels

In [6]:
X = np.array(df.drop(['survived'], 1).astype(float)) #remove labels, convert to float array
y = np.array(df['survived'])

### 5. Scale X values

In [7]:
from sklearn import preprocessing
X = preprocessing.scale(X)
#print(X[1:5,:])

### 6. Fit the clustering algo on variables:

In [8]:
from sklearn.cluster import KMeans
clf = KMeans(n_clusters = 2)
clf.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

### 7. Check against labels:

In [9]:
correct = 0
for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1,len(predict_me))
    prediction = clf.predict(predict_me)
    if prediction[0] == y[i]:
        correct += 1
print(correct/len(X))


0.7639419404125286
