In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from sklearn import preprocessing

In [2]:
"""
pclass : passenger class (1,2,3)
survived : (1-yes, 0-no)
name : name of passenger
sex : sex of passenger
age : age of passenger
sibsp : no of siblings
parch : no parents/chidren
ticket: ticket no
fare : fare
cabin : cabinno
embarked : port of embarkment (C- cherbourg, Q-queenstown, S-southampton)
boat : lifeboat
body: body identification no
home.dest : home/destination
"""
df = pd.read_excel('titanic.xls')
df_original = pd.DataFrame.copy(df)
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
df.drop(['body','name'],axis=1,inplace=True)

In [4]:
df.convert_objects(convert_numeric=True)
df.fillna(0,inplace=True)
df.head(5)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """Entry point for launching an IPython kernel.


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,female,29.0,0,0,24160,211.3375,B5,S,2,"St Louis, MO"
1,1,1,male,0.9167,1,2,113781,151.55,C22 C26,S,11,"Montreal, PQ / Chesterville, ON"
2,1,0,female,2.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"
3,1,0,male,30.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"
4,1,0,female,25.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"


In [5]:
def handle_non_numeric(df):
    columns = df.columns.values
    for column in columns:
        text_digit_values = {}
        def convert_to_int(val):
            return text_digit_values[val]
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x=0
            for unique in unique_elements:
                text_digit_values[unique] = x
                x+=1
            df[column] = list(map(convert_to_int,df[column]))
    return df

In [6]:
df_numeric = handle_non_numeric(df)
df_numeric.head(5)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,0,29.0,0,0,769,211.3375,177,3,1,76
1,1,1,1,0.9167,1,2,516,151.55,42,3,23,146
2,1,0,0,2.0,1,2,516,151.55,42,3,0,146
3,1,0,1,30.0,1,2,516,151.55,42,3,0,146
4,1,0,0,25.0,1,2,516,151.55,42,3,0,146


In [7]:
y = np.array(df_numeric.survived)
x = np.array(df_numeric.drop(['survived'],1)).astype('float')
x=preprocessing.scale(x)
print(x.shape,y.shape)

(1309, 11) (1309,)


In [17]:
from sklearn.cluster import MeanShift
clf = MeanShift()
clf.fit(x)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [18]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_
n_clusters = len(cluster_centers)
df_original['cluster_group']= np.NaN

In [19]:
for i in range(len(x)):
    df_original['cluster_group'].iloc[i] = labels[i]
df_original.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cluster_group
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO",0.0
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",0.0
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0.0
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",0.0
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0.0


In [20]:
survival_rates = {}
for i in range(n_clusters):
    temp_df = df_original[df_original['cluster_group']==float(i)]
    survival_cluster = temp_df[temp_df['survived']==1]
    survival_rate = len(survival_cluster)/len(temp_df)
    survival_rates[i] = survival_rate

In [21]:
print(survival_rates)

{0: 0.38177533385703066, 1: 0.0625, 2: 0.8666666666666667, 3: 0.0}
