* Here, we will use the Mean Shift algorithm on our titanic dataset. 
* Our aim is to classify people that were on the titanic based on their survival rate.
* A survival rate of 1 means survived and 0 means didn't survive

## Importing necessary libraries

In [52]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import MeanShift
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


## Importing our dataset
* Make sure your dataset is in the same folder with your jupyter notebook if you want to import it as done in this notebook.

In [53]:
data = pd.read_excel('titanic.xls')
original_data = pd.DataFrame.copy(data)

* Let's see what our data looks like

In [54]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [55]:
data.describe(include="all")

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
count,1309.0,1309.0,1309,1309,1046.0,1309.0,1309.0,1309,1308.0,295,1307,486.0,121.0,745
unique,,,1307,2,,,,939,,186,3,28.0,,369
top,,,"Connolly, Miss. Kate",male,,,,CA. 2343,,C23 C25 C27,S,13.0,,"New York, NY"
freq,,,2,843,,,,11,,6,914,39.0,,64
mean,2.294882,0.381971,,,29.881135,0.498854,0.385027,,33.295479,,,,160.809917,
std,0.837836,0.486055,,,14.4135,1.041658,0.86556,,51.758668,,,,97.696922,
min,1.0,0.0,,,0.1667,0.0,0.0,,0.0,,,,1.0,
25%,2.0,0.0,,,21.0,0.0,0.0,,7.8958,,,,72.0,
50%,3.0,0.0,,,28.0,0.0,0.0,,14.4542,,,,155.0,
75%,3.0,1.0,,,39.0,1.0,0.0,,31.275,,,,256.0,


# Handling Non Numeric Data
* We want to use this data to train our model but we  can see that we have some non numerical variables that are actually important features e.g. Sex, and our ML algorithm can only work with numerical data. So what do we do

* You take the column with the non numerical data, then put it in a list, then take the set of the list, the set is taking the unique values in the list, in the Sex column we have 2 unique values: Male and Female, then we assign a unique numerical id to each unique value e.g. Male=0, Female=1 if we had 3rd Unique value e.g Other=2....

* If you have a lot of unique values in a column, you will also have a lot of unique numerical ids and this you will have outliers too which will cause trouble. If you do preprocessing though you will be fine

In [56]:
#Dropping the body and name features
data.drop(['body', 'name'], 1, inplace=True) 

#Filling all missing data with 0
data._convert(numeric=True)
data.fillna(0, inplace=True, )

* Function for handling non-numeric data

In [57]:
def handle_non_numerical_data(data):
    """A simple function for handling non numeric data in our titanic dataset"""
    columns = data.columns.values #This just picks each column in our dataset and stores them in a list
    
    for  column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        if data[column].dtype != np.int64 and data[column].dtype != np.float64:
            column_contents = data[column].values.tolist()
            unique_elements = set(column_contents)  #All the unique non repititive values in the non numerical column
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
            data[column] = list(map(convert_to_int, data[column]))
    return data

data = handle_non_numerical_data(data)
data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,0,29.0,0,0,732,211.3375,88,1,2,251
1,1,1,1,0.9167,1,2,520,151.55,158,1,25,4
2,1,0,0,2.0,1,2,520,151.55,158,1,0,4
3,1,0,1,30.0,1,2,520,151.55,158,1,0,4
4,1,0,0,25.0,1,2,520,151.55,158,1,0,4


* We will be dropping the "boat" feature now

In [58]:
data.drop(['boat'], 1, inplace=True)
X = np.array(data.drop(['survived'], 1).astype(float) ) #Selecting the features that will be used on our model
X = preprocessing.scale(X)
y = np.array(data['survived'])

## Modelling 

In [59]:
clf = MeanShift()
clf.fit(X)

labels = clf.labels_
cluster_centers = clf.cluster_centers_

#NOw we will add a new column to our original dataframe
original_data['cluster_group'] = np.nan

#Now we will iterate throught the labels, and then we will populate new column above with the values of those labels
for i in range(len(X)):
    original_data['cluster_group'].iloc [i] = labels[i] #here with iloc we are referencing each row in the column 'cluster_group'
    #and we are setting the value of each row as the value of the label associated with it, it starts from i = 0
    
n_clusters_ = len(np.unique(labels)) #the number of unique values we have is the number of clusters we have.

survival_rates = {}
for i in range (n_clusters_):
    temp_data = original_data[ (original_data['cluster_group'] == float(i)) ] #We created a new dataframe where the values in it
    #signify the clusters
    survival_cluster = temp_data[(temp_data['survived']==1)]  #this dataframe is a one in which survived = 1
    survival_rate = len(survival_cluster)/len(temp_data)
    survival_rates[i] = survival_rate
    
print(survival_rates)    
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


{0: 0.375, 1: 1.0, 2: 0.1111111111111111, 3: 0.6333333333333333}


# NOTE: The number of clusters you have might be differ.

* We can see above that we have 4 clusters with thier corresponding survival rates (which is our label), cluste 0, cluster 1, cluster 2 and cluster 3

* Cluster2 probably contains people from pclass=3 because they had the worst survival rate, and 
* cluster1 and cluster 3 probably contains people from pclass=1, as we know from previous analysis, they had the best survival rate. The other cluster is probably a mixture of the pclass

### Let's see what cluster 2 looks like

In [70]:
original_data[(original_data['cluster_group']==1)].head() 

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cluster_group
35,1,1,"Bowen, Miss. Grace Scott",female,45.0,0,0,PC 17608,262.375,,C,4,,"Cooperstown, NY",1.0
49,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C,3,,"Austria-Hungary / Germantown, Philadelphia, PA",1.0
50,1,1,"Cardeza, Mrs. James Warburton Martinez (Charlo...",female,58.0,0,1,PC 17755,512.3292,B51 B53 B55,C,3,,"Germantown, Philadelphia, PA",1.0
66,1,1,"Chaudanson, Miss. Victorine",female,36.0,0,0,PC 17608,262.375,B61,C,4,,,1.0
183,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C,3,,,1.0


In [71]:
original_data[(original_data['cluster_group']==1)].describe() 

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,6.0,6.0,6.0,6.0,6.0,6.0,0.0,6.0
mean,1.0,1.0,40.833333,0.0,0.333333,429.011133,,1.0
std,0.0,0.0,9.239408,0.0,0.516398,129.075794,,0.0
min,1.0,1.0,35.0,0.0,0.0,262.375,,1.0
25%,1.0,1.0,35.25,0.0,0.0,324.86355,,1.0
50%,1.0,1.0,36.0,0.0,0.0,512.3292,,1.0
75%,1.0,1.0,42.75,0.0,0.75,512.3292,,1.0
max,1.0,1.0,58.0,0.0,1.0,512.3292,,1.0


* Now in cluster 3, we want to see the survival rate of only the people that belong to pclass = 1

In [72]:
original_data[(original_data.cluster_group==3) &(original_data.pclass==1)].describe() 

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,30.0,30.0,30.0,30.0,30.0,30.0,2.0,30.0
mean,1.0,0.633333,35.697223,1.066667,1.733333,216.087917,115.5,3.0
std,0.0,0.490133,19.324888,0.980265,0.980265,51.593825,27.577164,0.0
min,1.0,0.0,0.9167,0.0,0.0,83.1583,96.0,3.0
25%,1.0,0.0,23.25,0.0,1.0,176.4844,105.75,3.0
50%,1.0,1.0,29.5,1.0,2.0,221.7792,115.5,3.0
75%,1.0,1.0,50.0,1.0,2.0,262.375,125.25,3.0
max,1.0,1.0,67.0,3.0,4.0,263.0,135.0,3.0
