In [1]:
#Load all modules
import pandas as pd
import numpy as np
import random as rd
import re
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
import random as rd

# Print options
np.set_printoptions(precision=4, threshold=10000, linewidth=160, edgeitems=999, suppress=True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 160)
pd.set_option('expand_frame_repr', False)
pd.set_option('precision', 4)

In [2]:
#values of titanic set now need to be transformed. 

#Generate features from Cabin variable. Sparsely populated variable composed of letter&number:
def getCabinLetter(cabin):
    # finds letter component of Cabin variable (y axis of ship)
    match = re.compile("([a-zA-z]+)").search(cabin)
    if match:
        return match.group()
    else:
        return 'U'

def getCabinNumber(cabin):
    # finds the number component of Cabin variable (x axis of ship)
    match = re.compile("([0-9]+)").search(cabin)
    if match:
        return match.group()
    else:
        return '0'

def processCabin():
    global df
    df['Cabin'][df.Cabin.isnull()] = 'U0'#replace null values of cabine with 'U0'
    
    #map alphabetical part of Cabin to unique number
    df['CabinLetter'] = df['Cabin'].map(lambda x : re.compile("([a-zA-Z]+)").search(x).group())
    df['CabinLetter'] = pd.factorize(df['CabinLetter'])[0]
    
    #Create dummy variables for cabin letter: Cabin letter will prob denote position on ship (y axis)
    if keep_binary:
        cletters = pd.get_dummies(df['CabinLetter']).rename(columns=lambda x: 'CabinLetter_' + str(x))
        df=pd.concat([df,cletters], axis=1)
        
    #Create feature for numerical portion of Cabin. number will denote xaxis on ship
    df['CabinNumber'] = df['Cabin'].map(lambda x: getCabinNumber(x)).astype(int) + 1
    if keep_scaled:
        scaler =preprocessing.StandardScaler()
        df['CabinNumber_scaled'] = scaler.fit_transform(df['CabinNumber'])

In [3]:
#Transform the Ticket variable
#Tickets are alphanumeric. Letters followed by numbers STON & SOTON prefixs are equivalent, '.' are also equivalents

#find letter component of ticket variable
def getTicketPrefix(ticket):
    match = re.compile("([a-zA-Z\.\/]+)").search(ticket)
    if match:
        return match.group()
    else:
        return 'U'
    
#find digit component of ticket variable
def getTicketNumber(ticket):
    match = re.compile("([\d]+$)").search(ticket)
    if match:
        return match.group()
    else:
        return '0'

def processTicket():
    global df
    df['TicketPrefix'] = df['Ticket'].map(lambda x: getTicketPrefix(x.upper()))
    df['TicketPrefix'] = df['TicketPrefix'].map(lambda x: re.sub('[\.?/\?]', '',x))
    df['TicketPrefix'] = df['TicketPrefix'].map(lambda x: re.sub('STON', 'SOTON', x))
    
    df['TicketPrefixID'] = pd.factorize(df['TicketPrefix'])[0]
    
    #dummy variables for ticket letter: Then drop the Prefix 
    if keep_binary:
        prefixes = pd.get_dummies(df['TicketPrefix']).rename(columns=lambda x: 'TicketPrefix_' + str(x))
        df = pd.concat([df, prefixes], axis =1)
    
    df.drop(['TicketPrefix'], axis=1, inplace = True)
    
    #get the ticket numbers, the number of digits and first digit of all tickets
    df['TicketNumber'] = df['Ticket'].map(lambda x: getTicketNumber(x))
    df['TicketNumberDigits'] = df['TicketNumber'].map(lambda x: len(x)).astype(np.int)
    df['TicketNumberStart'] = df['TicketNumber'].map(lambda x: x[0:1]).astype(np.int)
    
    df['TicketNumber'] = df.TicketNumber.astype(np.int)
    
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['TicketNumber_scaled'] = scaler.fit_transform(df['TicketNumber'])

In [4]:
#Transform the Embarked variable into dummy variables
def processEmbarked():    
    global df
    #replace missing values with common port & then map to number
    df.Embarked[df.Embarked.isnull()] = df.Embarked.dropna().mode().values
    df['Embarked'] = pd.factorize(df['Embarked'])[0]
    
    if keep_binary:
        df=pd.concat([df, pd.get_dummies(df['Embarked']).rename(columns= lambda x: 'Embarked_' + str(x))], axis=1)


In [5]:
#Transform Passenger Class. Replace missing values with Mode & create dummy variable & scale.
def processPClass():
    global df
    df.Pclass[df.Pclass.isnull()]=df.Pclass.dropna().mode().values
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Pclass']).rename(columns= lambda x: 'Pclass_' + str(x))], axis=1)
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Pclass_scaled'] = scaler.fit_transform(df['Pclass'])
        

In [6]:
#Tranform Sex variable into dummies
def processSex():
    global df
    df['Gender'] = np.where(df['Sex'] == 'male',1,0)

In [7]:
#Transform Age
from sklearn.ensemble import RandomForestRegressor

#populate missing ages via Random Forest Classifier
def SetMissingAges():
    #All features that will be included in Random Forest Regressor
    age_df = df[['Age', 'Embarked', 'Fare', 'Parch', 'SibSp', 'Title_id', 'Pclass', 'Names', 'CabinLetter']]
    #split between known & unknown Age
    known_age = age_df.loc[(df.Age.notnull())]
    unknown_age = age_df.loc[(df.Age.isnull())]
    
    y = known_age.values[:, 0] #takes age values of known and sets this as y component
    x = known_age.values[:, 1::] #takes the rest of the features and sets this as x component
    
    #create and fit the model
    model = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
    model.fit(x, y)
    
    #Use model to predict missing values
    predictedAges = model.predict(unknown_age.values[:, 1::])
    
    #replace missing ages with predicted ages
    df.loc[(df.Age.isnull()), 'Age' ] = predictedAges 

#process Age
def processAge():
    global df
    SetMissingAges()
    
    #center mean and scale based on standard deviation
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Age_scaled'] = scaler.fit_transform(df['Age'])
    
    #dummy variable if subject is child. set child as age 13
    df['isChild'] = np.where(df.Age < 13, 1, 0)
    
    #bin into quartiles & create dummies
    df['Age_bin'] = pd.qcut(df['Age'], 4)
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Age_bin']).rename(columns=lambda x: 'Age_' + str(x))], axis=1)
    
    if keep_bins: #add 1 to bin so we start at 1 instead of 0
        df['Age_bin_id'] = pd.factorize(df['Age_bin'])[0]+1
        
    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Age_bin_id_scaled'] = scaler.fit_transform(df['Age_bin_id'])
    
    if not keep_strings:
        df.drop('Age_bin', axis=1, inplace=True)


In [8]:
#Transform ticket fare. Replace missing values with median fare. 

def processFare():
    global df
    df['Fare'][np.isnan(df['Fare'])] = df['Fare'].median()
    
    #split into quartiles
    
    df['Fare_bin'] = pd.qcut(df['Fare'],4)
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Fare_bin']).rename(columns= lambda x: 'Fare_' + str(x))], axis=1)
    
    if keep_bins:
        df['Fare_bin_id'] = pd.factorize(df['Fare_bin'])[0]+1
    
    if keep_scaled:
        scaler=preprocessing.StandardScaler()
        df['Fare_scaled'] = scaler.fit_transform(df['Fare'])
    
    if keep_bins and keep_scaled:
        scaler=preprocessing.StandardScaler()
        df['Fare_bin_id_scaled'] = scaler.fit_transform(df['Fare_bin_id'])
        
    if not keep_strings:
        df.drop('Fare_bin', axis =1, inplace = True)
    

In [9]:
#Transform Sib & Parch variables. Scale and dummy variables
def processFamily():
    global df
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['SibSp_scaled'] = scaler.fit_transform(df['SibSp'])
        df['Parch_scaled'] = scaler.fit_transform(df['Parch'])
        
    if keep_binary:
        sibsps = pd.get_dummies(df['SibSp']).rename(columns = lambda x: 'SibSp_' + str(x))
        parchs = pd.get_dummies(df['Parch']).rename(columns = lambda x: 'Parch_' + str(x))
        df = pd.concat([df, sibsps, parchs], axis=1)

In [10]:
#Transform the Name variable
def processName():
    global df
    #map of how many different names passenger has
    df['Names'] = df['Name'].map(lambda x: len(re.split(' ', x)))
    
    #map of each person's title
    df['Title'] = df['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])
    
    df['Title'][df.Title == 'Jonkheer'] = 'Master'
    df['Title'][df.Title.isin(['Ms', 'Mlle'])] = 'Miss'
    df['Title'][df.Title == 'Mme'] = 'Mrs'
    df['Title'][df.Title.isin(['Capt', 'Don', 'Major', 'Col', 'Sir'])] = 'Sir'
    df['Title'][df.Title.isin(['Dona', 'Lady', 'the Countess'])] = 'Lady'
    
    #dummy variables
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Title']).rename(columns =lambda x: 'Title_'+ str(x))], axis =1)
        
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Names_scaled'] = scaler.fit_transform(df['Names'])
        
    if keep_bins:
        df['Title_id'] = pd.factorize(df['Title'])[0]+1
        
    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Title_id_scaled'] = scaler.fit_transform(df['Title_id'])

In [11]:
def processDrops():
    global df
    #features from rawdata
    rawdroplist = ['Name', 'Names', 'Title', 'Sex', 'SibSp', 'Parch', 'Pclass', 'Embarked', 'Cabin', 'CabinLetter', \
                  'CabinNumber', 'Age', 'Fare', 'Ticket', 'TicketNumber']
    stringsdroplist = ['Title', 'Name', 'Cabin', 'Ticket', 'Sex', 'TicketNumber']
    
    if not keep_raw:
        df.drop(rawdroplist, axis=1, inplace=True)
    elif not keep_strings:
        df.drop(stringsdroplist, axis=1, inplace=True)
    

In [12]:
def GetDataSets(binary=False, bins=False, scaled=False, strings=False, raw=True, pca=False, balanced=False):
    input_df = pd.read_csv('rawdata/train.csv', header=0)
    submit_df = pd.read_csv('rawdata/test.csv', header=0)
    
    global keep_binary, keep_bins, keep_scaled, keep_raw, keep_strings, df
    keep_binary = binary
    keep_bins = bins
    keep_scaled = scaled
    keep_raw = raw
    keep_strings = strings
    #will combine both test and train data into one
    df = pd.concat([input_df, submit_df]) 
    
    #re-number combined data-set
    #reindex remaining columns
    df.reset_index()
    df = df.reindex_axis(input_df.columns, axis=1)
    
    processCabin()
    processTicket()
    processName()
    processFare()    
    processEmbarked()    
    processFamily()
    processSex()
    processPClass()
    processAge()
    processDrops()
    
    #moved survived to first position
    columns_list = list(df.columns.values)
    columns_list.remove('Survived')
    new_col_list = list(['Survived'])
    new_col_list.extend(columns_list)
    df = df.reindex(columns=new_col_list)
    
    print "Starting with", df.columns.size, "manually generated features...\n", df.columns.values
    
    #find correlation (use spearman b/c relationship is not neccessarily linear)
    
    df_corr = df.drop(['Survived', 'PassengerId'],axis=1).corr(method='spearman')
    #mask is created to ignore correlation with itself
    mask = np.ones(df_corr.columns.size) - np.eye(df_corr.columns.size)
    df_corr = mask * df_corr
    
    drops = []
    # loop through each variable
    for col in df_corr.columns.values:
        # if we've already determined to drop the current variable, continue
        if np.in1d([col],drops):
            continue
        
        # find all the variables that are highly correlated with the current variable 
        # and add them to the drop list 
        corr = df_corr[abs(df_corr[col]) > 0.98].index
        #print col, "highly correlated with:", corr
        drops = np.union1d(drops, corr)
    
    print "\nDropping", drops.shape[0], "highly correlated features...\n" #, drops
    df.drop(drops, axis=1, inplace=True)
    

    # Split the data sets apart again, perform PCA/clustering/class balancing
    
    input_df = df[:input_df.shape[0]] 
    submit_df  = df[input_df.shape[0]:]
    
    if pca:
        print "reducing and clustering now..."
        input_df, submit_df = reduceAndCluster(input_df, submit_df)
    else:
        # drop the empty 'Survived' column for the test set that was created during set concatentation
        submit_df.drop('Survived', axis=1, inplace=1)
    
    print "\n", input_df.columns.size, "initial features generated...\n" #, input_df.columns.values
    
    if balanced:
        # Undersample training examples of passengers who did not survive
        print 'Perished data shape:', input_df[input_df.Survived==0].shape
        print 'Survived data shape:', input_df[input_df.Survived==1].shape
        perished_sample = rd.sample(input_df[input_df.Survived==0].index, input_df[input_df.Survived==1].shape[0])
        input_df = pd.concat([input_df.ix[perished_sample], input_df[input_df.Survived==1]])
        input_df.sort(inplace=True)
        print 'New even class training shape:', input_df.shape
    
    return input_df, submit_df


In [13]:
def reduceAndCluster(input_df, submit_df, clusters=3):
    #Takes the train and test data frames and performs dimensionality reduction with PCA and clustering
    # join the full data together
    
    df = pd.concat([input_df, submit_df])
    df.reset_index(inplace=True)
    df.drop('index', axis=1, inplace=True)
    df = df.reindex_axis(input_df.columns, axis=1)
    
    # Series of labels
    survivedSeries = pd.Series(df['Survived'], name='Survived')
    
    print df.head()
    
    # Split into feature and label arrays
    X = df.values[:, 1::]
    y = df.values[:, 0]
    
    print X[0:5]
    
    
    # Minimum percentage of variance we want to be described by the resulting transformed components
    variance_pct = .99
    
    # Create PCA object
    pca = PCA(n_components=variance_pct)
    
    # Transform the initial features
    X_transformed = pca.fit_transform(X,y)
    
    # Create a data frame from the PCA'd data
    pcaDataFrame = pd.DataFrame(X_transformed)
    
    print pcaDataFrame.shape[1], " components describe ", str(variance_pct)[1:], "% of the variance"
    
    # use basic clustering to group similar examples and save the cluster ID for each example in train and test
    kmeans = KMeans(n_clusters=clusters, random_state=np.random.RandomState(4), init='random')
    # # Perform clustering on labeled AND unlabeled data
    # clusterIds = kmeans.fit_predict(X_pca)
    #==============================================================================================================
    
    # Perform clustering on labeled data and then predict clusters for unlabeled data
    trainClusterIds = kmeans.fit_predict(X_transformed[:input_df.shape[0]])
    print "clusterIds shape for training data: ", trainClusterIds.shape
    #print "trainClusterIds: ", trainClusterIds
     
    testClusterIds = kmeans.predict(X_transformed[input_df.shape[0]:])
    print "clusterIds shape for test data: ", testClusterIds.shape
    #print "testClusterIds: ", testClusterIds
     
    clusterIds = np.concatenate([trainClusterIds, testClusterIds])
    print "all clusterIds shape: ", clusterIds.shape
    #print "clusterIds: ", clusterIds
    
    
    # construct the new DataFrame comprised of "Survived", "ClusterID", and the PCA features
    clusterIdSeries = pd.Series(clusterIds, name='ClusterId')
    df = pd.concat([survivedSeries, clusterIdSeries, pcaDataFrame], axis=1)
    
    # split into separate input and test sets again
    input_df = df[:input_df.shape[0]]
    submit_df = df[input_df.shape[0]:]
    submit_df.reset_index(inplace=True)
    submit_df.drop('index', axis=1, inplace=True)
    submit_df.drop('Survived', axis=1, inplace=1)
    
    return input_df, submit_df


In [14]:
if __name__=='__main__':
    train, test = GetDataSets(bins=True, scaled=True, binary=True)
    drop_list =['PassengerId']
    train.drop(drop_list, axis=1, inplace=1)
    test.drop(drop_list, axis =1, inplace=1)
    
    train, test = reduceAndCluster(train, test)
    
    print "Labeled survived counts :\n", pd.value_counts(train['Survived'])/train.shape[0]
    print "Labeled cluster counts :\n", pd.value_counts(train['ClusterId'])/train.shape[0]
    print "Unlabeled cluster counts:\n", pd.value_counts(test['ClusterId'])/test.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  "got %s" % (estimator, X.dtype))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to

Starting with 105 manually generated features...
['Survived' 'PassengerId' 'Pclass' 'Age' 'SibSp' 'Parch' 'Fare' 'Embarked' 'CabinLetter' 'CabinLetter_0' 'CabinLetter_1' 'CabinLetter_2' 'CabinLetter_3'
 'CabinLetter_4' 'CabinLetter_5' 'CabinLetter_6' 'CabinLetter_7' 'CabinLetter_8' 'CabinNumber' 'CabinNumber_scaled' 'TicketPrefixID' 'TicketPrefix_A'
 'TicketPrefix_AQ' 'TicketPrefix_AS' 'TicketPrefix_C' 'TicketPrefix_CA' 'TicketPrefix_CASOTON' 'TicketPrefix_FA' 'TicketPrefix_FC' 'TicketPrefix_FCC'
 'TicketPrefix_LINE' 'TicketPrefix_LP' 'TicketPrefix_PC' 'TicketPrefix_PP' 'TicketPrefix_PPP' 'TicketPrefix_SC' 'TicketPrefix_SCA' 'TicketPrefix_SCAH'
 'TicketPrefix_SCOW' 'TicketPrefix_SCPARIS' 'TicketPrefix_SOC' 'TicketPrefix_SOP' 'TicketPrefix_SOPP' 'TicketPrefix_SOTONO' 'TicketPrefix_SOTONOQ'
 'TicketPrefix_SP' 'TicketPrefix_SWPP' 'TicketPrefix_U' 'TicketPrefix_WC' 'TicketPrefix_WEP' 'TicketNumberDigits' 'TicketNumberStart' 'TicketNumber_scaled'
 'Names' 'Title_Dr' 'Title_Lady' 'Title_Mast

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


  components describe  .99 % of the variance
clusterIds shape for training data:  (891,)
clusterIds shape for test data:  (418,)
all clusterIds shape:  (1309,)
Labeled survived counts :
0    0.6162
1    0.3838
Name: Survived, dtype: float64
Labeled cluster counts :
0    0.8249
1    0.1526
2    0.0224
Name: ClusterId, dtype: float64
Unlabeled cluster counts:
0    0.8373
1    0.1196
2    0.0431
Name: ClusterId, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
