In [1]:
import numpy as np
import pandas as pd

# train_test_split lives in `sklearn.cross_validation` in older scikit-learn
# releases and in `sklearn.model_selection` in newer ones.  Catch only
# ImportError so that unrelated failures (KeyboardInterrupt, a broken install
# raising something else, ...) are not silently swallowed by the fallback.
try:
    from sklearn.cross_validation import train_test_split
except ImportError:
    from sklearn.model_selection import train_test_split
    print('Model selection')

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

Model selection


In [2]:
# Load the passenger table, discard the columns that are not used as
# predictors, and make the remaining columns numeric:
#   * missing ages are replaced by the mean age of the loaded table
#     (note: this mean is computed before any train/test split),
#   * Sex is encoded as female -> 0, male -> 1.
df = (
    pd.read_csv('titanic.csv')
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'])
    .assign(
        Age=lambda frame: frame['Age'].fillna(frame['Age'].mean()),
        Sex=lambda frame: frame['Sex'].map({'female': 0, 'male': 1}),
    )
)

def dummy_data(data, columns):
    """Replace each listed column with its one-hot indicator columns.

    For every name in ``columns`` (processed in order) the indicator columns
    produced by ``pd.get_dummies`` are appended on the right, named
    ``<name>_<value>``, and the original column is removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; it is not modified.
    columns : iterable of column labels
        Columns to encode.

    Returns
    -------
    pandas.DataFrame
        The encoded table.  If ``columns`` is empty, ``data`` itself is
        returned.
    """
    encoded = data
    for name in columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        encoded = pd.concat([encoded, indicators], axis=1).drop(columns=name)
    return encoded
# One-hot encode the categorical predictors; every other column is kept as is.
# The feature matrix X and target y are derived from `df` in the next cell,
# immediately before the train/test split, so they are not computed here
# (the previous duplicate extraction was redundant).
dummy_columns = ['Pclass','Embarked']
df = dummy_data(df, dummy_columns)


In [3]:
# Fare is the regression target; every remaining column is a predictor.
y = df['Fare'].values
X = df.drop(columns='Fare').values

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
print('Training Data:', len(X_train))
print('Testing Data:', len(X_test))

Training Data: 623
Testing Data: 268


In [8]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,22.000000,1,0,7.2500,0,0,1,0,0,1
1,0,38.000000,1,0,71.2833,1,0,0,1,0,0
2,0,26.000000,0,0,7.9250,0,0,1,0,0,1
3,0,35.000000,1,0,53.1000,1,0,0,0,0,1
4,1,35.000000,0,0,8.0500,0,0,1,0,0,1
5,1,29.699118,0,0,8.4583,0,0,1,0,1,0
6,1,54.000000,0,0,51.8625,1,0,0,0,0,1
7,1,2.000000,3,1,21.0750,0,0,1,0,0,1
8,0,27.000000,0,2,11.1333,0,0,1,0,0,1
9,0,14.000000,1,0,30.0708,0,1,0,1,0,0


In [4]:
# Standardize the predictors with statistics learned from the training split
# only, then apply the same scaling to the test split.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Baseline model: ordinary least squares on the standardized predictors.
slr = LinearRegression().fit(X_train_std, y_train)

y_train_pred = slr.predict(X_train_std)
y_test_pred = slr.predict(X_test_std)

# Report mean squared error and R^2 on both splits.
print('(MSE) train: %.2f, test: %.2f' % (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
))
print('(R^2) train: %.2f, test: %.2f' % (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
))

(MSE) train: 1327.93, test: 1576.21
(R^2) train: 0.43, test: 0.44


In [8]:
# Compare ordinary least squares on the standardized predictors against the
# same model fitted on degree-2 and degree-3 polynomial expansions of them,
# scoring each with R^2 on the training and test splits.
lin_regr = LinearRegression()
quad_regr = LinearRegression()
cubic_regr = LinearRegression()

quadratic = PolynomialFeatures(degree=2)
cubic = PolynomialFeatures(degree=3)

# Fit each expansion on the training split only and reuse the fitted
# transformer for the test split; test data should never be passed to fit().
X_quad_train = quadratic.fit_transform(X_train_std)
X_quad_test = quadratic.transform(X_test_std)
X_cubic_train = cubic.fit_transform(X_train_std)
X_cubic_test = cubic.transform(X_test_std)

# Degree 1: plain linear model.
lin_regr.fit(X_train_std, y_train)
linear_r2_train = r2_score(y_train, lin_regr.predict(X_train_std))
linear_r2_test = r2_score(y_test, lin_regr.predict(X_test_std))

# Degree 2.
quad_regr.fit(X_quad_train, y_train)
quad_r2_train = r2_score(y_train, quad_regr.predict(X_quad_train))
quad_r2_test = r2_score(y_test, quad_regr.predict(X_quad_test))

# Degree 3.  NOTE(review): the recorded test R^2 for this unregularized cubic
# fit is hugely negative, i.e. it does not generalize; the regularized Ridge
# and Lasso cells that follow reuse X_cubic_train / X_cubic_test.
cubic_regr.fit(X_cubic_train,y_train)
cubic_r2_train =  r2_score(y_train, cubic_regr.predict(X_cubic_train))
cubic_r2_test = r2_score(y_test, cubic_regr.predict(X_cubic_test))

print('(R^2) train:\ndegree=1: %.2f, degree=2: %.2f, degree=3: %.2f'%(linear_r2_train, quad_r2_train, cubic_r2_train))
print('(R^2) test:\ndegree=1: %.2f, degree=2: %.2f, degree=3: %.2f'%(linear_r2_test, quad_r2_test, cubic_r2_test))

(R^2) train:
degree=1: 0.43, degree=2: 0.53, degree=3: 0.59
(R^2) test:
degree=1: 0.44, degree=2: 0.53, degree=3: -5921983708786483200.00


In [23]:
# Ridge regression on the cubic feature expansion.
#
# The `normalize=True` estimator argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so the scaling is done explicitly instead.
# `normalize=True` centred each column and divided it by its L2 norm, which is
# sqrt(n_samples) times its standard deviation.  Scaling by the standard
# deviation (centring is still done by the estimator's intercept handling) and
# multiplying alpha by n_samples gives the same penalized fit.
n_train = X_cubic_train.shape[0]
ridge_scaler = StandardScaler(with_mean=False)
X_cubic_train_ridge = ridge_scaler.fit_transform(X_cubic_train)  # fit on train only
X_cubic_test_ridge = ridge_scaler.transform(X_cubic_test)

# NOTE: ridgeReg is fitted on the scaled features, so it must be given
# ridge_scaler.transform(...) output when predicting.
ridgeReg = Ridge(alpha=0.01 * n_train)
ridgeReg.fit(X_cubic_train_ridge, y_train)

y_train_pred = ridgeReg.predict(X_cubic_train_ridge)
y_test_pred = ridgeReg.predict(X_cubic_test_ridge)

print('(MSE) train: %.2f, test: %.2f'%(mean_squared_error(y_train,y_train_pred), mean_squared_error(y_test,y_test_pred)))
print('(R^2) train: %.2f, test: %.2f'%(r2_score(y_train,y_train_pred), r2_score(y_test,y_test_pred)))

(MSE) train: 952.83, test: 1526.89
(R^2) train: 0.59, test: 0.45


In [21]:
# Lasso regression on the cubic feature expansion.
#
# The `normalize=True` estimator argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so the scaling is done explicitly instead.
# `normalize=True` centred each column and divided it by its L2 norm, which is
# sqrt(n_samples) times its standard deviation.  Scaling by the standard
# deviation (centring is still done by the estimator's intercept handling) and
# multiplying alpha by sqrt(n_samples) gives the same L1-penalized objective.
n_train = X_cubic_train.shape[0]
lasso_scaler = StandardScaler(with_mean=False)
X_cubic_train_lasso = lasso_scaler.fit_transform(X_cubic_train)  # fit on train only
X_cubic_test_lasso = lasso_scaler.transform(X_cubic_test)

# NOTE(review): the notebook's recorded output ends with a truncated warning
# fragment that looks like a coordinate-descent convergence warning, so the
# iteration budget is raised above the default.  Confirm the warning is gone.
# NOTE: lassoReg is fitted on the scaled features, so it must be given
# lasso_scaler.transform(...) output when predicting.
lassoReg = Lasso(alpha=0.01 * np.sqrt(n_train), max_iter=10000)
lassoReg.fit(X_cubic_train_lasso, y_train)

y_train_pred = lassoReg.predict(X_cubic_train_lasso)
y_test_pred = lassoReg.predict(X_cubic_test_lasso)

print('(MSE) train: %.2f, test: %.2f'%(mean_squared_error(y_train,y_train_pred), mean_squared_error(y_test,y_test_pred)))
print('(R^2) train: %.2f, test: %.2f'%(r2_score(y_train,y_train_pred), r2_score(y_test,y_test_pred)))

(MSE) train: 987.19, test: 1380.95
(R^2) train: 0.58, test: 0.51


  positive)
