## Mounting Colab on my Google Drive

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
%cd "code here"

/content/drive/MyDrive/Tutorials/Tutorial 1


Checking the contents of the current directory

In [3]:
!ls

 Titanic_Data.csv  'Tutorial1 Solution.ipynb'  'Tutorial1 Template.ipynb'


# Tutorial 1 The Titanic Disaster

## Project Summary
Predict which passengers survived the Titanic shipwreck.

## Importing Some Basic Libraries

In [1]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt

## Importing the Dataset

In [2]:
dataset = pd.read_csv('Titanic_Data.csv')

## Showing the Dataset in a Table

In [3]:
dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Number of Siblings/Spouses Aboard,Number of Parents/Children Aboard,Fare,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,No
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,Yes
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,Yes
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,Yes
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,No


## A Quick Review of the Data

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   PassengerId                        891 non-null    int64  
 1   Pclass                             891 non-null    int64  
 2   Name                               891 non-null    object 
 3   Sex                                891 non-null    object 
 4   Age                                714 non-null    float64
 5   Number of Siblings/Spouses Aboard  891 non-null    int64  
 6   Number of Parents/Children Aboard  891 non-null    int64  
 7   Fare                               891 non-null    float64
 8   Embarked                           889 non-null    object 
 9   Survived                           891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


## Encoding Categorical Data

### Encoding the Input Data

**For the Sex column**

In [5]:
gender = {"male": 0.0, "female": 1.0}
dataset['Sex'] = dataset['Sex'].map(gender)
dataset['Sex'].value_counts()

0.0    577
1.0    314
Name: Sex, dtype: int64

For the 'embarked' column

In [6]:
ports = {'S': 0.0, 'C': 1.0, 'Q': 2.0}
dataset['Embarked'] = dataset['Embarked'].map(ports)
dataset['Embarked'].value_counts()

0.0    644
1.0    168
2.0     77
Name: Embarked, dtype: int64

### Encoding the Output Data (Labels)

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

dataset['Survived'] = le.fit_transform(dataset['Survived'])

In [8]:
dataset['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

## Dropping Irrelevant Input Data 

In [9]:
df = dataset.drop(['PassengerId', 'Name'], axis=1)
# dataset.drop(['PassengerId', 'Name'], axis=1, inplace=True)

## Checking the Preprocessed Dataset

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Pclass                             891 non-null    int64  
 1   Sex                                891 non-null    float64
 2   Age                                714 non-null    float64
 3   Number of Siblings/Spouses Aboard  891 non-null    int64  
 4   Number of Parents/Children Aboard  891 non-null    int64  
 5   Fare                               891 non-null    float64
 6   Embarked                           889 non-null    float64
 7   Survived                           891 non-null    int64  
dtypes: float64(4), int64(4)
memory usage: 55.8 KB


## Seperate the input and output

In [11]:
x = df.iloc[:, 0:7]
y = df.iloc[:, -1]

## Showing the Input Data in a Table Format

In [12]:
x.head()

Unnamed: 0,Pclass,Sex,Age,Number of Siblings/Spouses Aboard,Number of Parents/Children Aboard,Fare,Embarked
0,3,0.0,22.0,1,0,7.25,0.0
1,1,1.0,38.0,1,0,71.2833,1.0
2,3,1.0,26.0,0,0,7.925,0.0
3,1,1.0,35.0,1,0,53.1,0.0
4,3,0.0,35.0,0,0,8.05,0.0


## A Quick Check of the Output Data

In [13]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

## Taking Care of Missing Data Inputs

In [14]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [15]:
x = imputer.fit_transform(x)

## Splitting the Dataset into the Training Set and Test Set

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [27]:
x_train.shape

(712, 7)

## Scaling the Features

In [30]:
# Scale the age and fare
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

x[:, [2,5]] = sc.fit_transform(x[:, [2,5]])

## Training and Testing Predictive Models

In [31]:
from sklearn.metrics import accuracy_score

In [34]:
# Support vector machine
from sklearn.svm import SVC
sv_classfier = SVC(kernel = 'rbf')
sv_classfier.fit(x_train, y_train)
Y_pred = sv_classfier.predict(x_test)
print(accuracy_score(y_test, Y_pred))

0.6536312849162011


In [35]:
from sklearn.linear_model import LogisticRegression
# Linear regression
lr = LogisticRegression()
lr.fit(x_train, y_train)
Y_pred = lr.predict(x_test)
print(accuracy_score(y_test, Y_pred))

0.7988826815642458
