# Quick dictionary for ML algorithms

## Load data

In [2]:
import pandas as pd

# Read the passenger records from the CSV file
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)

# Separate the 'Survived' column (the prediction target) from the remaining columns
outcomes = full_data['Survived']
features_raw = full_data.drop(columns='Survived')

# Drop the 'Name' column before encoding
features_no_names = features_raw.drop(columns=['Name'])

# One-hot encode the remaining non-numeric columns, then replace missing values with 0.0
features = pd.get_dummies(features_no_names).fillna(0.0)

# Preview the first rows of the encoded feature table
display(features.head())

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,Ticket_110413,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,3,26.0,0,0,7.925,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,35.0,1,0,53.1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,3,35.0,0,0,8.05,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


## Split training and test sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    outcomes,
    test_size=0.2,
    random_state=42,
)

## Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression

# Load the data from the boston house-prices dataset.
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# import is attempted first and, when it is unavailable, the same feature and
# target arrays are rebuilt from the original data file (needs network access).
try:
    from sklearn.datasets import load_boston
    boston_data = load_boston()
except ImportError:
    import numpy as np
    import pandas as pd

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    # After the 22-line header, each record is wrapped over two physical lines:
    # even rows hold the first 11 values, odd rows hold the last 2 features
    # followed by the target.
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    boston_data = {
        'data': np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        'target': raw_df.values[1::2, 2],
    }

x = boston_data['data']
y = boston_data['target']

# Make and fit the linear regression model
model = LinearRegression()
model.fit(x, y)

# Make a prediction using the model.
# A single sample is still passed as a list of rows (one row of 13 feature values).
sample_house = [[2.29690000e-01, 0.00000000e+00, 1.05900000e+01, 0.00000000e+00, 4.89000000e-01,
                6.32600000e+00, 5.25000000e+01, 4.35490000e+00, 4.00000000e+00, 2.77000000e+02,
                1.86000000e+01, 3.94870000e+02, 1.09700000e+01]]
# Predict housing price for the sample_house
prediction = model.predict(sample_house)

# To predict several samples at once, pass them as rows of one list: model.predict([X1, X2])
# The model returns an array of predictions, one prediction for each input row (multiple features).

## Decision Trees

In [5]:
# Import statements
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Create the decision tree model and assign it to the variable model.
# random_state is pinned (same seed as the train/test split) so that re-running
# the cell builds the same tree and reports the same accuracies.
model = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=5,
    min_samples_split=10,
    random_state=42,
)

# Fit the model.
model.fit(X_train, y_train)

# Making predictions on both splits to compare fit vs. generalisation
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate the accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)

The training accuracy is 0.8735955056179775
The test accuracy is 0.8547486033519553


In [8]:
# With Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import ShuffleSplit

# 10 random 80/20 re-splits of the training data, used as the cross-validation folds
cv_sets = ShuffleSplit(n_splits = 10, test_size = 0.20, random_state = 42)

# Define grids: every combination of these values is tried
params = {'max_depth': range(3, 8), 'min_samples_leaf': range(3, 10), 'min_samples_split': range(5, 15)}

# Define grid search.
# `scoring` must be passed by keyword: everything after `param_grid` is
# keyword-only in scikit-learn >= 1.0, so a positional scorer raises TypeError.
# The base estimator is seeded so the search result is repeatable.
grid = GridSearchCV(
    DecisionTreeClassifier(random_state = 42),
    params,
    scoring = make_scorer(accuracy_score),
    cv = cv_sets,
)

grid.fit(X_train, y_train)

# Making predictions (grid.predict delegates to the refitted best estimator)
y_train_pred = grid.predict(X_train)
y_test_pred = grid.predict(X_test)

# Calculate the accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)


The training accuracy is 0.8174157303370787
The test accuracy is 0.7988826815642458


## Naive Bayes 

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training split.
# The original cell instantiated `MultinomialNA` (a typo that raises NameError)
# and referred to `training_data` / `testing_data`, which this notebook never
# defines; the train/test feature matrices from the split cell are used instead
# so that the rows line up with y_train / y_test.
# NOTE(review): MultinomialNB requires non-negative feature values — confirm
# this holds for every column of the feature table.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)
predictions = naive_bayes.predict(X_test)

# model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))