# Welcome to Machine Learning.

# "ISBE"  The Motto and Main Steps when building a Machine Learning Model. 
## 1. I - Inspect and explore data.
## 2. S - Select and engineer features.
## 3. B - Build and train model.
## 4. E - Evaluate model.

In [None]:
# Import our libraries 

# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization 
import seaborn as sns
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

sns.set()
import matplotlib.pyplot as plt
%matplotlib inline


# Helper function to split our data
from sklearn.model_selection import train_test_split

# This is our Logit model
from sklearn.linear_model import LogisticRegression

# Helper functions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, f1_score

In [None]:
# Load data
# Read the Titanic passenger CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/titanic.csv')

# Display data
# Preview the first rows (head() shows five by default) to sanity-check the load.
df.head()

## Data dictionary
<img src='https://miro.medium.com/max/1260/1*rr3UGlpEv_PSMc1pyqa4Uw.png'>


# Inspect and Explore EDA
1. Shape and size
1. Describe
1. Info
1. Check for nulls
1. Check for dupes
1. Plot

In [None]:
# 1. Shape and size
# df.shape is a (number_of_rows, number_of_columns) tuple.
print(df.shape)

In [None]:
# 2. Describe
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# 3. Get info on cols
# Lists every column with its dtype and non-null count; a non-null count
# lower than the row count means that column has missing values.
df.info()


## Inspect null values.
* What does this tell us about which features we should and should not use?

In [None]:
# How many values are missing in each column?
# isnull() marks every missing cell as True; summing down the rows (axis=0)
# counts those True cells column by column.
df.isnull().sum(axis=0)

In [None]:
# Missing values per column as a percentage of all rows, rounded to 2 decimals.
(df.isnull().sum() / df.shape[0] * 100).round(2)

## Inspect duplicate rows.
* Phew...

In [None]:
# duplicated() flags each row that exactly repeats an earlier row (the first
# occurrence is kept unflagged), so summing the flags counts the duplicates.
n_dupes = df.duplicated(keep='first').sum()
print("Number of duplicate rows are %i." % n_dupes)

# Visualize our data

In [None]:
# Grid of pairwise plots for the columns of df, coloured by survival outcome.
# The trailing semicolon keeps the notebook from echoing the returned grid object.
sns.pairplot(data=df, hue='survived');

In [None]:
# Compare the age distribution of survivors and non-survivors.
plt.figure(figsize=(13,8))
# `fill=True` shades the area under each curve.  It replaces the deprecated
# `shade=True` keyword, which newer seaborn releases warn about and plan to remove
# (the warning is hidden here because the notebook silences FutureWarning).
sns.kdeplot(data=df, x='age', hue='survived', fill=True)

In [None]:
# Survival rate per sex.  sum / count of a column is its mean, so a single
# grouped mean gives the share of survivors within each group.
df.groupby('sex')['survived'].mean()

# "S" Select and Engineer Features
1. Select the features you are going to want to use to predict survived. 
    * For this first example we are only going to be selecting `fare, sex, and pclass`
    * Don't use features that have nulls in them. 
1. Convert categorical variables into numerical. 
    * Use helper function `pd.get_dummies()` for this 
1. Split into test and train. 

In [None]:
# Look at the columns again before deciding which ones to use as features.
df.head()

In [None]:
# How many rows fall into each value of the target column (class balance)?
df['survived'].value_counts()

In [None]:
# Number of passengers per value of the `sex` column.
df['sex'].value_counts()

In [None]:
# Number of passengers per value of the `pclass` column.
df['pclass'].value_counts()

### Convert categorical variables into numerical.
* `pd.get_dummies()` is a very helpful function that converts our categorical variables into numeric 0/1 indicator ("dummy") columns, one per category.
* We have to be careful about the ['dummy variable trap'](https://en.wikipedia.org/wiki/Dummy_variable_(statistics)), which leads to multicollinearity problems that we just don't have time to discuss; [to learn more watch this](https://www.youtube.com/watch?v=Cba9LJ9lS8s&ab_channel=zedstatistics).
* more about [dummy variable traps](https://medium.com/nerd-for-tech/what-is-dummy-variable-trap-how-it-can-be-handled-using-python-78ec17246331)
* Removing one of the dummy variable columns solves this. 


In [None]:
# Snapshot of the data before encoding, for comparison with the encoded frame.
df.head()

In [None]:
# Preview only: the encoded frame is displayed but not assigned, so df is unchanged.
# drop_first=True drops the first category of each encoded column, which is how
# the dummy variable trap is avoided.
pd.get_dummies(df, columns=['sex', 'pclass'], drop_first=True)

In [None]:
# Apply the encoding for real by rebinding df to the encoded frame.
# NOTE(review): this cell does not look safe to re-run — after the first run the
# original 'sex' and 'pclass' columns have been replaced by dummy columns, so a
# second call should fail on the missing columns.  Reload df before re-running.
df = pd.get_dummies(df, columns=['sex', 'pclass'], drop_first=True)
df.head()

In [None]:
# Model inputs: the fare plus the dummy-encoded passenger class and sex columns.
selected_features = [
    'fare',
    'pclass_2',
    'pclass_3',
    'sex_male',
]

# Defining our X and y
### y is what we are trying to predict, and X is what we are using to make that prediction.
* It is industry standard to name your feature matrix as `X`, and your target variable as `y`


In [None]:
# Columns the model is allowed to look at.
selected_features = [
    'fare',
    'pclass_2',
    'pclass_3',
    'sex_male',
]

# X is the feature matrix (one column per selected feature);
# y is the target column we want to predict.
X, y = df[selected_features], df['survived']


### Splitting our data into training and testing batches.


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the same rows land in train/test on every run;
# without it each run produces a different split and therefore different scores.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Each shape is (number_of_rows, number_of_features).
print('Length of our Training data:', X_train.shape, '\nLength of our Testing data:', X_test.shape)

# "B" - Build and train our model
* Initialize an empty model
* Train our model using our `model.fit()` with our training data 

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize our model.
# This creates an untrained Logistic Regression model (no arguments are passed,
# so every hyperparameter keeps its default value).
model = LogisticRegression()

# Confirm what kind of object we just created.
print(type(model))


# This is the heart of our ML process. 
This step fits (aka trains) our model with our training data.

In [None]:
# Train the model on the training split.
# fit() updates `model` in place, so the result does not need to be captured;
# it also returns that same fitted estimator, which is what the notebook echoes here.
model.fit(X_train, y_train)

In [None]:
# Ask the trained model for a class label for every row of the held-out test set.
y_pred = model.predict(X=X_test)
y_pred

In [None]:
# The class labels the model learned during fit.
# The columns returned by predict_proba follow this same order.
model.classes_

In [None]:
# predict_proba returns one probability column per class; keep only the second
# column, i.e. the probability of model.classes_[1] for each test row.
y_pred_proba = model.predict_proba(X=X_test)[:, 1]
# Rounded for display only; y_pred_proba itself keeps full precision.
y_pred_proba.round(decimals=2)


In [None]:
# Put the true label, predicted label and predicted probability side by side,
# one row per test-set passenger, so individual predictions can be inspected.
pred_df = pd.DataFrame.from_dict(
    data={
        'y_true': y_test,
        'y_pred': y_pred,
        'probability': y_pred_proba,
    },
    orient='columns',
)
pred_df

# E = Evaluate our model.

# Accuracy, our first look.
Accuracy is the percentage of predictions we got correct.
It is good for general scoring, but misleading when the classes are imbalanced.

It is the count of all the predictions you got correct divided by the total number of predictions.
Aka, Percent of predictions we got correct.


Accuracy = (TP + TN) / (TP + TN + FP + FN)


In [None]:
# Helper functions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Example from one run: "Accuracy Score: 0.826816".
# The exact number depends on which rows ended up in the test split.
print("Accuracy Score: %f" % accuracy)

# Precision
Out of all the times the MODEL says 'yes', what percentage of the time was it correct?
* The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative. 
* The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. 
* If you want to raise precision (i.e., only say yes when you are absolutely sure), raise your classification threshold.

In [None]:
# Of all the passengers the model predicted as survived, what share really did?
precision = precision_score(y_true=y_test, y_pred=y_pred)

# One print call, two lines of output (sep puts each message on its own line).
print(
    "Precision Score: %f" % precision,
    "In other words, when the model predicts someone survived, it is correct %f of the time." % precision,
    sep="\n",
)

# Recall
Out of all the times the ACTUAL is 'yes', how many did you get correct.  

Having high recall is important when the cost of missing a True Positive is high.  

For example, if you're detecting cancer.  Saying, you don't have cancer when you really do is really bad. Therefore, if you're building a model to detect cancer, you should optimize for having high recall.  You can do that by lowering your classification threshold. 

* The recall is intuitively the ability of the classifier to find all the positive samples.
* The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. 
* if you want to raise recall, lower your classification threshold

In [None]:
# Of all the passengers who really survived, what share did the model find?
recall = recall_score(y_test, y_pred)
print("Recall Score: %f" % recall)
# recall is a fraction between 0 and 1; scale it by 100 so the number printed
# actually matches the word "percent" in the sentence.
print("In other words, it correctly identifies %f percent of all survivors" % (recall * 100))

# F1 Score
The ‘harmonic mean’ of precision and recall. 

Good for an overall evaluation metric. 


The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. 


F1 = 2 * (precision * recall) / (precision + recall)




In [None]:
from sklearn.metrics import f1_score

# Harmonic mean of precision and recall, computed on the test set.
f1_score(y_true=y_test, y_pred=y_pred)

* Confusion Matrix
    * True Positive (TP) - When you say yes and actual is yes.
    * False Positive (FP) - When you say yes and actual is no.
    * False Negative (FN) - When you say no and actual is yes.
    * True Negative (TN) - When you say no and actual is no. 

The first term (True or False) is if the prediction was correct or not. True means correct, False means incorrect.
The second term (Positive or Negative) is what the classifier guessed.  Did it say Yes, or did it say No?

True Positive (TP): A true positive is an outcome where the model correctly predicts the positive class. When we say YES survived, actual is YES survived. Having a high True-Positive rate is GOOD.

True Negative (TN): A true negative is an outcome where the model correctly predicts the negative class.  When we say NO survived, actual is NO survived. Having a high true-negative rate is GOOD.

False Negative (FN):  A false negative is an outcome where the model incorrectly predicts the negative class. We say NO survived, actual is YES survived.
Having a high False-Negative rate is BAD.

False Positive (FP):  A false positive is an outcome where the model incorrectly predicts the positive class. We say YES survived, actual is NO survived.
Having a high False-Positive rate is BAD.




# Confusion Matrix 

In [None]:

# Count matrix of actual vs. predicted labels: rows are the actual class,
# columns are the predicted class.  The entries are whole-number counts, so
# no rounding is needed before plotting.
cm = confusion_matrix(y_test, y_pred)

# For a two-class problem, flattening the matrix row by row yields the four
# cells in exactly this order.  Reuse cm rather than recomputing it.
tn, fp, fn, tp = cm.ravel()

fig = plt.figure(figsize=(8,8))
# annot=True writes the count in each cell; fmt='g' keeps it a plain number.
ax = sns.heatmap(cm, annot=True, cmap='Greens', fmt='g')
plt.title("Confusion Matrix of Titanic Survivors")
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Uncomment to also print the four counts as text:
# print('true-negative:', tn,
#       '\nfalse-positive:', fp,
#       '\nfalse-negative:', fn,
#       '\ntrue-positive:', tp )

# Now the easy way

In [None]:
# Precision, recall, F1 and support for each class in a single text table.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Accuracy: share of test rows predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

# F1 score: harmonic mean of precision and recall.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print("F1 Score: %f" % f1)

# Confusion matrix drawn as an annotated heatmap of counts.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig = plt.figure(figsize=(5,5))
ax = sns.heatmap(data=cm, annot=True, fmt='g', cmap='Greens')
# Columns are the model's predictions, rows are the true labels.
plt.xlabel('Model Prediction')
plt.ylabel('Ground Truth');