<a href="https://colab.research.google.com/github/FauxGrit/Titanic-Fairness/blob/main/Fairness_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objective: Explore Fairness in Titanic Survival Model
**Objective**: This code provides an initial exploration for building a model to predict Titanic passenger survival and for examining its fairness across passenger groups. In particular, we are interested in learning which features are most important.

**Assumptions**: 
1. N/A

**Open Questions**:  
1. N/A

**Reference Links**
1. N/A

### Check Python Version

In [1]:
import sys

# Report the interpreter version so results can be tied to a specific runtime.
version_report = (
    ("Python version", sys.version),
    ("Version info.", sys.version_info),
)
for label, value in version_report:
    print(label)
    print(value)

Python version
3.8.15 (default, Oct 12 2022, 19:14:39) 
[GCC 7.5.0]
Version info.
sys.version_info(major=3, minor=8, micro=15, releaselevel='final', serial=0)


### Importing Necessary Libraries

In [2]:
# !pip install pycaret
# !pip install xgboost
# !pip install catboost

In [3]:
import warnings, os, xlrd, pickle
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import openpyxl

from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn import metrics
from sklearn import linear_model, preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder
from sklearn import discriminant_analysis
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
# from catboost import CatBoostClassifier


# from pycaret.classification import *   ## <--- uncomment to run!!!
from geopy.distance import geodesic
from math import cos, sqrt

import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')
import plotly.graph_objs as go
import plotly
from plotly import tools

# Initialise Plotly's notebook mode so figures render in cell outputs.
init_notebook_mode(connected=True)
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Suppress warnings for cleaner presentation.
# Turn this off during development!!! (it hides all warnings, including
# deprecation notices from the libraries imported above)
warnings.filterwarnings('ignore')

# Gather Data
Titanic passenger data (`train.csv`), loaded from this project's GitHub repository: https://raw.githubusercontent.com/FauxGrit/Titanic-Fairness/main/train.csv .

**Preprocessing prior to data load:**
Significant data cleaning done using OpenRefine.


### Load Data into Notebook
* Load the CSV file. It is read directly from the GitHub URL in the cell below, so no local copy is required (to use a local file, replace the URL with its path).
* Print first five rows to confirm data load.
* Print shape to see how many rows and columns.

#### Merged and Cleaned Data

In [4]:
# Titanic training data, read straight from the project's GitHub repository.
data_url = 'https://raw.githubusercontent.com/FauxGrit/Titanic-Fairness/main/train.csv'
df0 = pd.read_csv(filepath_or_buffer=data_url)

# Preview the first five rows to confirm the load succeeded.
df0.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Print the shape of the dataframe to show the number of rows and columns.
# The label names the Titanic data that was actually loaded into df0.
n_rows, n_cols = df0.shape
print("Cleaned Titanic Data\nNumber of rows: ", n_rows, "\nNumber of columns: ", n_cols)

Cleaned Airbnb Data
Number of rows:  891 
Number of columns:  12


# Basic Data Exploration

### Numerical Data Basic Stats

#### Clean and Merged Titanic Data

In [6]:
# Flag passengers older than 49 as seniors (1); everyone else gets 0.
# A missing age compares as False, so those passengers are flagged 0 as well.
df0['Senior'] = (df0['Age'] > 49).astype('int64')

df0.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Senior
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [7]:
# # Create Deck from 1st letter in Cabin
# df0['Deck'] = df0.Cabin.str.extract(r'([A-Z])?(\d)')[0]

In [8]:
# Fill NaN values.
# Numeric gaps are replaced by the column mean. The mean is taken over all rows
# of df0, i.e. before the train/test split that happens further down.
for col in ('Age', 'Fare'):
    df0[col] = df0[col].fillna(df0[col].mean())

# Missing ports of embarkation get their own explicit 'Empty' category.
df0['Embarked'] = df0['Embarked'].fillna('Empty')
# df0['Deck'] = df0['Deck'].fillna('Empty')

df0.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Senior
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [9]:
# One-hot encode the categorical columns; each category becomes its own
# indicator column prefixed with the source column name.
categorical_cols = ['Sex', 'Embarked']
df1 = pd.get_dummies(df0, columns=categorical_cols)
df1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Senior,Sex_female,Sex_male,Embarked_C,Embarked_Empty,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,0,1,0,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,1,0,1,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,1,0,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,0,1,0,0,0,1


In [10]:
# Drop the columns that are not used as model features: 'PassengerId', 'Name',
# 'Ticket' and 'Cabin', plus 'Sex_female', which carries the same information
# as 'Sex_male' after one-hot encoding.
# The result is rebound rather than dropped with inplace=True so the cell reads
# as an explicit transform of df1.
df1 = df1.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Sex_female'])
df1.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Senior,Sex_male,Embarked_C,Embarked_Empty,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,0,1
1,1,1,38.0,1,0,71.2833,0,0,1,0,0,0
2,1,3,26.0,0,0,7.925,0,0,0,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,0,1


In [11]:
# List the remaining columns: the target ('Survived') followed by the model features.
df1.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Senior',
       'Sex_male', 'Embarked_C', 'Embarked_Empty', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [12]:
# Summary statistics for the numeric columns, transposed so each column of df1
# is shown as one row.
summary_stats = df1.describe()
summary_stats.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,891.0,29.699118,13.002015,0.42,22.0,29.699118,35.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
Senior,891.0,0.083053,0.276117,0.0,0.0,0.0,0.0,1.0
Sex_male,891.0,0.647587,0.47799,0.0,0.0,1.0,1.0,1.0
Embarked_C,891.0,0.188552,0.391372,0.0,0.0,0.0,0.0,1.0
Embarked_Empty,891.0,0.002245,0.047351,0.0,0.0,0.0,0.0,1.0


In [13]:
#Separate Input & Response Variables
# Both arrays are selected by column name rather than by position, so the split
# stays correct even if the column order of df1 changes.
X = df1.drop(columns='Survived').values   # every column except the target
y = df1['Survived'].values                # target: 1 = survived, 0 = did not

In [14]:
# Split into Training / Test
# 20% of the rows are held out for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [15]:
# Standardize the features. The scaler is fitted on the training rows only and
# the same fitted scaler is then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Random Forest Classifier

In [16]:
# Shallow random forest (depth capped at 3) with a fixed seed for repeatability.
rf = RandomForestClassifier(max_depth=3, random_state=0)
rf.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=3, random_state=0)

In [17]:
# Confusion matrix on the held-out test data (rows: actual, columns: predicted).
rf_test_pred = rf.predict(X_test)
metrics.confusion_matrix(y_true=y_test, y_pred=rf_test_pred)

array([[104,   6],
       [ 25,  44]])

In [18]:
# Accuracy: fraction of test rows whose predicted class matches the true label.
metrics.accuracy_score(y_test, rf.predict(X_test))

0.8268156424581006

# XGBoost Classifier

In [19]:
# Gradient-boosted trees from XGBoost, using the library's default settings.
xg = xgb.XGBClassifier()
xg.fit(X=X_train, y=y_train)

XGBClassifier()

In [20]:
# Confusion matrix on the held-out test data (rows: actual, columns: predicted).
xg_test_pred = xg.predict(X_test)
metrics.confusion_matrix(y_true=y_test, y_pred=xg_test_pred)

array([[105,   5],
       [ 23,  46]])

In [21]:
# Accuracy: fraction of test rows whose predicted class matches the true label.
metrics.accuracy_score(y_test, xg.predict(X_test))

0.8435754189944135

# Gradient Boosting Classifier

In [22]:
# scikit-learn gradient boosting with default hyper-parameters.
# random_state is fixed (matching the seed used for the split and the random
# forest) because the fit is otherwise not guaranteed to be repeatable: features
# are randomly permuted at each split, so tied splits can resolve differently.
gb = GradientBoostingClassifier(random_state=0)
gb.fit(X_train, y_train)

GradientBoostingClassifier()

In [23]:
# Confusion matrix on the held-out test data (rows: actual, columns: predicted).
gb_test_pred = gb.predict(X_test)
metrics.confusion_matrix(y_true=y_test, y_pred=gb_test_pred)

array([[103,   7],
       [ 21,  48]])

In [24]:
# Accuracy: fraction of test rows whose predicted class matches the true label.
metrics.accuracy_score(y_test, gb.predict(X_test))

0.8435754189944135

# KNN Classifier

In [25]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
# It is trained on the standardized features produced by the scaling cell.
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)

KNeighborsClassifier()

In [26]:
# Confusion matrix on the held-out test data (rows: actual, columns: predicted).
kn_test_pred = kn.predict(X_test)
metrics.confusion_matrix(y_true=y_test, y_pred=kn_test_pred)

array([[96, 14],
       [19, 50]])

In [27]:
# Accuracy: fraction of test rows whose predicted class matches the true label.
metrics.accuracy_score(y_test, kn.predict(X_test))

0.8156424581005587

In [28]:
# Inspect the first test row after scaling (one standardized value per feature).
X_test[0]

array([ 0.81925059, -0.00282437, -0.46445234, -0.47741019, -0.34739758,
       -0.29499644,  0.72882288,  2.12588331, -0.05307449, -0.31426968,
       -1.62827579])

In [29]:
# Raw (unscaled) value in row 0, column 5 of X. With the current column order,
# column 5 is the 'Senior' flag.
X[0, 5]

0.0

In [30]:
# Re-display the unscaled feature table to compare against the scaled arrays above.
df1.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Senior,Sex_male,Embarked_C,Embarked_Empty,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,0,1
1,1,1,38.0,1,0,71.2833,0,0,1,0,0,0
2,1,3,26.0,0,0,7.925,0,0,0,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,0,1


In [31]:
# Column names of df1; all except 'Survived' are the features held in X / X_test.
df1.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Senior',
       'Sex_male', 'Embarked_C', 'Embarked_Empty', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [32]:
# Rebuild a labelled frame from the scaled test matrix. The column names are
# taken from df1 the same way the feature matrix was built (every column except
# 'Survived'), so they cannot drift out of sync with a hand-typed list.
feature_names = df1.columns[df1.columns != 'Survived']
df_sr = pd.DataFrame(X_test, columns=feature_names)

# 'Senior' has been standardized: rows that were 0 are now negative and rows
# that were 1 are now positive, so "> 0" keeps only the senior passengers.
# The surviving index values are the row positions within X_test.
df_sr = df_sr.loc[df_sr['Senior'] > 0]
df_sr

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Senior,Sex_male,Embarked_C,Embarked_Empty,Embarked_Q,Embarked_S
31,-0.380968,1.549198,-0.464452,-0.47741,-0.426405,3.389871,-1.372075,-0.470393,-0.053074,-0.31427,0.614147
39,-0.380968,1.855002,-0.464452,-0.47741,-0.356545,3.389871,0.728823,-0.470393,-0.053074,-0.31427,0.614147
40,-1.581187,2.313708,0.41271,0.740927,0.94485,3.389871,0.728823,2.125883,-0.053074,-0.31427,-1.628276
49,-1.581187,1.7021,0.41271,0.740927,0.953832,3.389871,0.728823,-0.470393,-0.053074,-0.31427,0.614147
56,-1.581187,2.46661,-0.464452,-0.47741,-0.106047,3.389871,0.728823,-0.470393,-0.053074,-0.31427,0.614147
62,-1.581187,1.855002,0.41271,-0.47741,0.926221,3.389871,-1.372075,2.125883,-0.053074,-0.31427,-1.628276
87,0.819251,2.543061,-0.464452,-0.47741,-0.444619,3.389871,-1.372075,-0.470393,-0.053074,-0.31427,0.614147
98,-1.581187,2.313708,-0.464452,-0.47741,-0.106047,3.389871,0.728823,-0.470393,-0.053074,-0.31427,0.614147
101,-0.380968,1.855002,0.41271,-0.47741,-0.117025,3.389871,0.728823,-0.470393,-0.053074,-0.31427,0.614147
125,-1.581187,2.160806,-0.464452,-0.47741,-0.043173,3.389871,0.728823,2.125883,-0.053074,-0.31427,-1.628276


In [33]:
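# NOTE: as written, the call below would fail with a length mismatch: y_test
# holds the labels for every test row, while df_sr contains only the senior rows.
# df_sr keeps its row positions from X_test as its index, so the matching labels
# are y_test[df_sr.index].
# metrics.confusion_matrix(y_test, kn.predict(df_sr.to_numpy()))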