In [3]:
#Downloading the Titanic dataset from Kaggle and save it to orig_df, a copy is provided with this exercise and you can upload it to your drive folder.
import pandas as pd
df = pd.read_csv('titanic3.csv')
orig_df = df

In [4]:
#Explore the Titanic dataset
#Print all samples and check how many samples and features the Titanic dataset has
print(orig_df)

# The Titanic dataset contains passenger details such as age, sex, class, fare, survival status, and other information.

      pclass  survived                                             name  \
0          1         1                    Allen, Miss. Elisabeth Walton   
1          1         1                   Allison, Master. Hudson Trevor   
2          1         0                     Allison, Miss. Helen Loraine   
3          1         0             Allison, Mr. Hudson Joshua Creighton   
4          1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)   
...      ...       ...                                              ...   
1304       3         0                             Zabour, Miss. Hileni   
1305       3         0                            Zabour, Miss. Thamine   
1306       3         0                        Zakarian, Mr. Mapriededer   
1307       3         0                              Zakarian, Mr. Ortin   
1308       3         0                               Zimmerman, Mr. Leo   

         sex    age  sibsp  parch  ticket      fare    cabin embarked boat  \
0     female  29.00  

In [5]:
#Check the type of each variable (data type)
#int type is ok, float64 is ok as well but you may change it to int, object type need to be changed to int (object is a string in pandas and perform a string operation)
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [6]:
#Use shape attribute to check the raws (samples) and columns (features)
df.shape

(1309, 14)

In [7]:
#Use describe attribute to explore the data statistics
#Can you let me a little bit about the data, for example the age groups
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,0.381971,29.881138,0.498854,0.385027,33.295479,160.809917
std,0.837836,0.486055,14.413493,1.041658,0.86556,51.758668,97.696922
min,1.0,0.0,0.17,0.0,0.0,0.0,1.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0


In [33]:
#Use describe attribute at different location to explore the data statistics
#Use 3 or 4 instead of 2 to include more features (this is useful when you have lots of features)
df.describe().iloc[:,:4]

Unnamed: 0,pclass,survived,sex,age
count,1309.0,1309.0,1309.0,1309.0
mean,2.294882,0.381971,0.0,29.503186
std,0.837836,0.486055,0.0,12.905241
min,1.0,0.0,0.0,0.17
25%,2.0,0.0,0.0,22.0
50%,3.0,0.0,0.0,28.0
75%,3.0,1.0,0.0,35.0
max,3.0,1.0,0.0,80.0


In [9]:
#Use isnull() to find columns or rows with missing values and sum them up to get the total of missing values
#Which features are the leak features?
# a characteristic that contains information about the target variable that would not be available at prediction time.
# survival rate
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [10]:
#We can create a boolean array (a series with True or False to indicate if a row (a sample) has missing data)
#and use it to inspect rows that are missing data
mask = df.isnull()

In [11]:
mask.head()  # rows

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,False,True,True,False
3,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,True,False


In [12]:
#Let's improve the process by using the function any that iterate through each row and return true for any x in the raw = true
mask = df.isnull().any(axis=1)

In [13]:
mask.head()

0    True
1    True
2    True
3    True
4    True
dtype: bool

In [14]:
df[mask].body.head() # check body column

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [15]:
df[mask].age.head() # check age column


0    29.00
1     0.92
2     2.00
3    30.00
4    25.00
Name: age, dtype: float64

In [16]:
df[mask].embarked.head() # check embarked column

0    S
1    S
2    S
3    S
4    S
Name: embarked, dtype: object

In [35]:
#Use the .value_counts method to examine the counts of the values:
df.sex.value_counts(dropna=False) # How many male and female
# Assign dropna to false if you don't want to delete the missing values

sex
0.0    1309
Name: count, dtype: int64

In [18]:
#Use the .value_counts method to examine the counts of the values:
df.embarked.value_counts(dropna=False)

embarked
S      914
C      270
Q      123
NaN      2
Name: count, dtype: int64

In [19]:
#Use the .value_counts method to examine the counts of the values:
df.age.value_counts(dropna=False)

age
NaN      263
24.00     47
22.00     43
21.00     41
30.00     40
        ... 
66.00      1
0.67       1
76.00      1
67.00      1
26.50      1
Name: count, Length: 99, dtype: int64

In [20]:
#Delete raws with high percentage of missing values
df = df.drop(
     columns=[
         "name",
         "ticket",
         "home.dest",
         "boat",
         "body",
         "cabin",
     ]
 )

In [21]:
#Use the attribute describe to check whether you managed to delete the columns
#Compare it with the above df.describe()
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,2.294882,0.381971,29.881138,0.498854,0.385027,33.295479
std,0.837836,0.486055,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,0.17,0.0,0.0,0.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958
50%,3.0,0.0,28.0,0.0,0.0,14.4542
75%,3.0,1.0,39.0,1.0,0.0,31.275
max,3.0,1.0,80.0,8.0,9.0,512.3292


In [23]:
#Working with missing data

#Populate age missing values with thier median

df['age'] = df['age'].fillna(df['age'].median())

#Populate embarked missing values with high occurrence value

df['embarked'] = df['embarked'].fillna('S')

# map sex to a numeric type
df.sex = df.sex.map({'male': 1, 'female': 0})

# map embarked to a numeric type
df.embarked = df.embarked.map({'S': 2, 'C': 1, 'Q':0})

#fill any other missing value with 0 (is not good practice but to avoid common error of NaN value still exist)
df.fillna(0,inplace=True)

In [24]:
print(df.sex)

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1304    0.0
1305    0.0
1306    0.0
1307    0.0
1308    0.0
Name: sex, Length: 1309, dtype: float64


In [25]:
#Splitting data into training and testing datasets
from sklearn.model_selection import train_test_split
#Assign survived column (targets) to y
y = df.survived
#Delete survived column from X (samples)
X = df.drop(columns="survived")
#Now split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=11)

In [26]:
#check the y_train (target)
print(y_train)

1182    1
1106    0
558     1
1125    0
793     0
       ..
583     1
332     0
1293    0
1115    0
1104    0
Name: survived, Length: 916, dtype: int64


In [27]:
#check the X_train (samples)
print(X_train)

      pclass  sex   age  sibsp  parch     fare  embarked
1182       3  0.0  21.0      0      0   7.6500       0.0
1106       3  0.0  41.0      0      5  39.6875       0.0
558        2  0.0  18.0      0      2  13.0000       0.0
1125       3  0.0  24.0      0      0   8.0500       0.0
793        3  0.0  47.0      0      0   7.2500       0.0
...      ...  ...   ...    ...    ...      ...       ...
583        2  0.0  40.0      0      0  15.7500       0.0
332        2  0.0  23.0      0      0  10.5000       0.0
1293       3  0.0  28.0      0      0   8.0500       0.0
1115       3  0.0  28.0      0      0   7.7750       0.0
1104       3  0.0  16.0      4      1  39.6875       0.0

[916 rows x 7 columns]


In [28]:
#call the ML algorithm
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [29]:
#Get the predicted and expected
#Can you tell what is predicted and expected values represent?
# predicted = what the model thinks the answer should be
# expected = the actual correct answers from the dataset

#Can you derive the misclassified values (wrong)
# misclassified values are where predicted does not match expected
predicted = clf.predict(X=X_test)
expected = y_test


In [30]:
#Now print the model accuracy
print(f'{clf.score(X_test, y_test):.2%}')
clf.predict(X_test)

68.45%


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,

In [None]:
#Save and test the model
import pickle

# Save the trained model as a pickle string.
saved_model = pickle.dumps(clf)

# Load the pickled model
clf_from_pickle = pickle.loads(saved_model)

# Use the loaded pickled model to make predictions
clf_from_pickle.predict(X_test)

In [None]:
# To save a machine learning model produced by scikit-learn (sklearn), you can use Python's joblib library,
# which is often preferred for saving scikit-learn models due to its efficiency.

#First, ensure you have the joblib library installed. If you don't have it, you can install it using pip:

# pip install joblib

# Once you have your scikit-learn model trained and ready to save, import joblib:
import joblib


# Save the model as a pickle in a file
# You can use the joblib.dump() function. Provide the model and the file path where you want to save it:
joblib.dump(clf, 'filename.pkl')

# Load the model from the file
# Your scikit-learn model is now saved to the specified file with the ".pkl" extension.
# To load the model at a later time for use, you can use joblib.load():
clf_from_joblib = joblib.load('filename.pkl')

# Use the loaded model to make predictions
clf_from_joblib.predict(X_test)

# This process allows you to save and load scikit-learn models efficiently, preserving their state for future use without the need to retrain them.