<a href="https://colab.research.google.com/github/EMMA-max-bit/AI-Engineering-Regression-Analysis/blob/main/Techcrush_Diploma_AI_Engineering_Program_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split #Training data (used to teach the model), Testing data (used to check how well the model learned)
from sklearn.linear_model import LogisticRegression  #Predict yes/no, 0/1, or true/false outcomes
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score

In [None]:
# Read the passenger records from the CSV that sits next to this notebook.
csv_path = 'TITANIC.csv'
df = pd.read_csv(csv_path)

In [None]:
# Dimensions of the freshly loaded frame as (rows, columns);
# the bare last expression is what the notebook displays.
df.shape

(418, 12)

In [None]:
# Structure of the data: column names, non-null counts and dtypes.
# The non-null counts are the quickest way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [None]:
# Count the missing entries in every column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1


In [None]:
# Any missing Age values are replaced with the median age.
# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on df['Age']: the chained in-place form is
# deprecated and stops updating df under pandas 3.0.
df['Age'] = df['Age'].fillna(df['Age'].median())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


In [None]:
# Any missing Fare values are replaced with the median Fare (numerical column).
# The result is assigned back to the column because the chained
# df['Fare'].fillna(..., inplace=True) form is deprecated and stops
# updating df under pandas 3.0.
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)


In [None]:
# Handle missing data in a text (categorical) column: every missing Cabin
# becomes the placeholder label 'Unknown'.
# Assigned back explicitly because the chained
# df['Cabin'].fillna(..., inplace=True) form is deprecated and stops
# updating df under pandas 3.0.
df['Cabin'] = df['Cabin'].fillna('Unknown')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('Unknown', inplace=True)


In [None]:
# Safety net after the fills above: discard every row that still holds
# at least one missing value (NaN) in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Dimensions of the data after cleaning, as (rows, columns).
# Comparing this with the shape printed right after loading shows how many
# rows the dropna step removed.
df.shape

(418, 12)

In [None]:
# Preview the first five rows of the cleaned data.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Unknown,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,Unknown,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Unknown,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,Unknown,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,Unknown,S


In [None]:
from sklearn.preprocessing import LabelEncoder # LabelEncoder turns categories (words) into integers.

# Fit one encoder per categorical column and keep each of them, so every
# integer code can later be traced back to its label via
# encoders[col].classes_ / encoders[col].inverse_transform(...).
# Refitting a single shared encoder would only remember the last column.
encoders = {}
for column in ('Sex', 'Cabin', 'Embarked'):
    encoders[column] = LabelEncoder()
    # New column '<name>_Encoded' holds the integer code of each label,
    # e.g. Sex: female=0, male=1; Embarked: C=0, Q=1, S=2; every distinct
    # Cabin string (including the 'Unknown' placeholder) gets its own code.
    df[f'{column}_Encoded'] = encoders[column].fit_transform(df[column])

# `le` stays bound to the encoder fitted last (Embarked), as it was before.
le = encoders['Embarked']

In [None]:
# Display just the three label-encoded columns (Sex, Cabin, Embarked).
encoded_columns = ['Sex_Encoded', 'Cabin_Encoded', 'Embarked_Encoded']
df[encoded_columns]

Unnamed: 0,Sex_Encoded,Cabin_Encoded,Embarked_Encoded
0,1,76,1
1,0,76,2
2,1,76,1
3,1,76,2
4,0,76,2
...,...,...,...
413,1,76,2
414,0,22,0
415,1,76,2
416,1,76,2


In [None]:
# One-hot encode Sex: instead of a single 0/1 column, create one indicator
# column per category. dtype=int yields 0/1 integers rather than
# True/False, matching the other encoded columns in this notebook.
df_sex = pd.get_dummies(df['Sex'], dtype=int)

In [None]:
# One indicator column per Sex category, converted to 0/1 integers.
sex_indicator = pd.get_dummies(df['Sex'])
df_sex = sex_indicator.astype(int)

In [None]:
# Display the one-hot encoded Sex frame built above
# (one 0/1 indicator column per category).
df_sex

Unnamed: 0,female,male
0,0,1
1,1,0
2,0,1
3,0,1
4,1,0
...,...,...
413,0,1
414,1,0
415,0,1
416,0,1


In [None]:
# One indicator column per embarkation port: 1 where the passenger boarded
# at that port, 0 otherwise.
df_embarked = pd.get_dummies(df['Embarked'], dtype=int)

In [None]:
# Display the one-hot encoded Embarked frame built above
# (one 0/1 indicator column per port).
df_embarked

Unnamed: 0,C,Q,S
0,0,1,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,0,1
...,...,...,...
413,0,0,1
414,1,0,0
415,0,0,1
416,0,0,1


In [None]:
# Turn Embarked into one-hot columns inside df, then make them clean integers
# for machine learning. Later cells read `embarked_cols` and the
# Embarked_C / Embarked_Q / Embarked_S columns, so this step has to run
# for the notebook to work from a fresh kernel.
if 'Embarked' in df.columns:
    # get_dummies replaces the source column, so the guard keeps this cell
    # safe to re-run after 'Embarked' is already gone.
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')

# Every column whose name starts with 'Embarked_' — this also picks up the
# label-encoded 'Embarked_Encoded' column when it exists.
embarked_cols = [col for col in df.columns if col.startswith('Embarked_')]
df[embarked_cols] = df[embarked_cols].astype(int)

In [None]:
# Display every Embarked-derived column. The column list is derived here
# rather than read from a variable set in another cell, so this cell does
# not depend on leftover kernel state.
df[[col for col in df.columns if col.startswith('Embarked_')]]

Unnamed: 0,Embarked_Encoded,Embarked_C,Embarked_Q,Embarked_S
0,1,0,1,0
1,2,0,0,1
2,1,0,1,0
3,2,0,0,1
4,2,0,0,1
...,...,...,...,...
413,2,0,0,1
414,0,1,0,0
415,2,0,0,1
416,2,0,0,1


In [None]:
# Replace the Sex labels with integers in place (male=1, female=0).
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell leaves already converted 0/1 values alone instead of
# turning the whole column into NaN.
sex_codes = {'male': 1, 'female': 0}
df['Sex'] = df['Sex'].map(lambda value: sex_codes.get(value, value))

In [None]:
# List the column names currently in df, to confirm which encoded columns exist.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Sex_Encoded', 'Cabin_Encoded',
       'Embarked_Encoded', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [None]:
# Show the label-encoded Embarked column next to its one-hot counterparts.
embarked_view = [
    'Embarked_Encoded',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
]
df[embarked_view]

Unnamed: 0,Embarked_Encoded,Embarked_C,Embarked_Q,Embarked_S
0,1,0,1,0
1,2,0,0,1
2,1,0,1,0
3,2,0,0,1
4,2,0,0,1
...,...,...,...,...
413,2,0,0,1
414,0,1,0,0
415,2,0,0,1
416,2,0,0,1


In [None]:
# Disabled alternative: hand-written ordinal encoding of Embarked.
# NOTE(review): the recorded df.columns output no longer lists an 'Embarked'
# column, so enabling this line as-is would presumably raise a KeyError —
# confirm before re-enabling, or delete this cell.
# df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

In [None]:
# Feature matrix: the numeric passenger attributes, the 0/1 Sex column and
# the three one-hot Embarked indicators.
feature_columns = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Sex',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
]
X = df[feature_columns]

In [None]:
# Target vector: the Survived label of every passenger.
y = df.loc[:, 'Survived']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Logistic-regression classifier. The default iteration cap stopped the
# solver before convergence on these unscaled features (see the
# "TOTAL NO. OF ITERATIONS REACHED LIMIT" warning recorded below), so give
# the optimiser a larger budget.
model = LogisticRegression(max_iter=1000)

In [None]:
# Train the classifier on the training split only; the test split stays
# unseen until evaluation.
model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Predicted class labels for the held-out test rows.
y_pred = model.predict(X=X_test)

In [None]:
# Share of test rows whose predicted label equals the true label.
# NOTE(review): the recorded run scores 1.0 on every metric. In the df.head()
# rows shown earlier, Survived is 1 exactly for the female passengers, so Sex
# may fully determine the label in this file — verify the labels before
# trusting this score.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [None]:
# Precision: of the passengers predicted as survivors, the share that truly survived.
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

Precision: 1.0


In [None]:
# Recall: of the passengers who truly survived, the share the model found.
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

Recall: 1.0


In [None]:
# F1: harmonic mean of precision and recall.
f1 = f1_score(y_test, y_pred)
print("F1_score", f1)

F1_score 1.0


In [None]:
# Confusion matrix of the test predictions: rows are the true classes,
# columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the confusion matrix; off-diagonal entries are misclassified test rows.
cm

array([[50,  0],
       [ 0, 34]])