# APP WEB WITH STREAMLIT - Working in group

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import os

### 0. Data ingestion

In [2]:
titanic_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
try:
    df = pd.read_csv(titanic_url)
    print("Titanic dataset loaded successfully.")
    print("DataFrame head:")
    print(df.head())
    print("\nDataFrame info:")
    df.info()
    print("\nSurvival (target) value counts:")
    print(df["Survived"].value_counts())
except Exception as e:
    print(f"Error loading Titanic dataset: {e}")
    print("Please ensure the URL is correct or check your internet connection.")

Titanic dataset loaded successfully.
DataFrame head:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4 

### 1. Basic EDA and Preprocesing

In [3]:
df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1)
print("Dropped 'PassengerId', 'Name', 'Ticket', 'Cabin' columns.")
df['Age'].fillna(df['Age'].median(), inplace = True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace = True)
print("Handled missing values in 'Age' (median) and 'Embarked' (mode).")
print('\nMissing values after handling:')
print(df.isnull().sum())
X = df.drop('Survived', axis = 1)
y = df['Survived']
print(f'\nShape of X: {X.shape}')
print(f'\nShape of y: {y.shape}')
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch']
categorical_features = ['Pclass', 'Sex', 'Embarked']
numerical_transformer = Pipeline(steps = [
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])
categorical_transformer = Pipeline(steps = [
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])
preprocessor = ColumnTransformer(
    transformers =[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
print("\nPreprocessing pipeline defined.")

Dropped 'PassengerId', 'Name', 'Ticket', 'Cabin' columns.
Handled missing values in 'Age' (median) and 'Embarked' (mode).

Missing values after handling:
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

Shape of X: (891, 7)

Shape of y: (891,)

Preprocessing pipeline defined.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace = True)
