In [None]:
# Dataset source: https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# 1.1 Import the Titanic dataset from the CSV file.
# NOTE(review): this looks like a Google Colab Drive mount path; presumably Drive
# has to be mounted before this cell runs. Adjust DATA_PATH for other environments.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/PDAI LIthan/titanic.csv'
df = pd.read_csv(DATA_PATH)

# 1.2 Perform initial data checks to identify the number of rows and columns in the dataset.
print('Shape (rows, columns):', df.shape)
df.info()

# 1.3 Identify and display the count of null values in the 'Age' and 'Cabin' columns.
# df.info() only reports non-null counts, so the null counts are computed explicitly.
print(df[['Age', 'Cabin']].isnull().sum())

# 2.1 Fill the missing values in the 'Age' column using the mean value.
# 2.2 Fill the missing values in the 'Fare' column using the median value.
# 2.3 Fill the missing values in the 'Embarked' column with the most common value ('S').
# The result is assigned back to the column instead of calling
# df[col].fillna(..., inplace=True): the in-place form operates on a selected
# column (chained assignment), which triggers a FutureWarning in pandas 2.x and
# does not update df when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
df['Embarked'] = df['Embarked'].fillna('S')

# 3.1 Convert the 'Age' column to an integer type (fractional ages are truncated).
df['Age'] = df['Age'].astype(int)
# 3.2 Create a new binary feature 'Cabin_Exist' indicating the presence or absence of cabin information.
# 1 = a cabin value is recorded, 0 = it is missing. notnull() is used so the flag
# matches its name (isnull() would mark rows WITHOUT a cabin as "existing").
df['Cabin_Exist'] = df['Cabin'].notnull().astype(int)
# 3.3 Group the 'Age' and 'Fare' columns into quartiles, creating new features 'Age_Group' and 'Fare_Range'.
# labels=False yields the integer bin index (0-3) rather than interval labels.
df['Age_Group'] = pd.qcut(df['Age'], q=4, labels=False)
df['Fare_Range'] = pd.qcut(df['Fare'], q=4, labels=False)
# 3.4 Create a 'Family' feature by combining 'Parch' and 'SibSp'.
df['Family'] = df['SibSp'] + df['Parch']
# 3.5 Perform feature selection by dropping irrelevant columns.
# The raw columns that were replaced by engineered features are removed, along
# with free-text fields that are not used by the model.
df = df.drop(columns=['Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin'])
# 'PassengerId' is a row identifier, not a predictor; leaving it in would feed an
# arbitrary running number to the model. errors='ignore' keeps this safe if the
# column is absent from the loaded file.
df = df.drop(columns=['PassengerId'], errors='ignore')

# Data encoding
# Separate the target from the features, then hold out 30% of the rows for testing.
label_y = df['Survived']
df = df.drop(columns='Survived')
x_train, x_test, y_train, y_test = train_test_split(df, label_y, test_size=0.30, random_state=0)

# Encode categorical data into binary form using one-hot encoding.
# The training split defines the feature layout (first level of each category dropped).
x_train_encoded = pd.get_dummies(x_train, drop_first=True)
# The test split is encoded with ALL levels and then aligned to the training
# columns. Encoding it independently with drop_first=True could drop a different
# level, or produce a different column set/order, whenever a category is missing
# from one split. Levels absent from the test split become all-zero columns;
# levels not seen in training are discarded.
x_test_encoded = (
    pd.get_dummies(x_test, drop_first=False)
    .reindex(columns=x_train_encoded.columns, fill_value=0)
)

# Task 5: Data Scaling
# The scaler is fitted on the training features only and then re-applied to the
# test features, so the test split does not influence the fitted scaling.
train_columns = x_train_encoded.columns.tolist()
scaler = MinMaxScaler()

# Each scaled result is wrapped back into a DataFrame labelled with the
# training column names.
x_train_scaled = pd.DataFrame(scaler.fit_transform(x_train_encoded), columns=train_columns)
x_test_scaled = pd.DataFrame(scaler.transform(x_test_encoded), columns=train_columns)

# Task 6: Model Training and Evaluation
# Fit a logistic regression with default settings on the scaled training data.
lr = LogisticRegression()
lr.fit(x_train_scaled, y_train)

# Accuracy: share of test rows whose predicted class matches the true label.
predictions = lr.predict(x_test_scaled)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

# AUC: computed from the predicted probabilities in column 1 of predict_proba's output.
y_scores = lr.predict_proba(x_test_scaled)
positive_class_scores = y_scores[:, 1]
auc = roc_auc_score(y_test, positive_class_scores)
print(f'AUC: {auc:.2f}')