# Task 1: Use an appropriate library to read the data file.

In [None]:
import pandas as pd

# Load the dataset from 'data.csv' (a path relative to the working directory)
# and print its first rows as a quick check of what was parsed.
df = pd.read_csv('data.csv')
first_rows = df.head()
print(first_rows)

# Task 2: Remove the following types of data from the dataset:

## 2.1 Invalid Data

In [None]:
# Treat the literal string 'Unknown' as a missing value, then discard every
# row that has at least one missing entry in any column.
df = df.replace('Unknown', pd.NA)
df = df.dropna()

## 2.2 Null Values

In [None]:
# Keep only the first row seen for each distinct 'Name'; any later row that
# repeats an already-seen name is filtered out.
is_repeat = df.duplicated(subset='Name', keep='first')
df = df[~is_repeat]
df.head()

## 2.3 Unknown Values

In [None]:
# Drop every row that still contains at least one missing value, then display
# the resulting frame.
df = df.dropna(axis=0, how='any')
df

# Task 3: Convert categorical values to numerical values.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical 'Sex' column as integer labels.
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

# 'Embarked' is only encoded when the column is present. It gets its own
# encoder so the mapping fitted on 'Sex' (held in `le`) is not overwritten and
# both mappings stay available for inverse_transform.
if 'Embarked' in df.columns:
    embarked_le = LabelEncoder()
    df['Embarked'] = embarked_le.fit_transform(df['Embarked'])

# Remove the 'Name', 'Ticket' and 'Cabin' columns, which are not converted to
# numbers in this cell.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'])

# Preview last: a bare expression is only rendered when it ends the cell, and
# this way the preview reflects the final, fully processed frame.
df.head()

# Task 4: Choose the appropriate columns from the dataset as the features and target.

In [None]:
# Target is the 'Survived' column; the features are every other column that is
# still in `df`.
# NOTE(review): any identifier-like column remaining in `df` becomes a feature
# here — confirm that is intended.
features = df.drop('Survived', axis=1)
target = df['Survived']

# Re-encode 'Pclass' with a fresh LabelEncoder.
le = LabelEncoder()
features['Pclass'] = le.fit_transform(features['Pclass'])

# Preview last: a bare expression is only rendered when it ends the cell, and
# this way the preview shows the features after 'Pclass' has been encoded.
features.head(), target.head()

# Task 5: Choose at least 2 appropriate machine learning algorithms for predicting the target.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Two candidate classifiers for the target; both are constructed with the same
# fixed random_state.
seed = 42
model1 = RandomForestClassifier(random_state=seed)
model2 = LogisticRegression(random_state=seed)

# Task 6: Perform the necessary pre-processing on the dataset.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows before the train/validation/
# test split performed in Task 7, so the held-out rows influence the scaling
# statistics — confirm this is acceptable for the experiment.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Rebuild a DataFrame with the original column names AND the original row
# labels, so the scaled features stay index-aligned with `target` (which still
# carries the index of `df`).
features_scaled_df = pd.DataFrame(
    features_scaled,
    columns=features.columns,
    index=features.index,
)

features_scaled_df.head()

# Task 7: Split the dataset into training, validation, and test datasets.

In [None]:
from sklearn.model_selection import train_test_split

# First split: 60% training, 40% held back. Second split: the held-back part
# is halved into validation and test (20% of the data each).
X_train, X_temp, y_train, y_temp = train_test_split(
    features_scaled_df, target, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report the size of each split (the validation line previously printed the
# training shape by mistake).
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_val: {X_val.shape}")
print(f"Shape of X_test: {X_test.shape}")

# Task 8: Train the machine learning models using the selected algorithms.

In [None]:
# Fit both candidate classifiers on the training split only; the validation
# and test splits are not touched here.
model1.fit(X=X_train, y=y_train)
model2.fit(X=X_train, y=y_train)

# Task 9: Evaluate the models using appropriate metrics.

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the random forest on the validation split: confusion matrix first,
# then the per-class report, then the overall accuracy.
y_pred1 = model1.predict(X_val)
print("Random Forest Model Evaluation:")
for metric in (confusion_matrix, classification_report):
    print(metric(y_val, y_pred1))
print("Accuracy:", accuracy_score(y_val, y_pred1))

In [None]:
# Score the logistic regression on the validation split: confusion matrix
# first, then the per-class report, then the overall accuracy.
y_pred2 = model2.predict(X_val)
print("\nLogistic Regression Model Evaluation:")
for metric in (confusion_matrix, classification_report):
    print(metric(y_val, y_pred2))
print("Accuracy:", accuracy_score(y_val, y_pred2))

# Task 10: Determine the most suitable model based on your experiment.

In [None]:
# Compare the two models on validation accuracy, computing each score once.
rf_val_accuracy = accuracy_score(y_val, y_pred1)
lr_val_accuracy = accuracy_score(y_val, y_pred2)

# Random Forest is chosen only when it is strictly better; a tie goes to
# Logistic Regression.
if rf_val_accuracy > lr_val_accuracy:
    best_model, best_estimator = "Random Forest", model1
else:
    best_model, best_estimator = "Logistic Regression", model2
print(f"The most suitable model is: {best_model}")

# The test split played no part in training or model selection, so use it to
# report how the chosen model performs on unseen data.
test_accuracy = accuracy_score(y_test, best_estimator.predict(X_test))
print(f"Test accuracy of {best_model}: {test_accuracy}")