In [6]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
import seaborn as sns

# Loading the Titanic dataset
titanic_df = sns.load_dataset('titanic')

# Splitting the full dataset into the training and testing datasets
train_data, test_data = train_test_split(titanic_df, test_size=0.3, random_state=42)

# Printing out the shapes of the datasets
print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

Train data shape: (623, 15)
Test data shape: (268, 15)


In [7]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import pandas as pd

# Load and preprocess the Titanic dataset
titanic_df = sns.load_dataset('titanic')

# One-hot encode categorical variables using pandas get_dummies
titanic_preprocessed = pd.get_dummies(titanic_df, columns=['sex', 'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'], drop_first=True)

# Handle any NaN values by filling them with the mean of the column
titanic_preprocessed = titanic_preprocessed.fillna(titanic_preprocessed.mean())

# Split the preprocessed dataset into the training and testing datasets with a 70%-30% split
train_data, test_data = train_test_split(titanic_preprocessed, test_size=0.3, random_state=42)

# Separate the target variable ("survived") from the rest of the training data
x_train = train_data.drop("survived", axis=1)
y_train = train_data["survived"]

# Initialize a Logistic Regression model
logreg = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence

# Training the Logistic Regression model
logreg.fit(x_train, y_train)

#The test dataset
x_test =test_data.drop("survived", axis=1)
y_test = test_data["survived"]

# Using the model to make predictions on the testing dataset
predictions = logreg.predict(x_test) 

# Displaying metrics
print("Classification Report:")
print(classification_report(y_test, predictions)) 

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))  

print("Accuracy Score:")
print(accuracy_score(y_test, predictions))  

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       157
           1       1.00      1.00      1.00       111

    accuracy                           1.00       268
   macro avg       1.00      1.00      1.00       268
weighted avg       1.00      1.00      1.00       268

Confusion Matrix:
[[157   0]
 [  0 111]]
Accuracy Score:
1.0


In [8]:
WITH MinmaxScaler

SyntaxError: invalid syntax (1216103798.py, line 1)

In [None]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import pandas as pd

# Load and preprocess the Titanic dataset
titanic_df = sns.load_dataset('titanic')

# Drop non-numeric columns for simplicity
titanic_df = titanic_df.select_dtypes(include=['float64', 'int64'])

# Handle any NaN values by filling them with the mean of the column
titanic_df = titanic_df.fillna(titanic_df.mean())

# TODO: Use MinMaxScaler to scale the numeric features into a standard range
# Hint: You will need to create an instance of MinMaxScaler, fit it on the data and transform the data
# Separating features and target variable
features = titanic_df.drop("survived", axis=1)
target = titanic_df[["survived"]]

# Scaling the features
scaler = MinMaxScaler()
scaled_features = pd.DataFrame(scaler.fit_transform(features), columns = features.columns)

# Concatenating scaled features with the target variable
titanic_df_scaled = pd.concat([scaled_features, target], axis=1)

# Split the preprocessed dataset into the training and testing datasets with a 70%-30% split
train_data, test_data = train_test_split(titanic_df_scaled, test_size=0.3, random_state=42)

# Separate the target variable ("survived") from the rest of the training data
x_train = train_data.drop("survived", axis=1)
y_train = train_data["survived"]

# Initialize a Logistic Regression model
logreg = LogisticRegression(max_iter=1000)  

# Training the Logistic Regression model
logreg.fit(x_train, y_train)

# Separate the independent (x_test) and dependent (y_test) variables from the testing dataset
x_test = test_data.drop("survived", axis=1)
y_test = test_data["survived"]

# Using the model to make predictions on the testing dataset
predictions = logreg.predict(x_test)

# Displaying metrics
print("Classification Report:")
print(classification_report(y_test, predictions))

print("Accuracy Score:")
print(accuracy_score(y_test, predictions))

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.89      0.78       157
           1       0.74      0.46      0.57       111

    accuracy                           0.71       268
   macro avg       0.72      0.67      0.67       268
weighted avg       0.72      0.71      0.69       268

Accuracy Score:
0.7089552238805971


In [None]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
import seaborn as sns

# Load the Titanic dataset
titanic_df = sns.load_dataset('titanic') # Assuming 'titanic.csv' is in the working directory

# Drop columns with strings and 'pclass', which is categorical but read as a numeric type
titanic_df = titanic_df.select_dtypes(exclude=['object', 'category'])

# Handle any NaN values by filling them with the mean of the column (ignoring 'pclass', which is categorical)
numeric_columns = titanic_df.columns.drop('pclass')
titanic_df[numeric_columns] = titanic_df[numeric_columns].fillna(titanic_df[numeric_columns].mean())

# Convert 'pclass' to integer type if it's not already
titanic_df['pclass'] = titanic_df['pclass'].astype(int)

# Split the dataset into training and testing sets with a 70%-30% split
train_data, test_data = train_test_split(titanic_df, test_size=0.2, random_state=42)

# Identify and separate the target variable 'survived' from the training and testing data
x_train = train_data.drop("survived", axis=1)
y_train = train_data["survived"]

x_test = test_data.drop("survived", axis=1)
y_test = test_data["survived"]

# Initialize StandardScaler and scale the features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Initialize the Logistic Regression model and train it on the scaled training data
logreg = LogisticRegression(max_iter=1000)  
logreg.fit(x_train_scaled, y_train)

# Use the trained model to make predictions on the scaled testing data
predictions = logreg.predict(x_test_scaled)

# Calculate and print the accuracy score
print("Accuracy Score:")
print(accuracy_score(y_test, predictions))


Accuracy Score:
0.8044692737430168
