# Question 02

In [30]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Load the penguins dataset and discard every row that has a missing value.
# The cleaned frame is assigned directly instead of using dropna(inplace=True),
# so no DataFrame is mutated in place.
df = sns.load_dataset("penguins").dropna()

# Restrict the data to a binary problem: keep only 'Adelie' and 'Chinstrap' rows.
# .copy() yields an independent frame, so adding a column below does not write
# into a slice of `df`.
selected_classes = ['Adelie', 'Chinstrap']
df_filtered = df[df['species'].isin(selected_classes)].copy()

# Encode the species names as integer class labels and keep them both as a
# standalone array (y_encoded) and as a new column on the filtered frame.
le = LabelEncoder()
y_encoded = le.fit_transform(df_filtered['species'])
df_filtered['class_encoded'] = y_encoded

In [31]:
# Inspect the filtered frame, including the newly added class_encoded column
print(df_filtered)

       species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0       Adelie  Torgersen            39.1           18.7              181.0   
1       Adelie  Torgersen            39.5           17.4              186.0   
2       Adelie  Torgersen            40.3           18.0              195.0   
4       Adelie  Torgersen            36.7           19.3              193.0   
5       Adelie  Torgersen            39.3           20.6              190.0   
..         ...        ...             ...            ...                ...   
215  Chinstrap      Dream            55.8           19.8              207.0   
216  Chinstrap      Dream            43.5           18.1              202.0   
217  Chinstrap      Dream            49.6           18.2              193.0   
218  Chinstrap      Dream            50.8           19.0              210.0   
219  Chinstrap      Dream            50.2           18.7              198.0   

     body_mass_g     sex  class_encoded  
0        

In [32]:
# Show each row's species name next to its integer label
print(df_filtered[['species', 'class_encoded']])

# Target variable: the integer-encoded species label
y = df_filtered['class_encoded']

# Features: remove the target column first, then keep only the numeric columns
# (this leaves out the non-numeric columns such as species, island and sex)
X = df_filtered.drop(columns=['class_encoded']).select_dtypes(include=['number'])

       species  class_encoded
0       Adelie              0
1       Adelie              0
2       Adelie              0
4       Adelie              0
5       Adelie              0
..         ...            ...
215  Chinstrap              1
216  Chinstrap              1
217  Chinstrap              1
218  Chinstrap              1
219  Chinstrap              1

[214 rows x 2 columns]


In [33]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: 'saga' solver on the raw (unscaled) features with default settings.
# NOTE(review): scikit-learn documents that saga's fast convergence is only
# guaranteed when features share a similar scale, so a weak fit is plausible
# here — presumably this cell is the "before scaling" half of the comparison.
# random_state is pinned because saga shuffles the data; this also matches the
# other LogisticRegression fits in this notebook, which all pass random_state=42.
logreg = LogisticRegression(solver='saga', random_state=42)
logreg.fit(X_train, y_train)

# Predict on the held-out test data
y_pred = logreg.predict(X_test)

# Report test accuracy together with the fitted parameters
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Learned weights (coefficients):", logreg.coef_)
print("Intercept:", logreg.intercept_)

Accuracy: 0.5813953488372093
Learned weights (coefficients): [[ 2.76147463e-03 -8.21753401e-05  4.67173456e-04 -2.86833709e-04]]
Intercept: [-8.43902417e-06]




In [34]:
# Fit the 'liblinear' solver on the same raw (unscaled) training split;
# fit() returns the estimator itself, so construction and fitting are chained.
logreg_liblinear = LogisticRegression(
    solver='liblinear', random_state=42
).fit(X_train, y_train)

# Score the predictions on the held-out test split
y_pred_liblinear = logreg_liblinear.predict(X_test)
accuracy_liblinear = accuracy_score(y_test, y_pred_liblinear)
print(f"Accuracy with 'liblinear': {accuracy_liblinear:.4f}")

Accuracy with 'liblinear': 1.0000


In [35]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to the test split, so no
# statistics from the test rows enter the fit.
# (StandardScaler is imported in the notebook's top import cell.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 'saga' on the standardized features, with the iteration budget set to 1000
logreg_saga_scaled = LogisticRegression(
    solver='saga', max_iter=1000, random_state=42)
logreg_saga_scaled.fit(X_train_scaled, y_train)
y_pred_saga_scaled = logreg_saga_scaled.predict(X_test_scaled)
accuracy_saga_scaled = accuracy_score(y_test, y_pred_saga_scaled)

# 'liblinear' on the same standardized features, for a like-for-like comparison
logreg_liblinear_scaled = LogisticRegression(
    solver='liblinear', random_state=42)
logreg_liblinear_scaled.fit(X_train_scaled, y_train)
y_pred_liblinear_scaled = logreg_liblinear_scaled.predict(X_test_scaled)
accuracy_liblinear_scaled = accuracy_score(y_test, y_pred_liblinear_scaled)

# Report both test accuracies side by side
print(f"Accuracy with 'saga' (scaled): {accuracy_saga_scaled:.4f}")
print(f"Accuracy with 'liblinear' (scaled): {accuracy_liblinear_scaled:.4f}")

Accuracy with 'saga' (scaled): 0.9767
Accuracy with 'liblinear' (scaled): 0.9767
