In [None]:
# Location of the raw penguins CSV that this notebook reads.
original_data = "data/penguins_combined.csv"


## unprocessed_df = original unprocessed data

In [None]:
import pandas as pd

In [None]:
# Load the raw CSV into a DataFrame; nothing is cleaned at this point.
unprocessed_df = pd.read_csv(filepath_or_buffer=original_data)

In [None]:
# Absolute count of each value in the sex column.
# (value_counts defaults to normalize=False, i.e. counts rather than proportions.)
print(unprocessed_df["sex"].value_counts())

In [None]:
# Number of rows whose sex value is missing.
print(int(unprocessed_df["sex"].isna().sum()))

In [None]:
# Preview the first five rows of the raw data.
unprocessed_df.head(n=5)

In [None]:
# Count of each value in the sex column (rich display of the resulting Series).
unprocessed_df.loc[:, "sex"].value_counts()

In [None]:
# Count of rows per species in the raw data.
unprocessed_df["species"].value_counts(normalize=False)

In [None]:
# Number of rows whose sex value is missing (shown as the cell output).
int(unprocessed_df["sex"].isna().sum())

In [None]:
# Rows of the unprocessed frame with a missing value in at least one column.
# The selection is built once and reused, then its row count is printed.
rows_with_missing = unprocessed_df[unprocessed_df.isna().any(axis=1)]
print(len(rows_with_missing))


In [None]:
import pandas as pd

# Keep only the rows in which every column holds a value, then show the
# resulting (rows, columns) shape.
fresh_df = unprocessed_df.dropna(axis=0, how="any")
fresh_df.shape

In [None]:
# Preprocess the categorical features: replace each label with an integer code.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

categorical_features = ['species', 'island', 'sex']

# fresh_df comes from dropna() on unprocessed_df; assigning columns into it
# directly can trigger pandas' SettingWithCopyWarning, so take an explicit copy.
fresh_df = fresh_df.copy()

# Keep every fitted encoder (keyed by column) so integer codes can be mapped
# back to their labels later with label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    fresh_df[col] = le.fit_transform(fresh_df[col])
    label_encoders[col] = le
    # LabelEncoder numbers the sorted labels 0..k-1; print the mapping so that
    # later steps which filter on a code can be checked against it.
    print(f"{col}: {dict(enumerate(le.classes_.tolist()))}")

# NOTE: this cell is not idempotent. Run it once per fresh_df; re-running it on
# already-encoded (or already-filtered) columns renumbers the codes.

In [None]:
# Preview the first five rows after encoding.
fresh_df.head(n=5)

In [None]:
# Distinct encoded values present in the sex column.
fresh_df.loc[:, "sex"].unique()

In [None]:
# Drop every row whose encoded sex value equals 0, then show the new shape.
# NOTE(review): code 0 is whichever sex label sorts first under LabelEncoder;
# confirm that this is the label meant to be removed.
fresh_df = fresh_df.loc[fresh_df["sex"].ne(0)]
fresh_df.shape

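Encoded values in the `sex` column:

- 1 - male
- 2 - female
- 3 - dot
- 0 - NaN

Note: rows containing NaN are dropped before encoding, and `LabelEncoder` assigns codes 0, 1, 2, ... in sorted label order, so confirm this mapping against the fitted encoder's `classes_` before relying on it.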

In [None]:
# Split the dataset into train (60%), validation (20%) and test (20%) sets.
from sklearn.model_selection import train_test_split

# Stratify on the target so every split keeps the species proportions. Without
# it, a small dataset can leave a class under-represented or absent in the
# validation/test split, and the multiclass ROC AUC computed later needs every
# class to be present in y_val / y_test.
train_val_df, test_df = train_test_split(
    fresh_df,
    test_size=0.2,
    random_state=42,
    stratify=fresh_df['species'],
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,  # 0.25 x 0.8 = 0.2 of the full dataset
    random_state=42,
    stratify=train_val_df['species'],
)
print(f"Train shape: {train_df.shape}, Val shape: {val_df.shape}, Test shape: {test_df.shape}")

# Sanity check: the three splits partition fresh_df without losing rows.
assert len(train_df) + len(val_df) + len(test_df) == len(fresh_df)

# Separate features and target variable
X_train = train_df.drop(columns=['species'])
y_train = train_df['species']
X_val = val_df.drop(columns=['species'])
y_val = val_df['species']
X_test = test_df.drop(columns=['species'])
y_test = test_df['species']


In [None]:
# Random Forest classifier trained on the original data, then scored on the
# validation and test splits (classification report, accuracy, one-vs-rest ROC AUC).
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# --- validation split ---
y_val_pred = rf.predict(X_val)
y_val_prob = rf.predict_proba(X_val)  # per-class probabilities for the ROC AUC
print("Validation Set Results:")
print("Random Forest Classifier on Original Data")
print(classification_report(y_val, y_val_pred))
print("Accuracy:", accuracy_score(y_val, y_val_pred))
roc_auc = roc_auc_score(y_val, y_val_prob, multi_class='ovr')
print("ROC AUC:", roc_auc)

# --- test split ---
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)
print("________________________________")
print("Test Set Results:")
print("Random Forest Classifier on Original Data")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
print("ROC AUC:", roc_auc)




In [None]:
# Histogram-based gradient boosting classifier trained on the original data,
# then scored on the validation and test splits.
from sklearn.ensemble import HistGradientBoostingClassifier

# fit() returns the estimator itself, so construction and training can be chained.
hgb = HistGradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# --- validation split ---
y_val_pred_hgb = hgb.predict(X_val)
y_val_prob_hgb = hgb.predict_proba(X_val)  # per-class probabilities for the ROC AUC
print("Validation Set Results:")
print("Histogram-based Gradient Boosting Classifier on Original Data")
print(classification_report(y_val, y_val_pred_hgb))
print("Accuracy:", accuracy_score(y_val, y_val_pred_hgb))
roc_auc_hgb = roc_auc_score(y_val, y_val_prob_hgb, multi_class='ovr')
print("ROC AUC:", roc_auc_hgb)

# --- test split ---
y_pred_hgb = hgb.predict(X_test)
y_prob_hgb = hgb.predict_proba(X_test)
print("---------------------")
print("Test Set Results:")
print("Histogram-based Gradient Boosting Classifier on Original Data")
print(classification_report(y_test, y_pred_hgb))
print("Accuracy:", accuracy_score(y_test, y_pred_hgb))
roc_auc_hgb = roc_auc_score(y_test, y_prob_hgb, multi_class='ovr')
print("ROC AUC:", roc_auc_hgb)

In [None]:
# Logistic regression trained on the original data, then scored on the
# validation and test splits.
# NOTE(review): the features are passed in unscaled and the integer-coded
# island/sex columns are treated as plain numbers; the large max_iter may be
# compensating for that. Consider a scaler (and one-hot encoding), then confirm.
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=20000)
log_reg.fit(X_train, y_train)

# The result names below (y_val_pred, y_val_prob, roc_auc, y_pred, y_prob) are
# the same ones the Random Forest cell assigns, so they are overwritten here.

# --- validation split ---
y_val_pred = log_reg.predict(X_val)
y_val_prob = log_reg.predict_proba(X_val)  # per-class probabilities for the ROC AUC
print("Validation Set Results:")
print("Logistic Regression on Original Data")
print(classification_report(y_val, y_val_pred))
print("Accuracy:", accuracy_score(y_val, y_val_pred))
roc_auc = roc_auc_score(y_val, y_val_prob, multi_class='ovr')
print("ROC AUC:", roc_auc)

# --- test split ---
y_pred = log_reg.predict(X_test)
y_prob = log_reg.predict_proba(X_test)
print("________________________________")
print("Test Set Results:")
print("Logistic Regression on Original Data")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
print("ROC AUC:", roc_auc)