In [1]:
# 25-nov-2023
# CSC461 – Assignment3 – Machine Learning
# Esha Naeem
# FA20-BSE-057(A)
# A brief description of the task:The task involves employing the Random Forest classification algorithm for gender prediction, utilizing two distinct cross-validation strategies: Monte Carlo and Leave P-Out. In the Monte Carlo approach, the dataset is randomly divided into training and test sets across multiple iterations, and the F1 score is averaged over these iterations. On the other hand, the Leave P-Out strategy involves systematically using P instances as the test set while training the model on the remaining data, repeating this process for all possible test sets. The aim is to evaluate the Random Forest model's performance under both cross-validation techniques and report F1 scores, which strike a balance between precision and recall.
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, LeavePOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [3]:
from google.colab import drive
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/Datasets/gender-prediction (1).csv"
df = pd.read_csv(file_path)

Mounted at /content/drive


In [4]:
df_encoded = pd.get_dummies(df, columns=['beard', 'hair_length', 'scarf', 'eye_color'], drop_first=True)

X = df_encoded.drop(columns=['gender'])
y = df_encoded['gender']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Monte Carlo cross-validation with error debugging
# Monte Carlo cross-validation parameters
n_splits = 5
test_size = 0.2
random_state = 42

# Monte Carlo cross-validation with error debugging
monte_carlo_cv = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
try:
    monte_carlo_f1_scores = cross_val_score(rf_classifier, X, y, cv=monte_carlo_cv, scoring='f1_macro', error_score='raise')
except Exception as e:
    print("Error:", e)
    raise e

# Print the Monte Carlo Cross-Validation parameters
print(f"Monte Carlo Cross-Validation Parameters: n_splits={n_splits}, test_size={test_size}, random_state={random_state}")

# Print the F1 scores for each iteration of Monte Carlo Cross-Validation
print("\nMonte Carlo Cross-Validation F1 Scores:")
for i, score in enumerate(monte_carlo_f1_scores, 1):
    print(f"Iteration {i}: {score}")

# Leave P-Out cross-validation (e.g., P=2)
leave_p_out = LeavePOut(p=1)
leave_p_out_f1_scores = cross_val_score(rf_classifier, X, y, cv=leave_p_out, scoring='f1_macro')

# Report F1 scores for Leave P-Out Cross-Validation
print("\nLeave P-Out Cross-Validation F1 Scores:", leave_p_out_f1_scores)





Monte Carlo Cross-Validation Parameters: n_splits=5, test_size=0.2, random_state=42

Monte Carlo Cross-Validation F1 Scores:
Iteration 1: 1.0
Iteration 2: 0.8633540372670807
Iteration 3: 0.9494252873563218
Iteration 4: 0.9544513457556936
Iteration 5: 0.9494252873563218

Leave P-Out Cross-Validation F1 Scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
