In [None]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load the Wine dataset
wine_data = load_wine()
X, y = wine_data.data, wine_data.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Random Forest model
rf_model = RandomForestClassifier()

# Define hyperparameters to tune
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train, y_train)

# Get the best model
best_rf_model = random_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='recall_weighted')

# Fit the best model on the full training set
best_rf_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred = best_rf_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


Cross-Validation Accuracy: 0.9716748768472907
Cross-Validation Precision: 0.9694276030482927
Cross-Validation Recall: 0.9785714285714286
Test Accuracy: 1.0
Test Precision: 1.0
Test Recall: 1.0


In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import OneHotEncoder

# Fetch the Mushroom dataset
mushroom = fetch_openml(name='mushroom')

# Split the dataset into features (X) and target labels (y)
X, y = mushroom.data, mushroom.target

# Convert target labels to integers
y = np.array([1 if label == 'e' else 0 for label in y])

# Perform one-hot encoding for categorical variables in X
encoder = OneHotEncoder(sparse=False)
X_encoded = encoder.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Define the model
rf_model = RandomForestClassifier()

# Define hyperparameters to tune
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train, y_train)

# Get the best model
best_rf_model = random_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='recall_weighted')

# Fit the best model on the full training set
best_rf_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred = best_rf_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


  warn(
  warn(


Cross-Validation Accuracy: 1.0
Cross-Validation Precision: 1.0
Cross-Validation Recall: 1.0
Test Accuracy: 1.0
Test Precision: 1.0
Test Recall: 1.0


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
data = pd.read_csv(url, names=names)

# Convert categorical variables into numerical values
data = pd.get_dummies(data)

# Split features and target label
X = data.drop(columns=['class_unacc', 'class_acc', 'class_good', 'class_vgood'])
y = data[['class_unacc', 'class_acc', 'class_good', 'class_vgood']]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
rf_model = RandomForestClassifier()

# Define hyperparameters to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = np.mean(cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy'))

# Fit the best model on the full training set
best_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)

print("Cross-Validation Accuracy:", cv_accuracy)
print("Test Accuracy:", accuracy)


Cross-Validation Accuracy: 0.933401349866583
Test Accuracy: 0.9450867052023122


In [None]:
# prompt: Cross-Validation Accuracy, precision and recall

print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))


Cross-Validation Accuracy: 0.933401349866583
Cross-Validation Precision: 1.0
Cross-Validation Recall: 1.0


In [1]:
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Step 1: Load the Spam Email Dataset from your computer
# Replace 'path_to_spam_dataset' with the actual file path where your dataset is located
file_path = '/content/spam_ham_dataset.csv'  # Replace 'path_to_spam_dataset.csv' with your file path
data = pd.read_csv(file_path)

# Assuming your dataset contains two columns: 'text' (email content) and 'label' (spam or ham)
X = data['text']
y = data['label']

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Feature Extraction using TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 4: Define the model
rf_model = RandomForestClassifier()

# Step 5: Define hyperparameters to tune
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Step 6: Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train_tfidf, y_train)

# Step 7: Get the best model
best_rf_model = random_search.best_estimator_

# Step 8: Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_rf_model, X_train_tfidf, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_rf_model, X_train_tfidf, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_rf_model, X_train_tfidf, y_train, cv=5, scoring='recall_weighted')

# Step 9: Fit the best model on the full training set
best_rf_model.fit(X_train_tfidf, y_train)

# Step 10: Evaluate the best model on the test set
y_pred = best_rf_model.predict(X_test_tfidf)

# Step 11: Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Step 12: Print evaluation metrics
print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


Cross-Validation Accuracy: 0.9734051253293143
Cross-Validation Precision: 0.9721498233908236
Cross-Validation Recall: 0.9719546816676304
Test Accuracy: 0.966183574879227
Test Precision: 0.9660559461540551
Test Recall: 0.966183574879227
