In [None]:
import numpy as np
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

# Load the Wine dataset
wine = load_wine()
X, y = wine.data, wine.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature Selection
rf_model = RandomForestClassifier()
selector = SelectFromModel(estimator=rf_model).fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Define the model
rf_model = RandomForestClassifier()

# Define hyperparameters to tune
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train_selected, y_train)

# Get the best model
best_rf_model = random_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_rf_model, X_train_selected, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_rf_model, X_train_selected, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_rf_model, X_train_selected, y_train, cv=5, scoring='recall_weighted')

# Fit the best model on the full training set
best_rf_model.fit(X_train_selected, y_train)

# Evaluate the best model on the test set
y_pred = best_rf_model.predict(X_test_selected)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


Cross-Validation Accuracy: 0.9785714285714286
Cross-Validation Precision: 0.9671825396825398
Cross-Validation Recall: 0.9645320197044336
Test Accuracy: 0.9722222222222222
Test Precision: 0.974074074074074
Test Recall: 0.9722222222222222


In [None]:
from google.colab import files
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

# Upload the CSV file
uploaded = files.upload()

# Load the dataset from the uploaded file
for file_name in uploaded.keys():
    data = pd.read_csv(file_name)

# Separate features (X) and target variable (y)
X = data.drop(columns=['label'])
y = data['label']

# Preprocess the data if necessary (e.g., scaling features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the model
rf_model = RandomForestClassifier()

# Define hyperparameters to tune
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train, y_train)

# Get the best model
best_rf_model = random_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='recall_weighted')

# Fit the best model on the full training set
best_rf_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred = best_rf_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


Saving typedCSV.csv to typedCSV.csv


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Cross-Validation Accuracy: 0.8642270752462572
Cross-Validation Precision: 0.8644292903534778
Cross-Validation Recall: 0.8640285936379444
Test Accuracy: 0.871259623779665
Test Precision: 0.871438031087417
Test Recall: 0.871259623779665


In [None]:
from google.colab import files
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

# Load the dataset from the uploaded file
file_name = "typedCSV.csv"  # Replace "your_uploaded_file.csv" with the actual file name
data = pd.read_csv(file_name)

# Separate features (X) and target variable (y)
X = data.drop(columns=['label'])
y = data['label']

# Preprocess the data if necessary (e.g., scaling features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the model
rf_model = RandomForestClassifier(n_estimators=200, max_depth=30, min_samples_split=2, min_samples_leaf=1, max_features='sqrt')

# Use stratified k-fold cross-validation
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the model using cross-validation
cv_accuracy = cross_val_score(rf_model, X_train, y_train, cv=stratified_cv, scoring='accuracy')
cv_precision = cross_val_score(rf_model, X_train, y_train, cv=stratified_cv, scoring='precision_weighted')
cv_recall = cross_val_score(rf_model, X_train, y_train, cv=stratified_cv, scoring='recall_weighted')

# Fit the model on the full training set
rf_model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = rf_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


Cross-Validation Accuracy: 0.8650805510844167
Cross-Validation Precision: 0.8654544087176971
Cross-Validation Recall: 0.8646042385417161
Test Accuracy: 0.8747519644416224
Test Precision: 0.8749452910368523
Test Recall: 0.8747519644416224


In [5]:
# Import necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Define a pipeline with preprocessing steps and RandomForestClassifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', RandomForestClassifier())
])

# Define hyperparameters to search over
param_grid = {
    'pca__n_components': [None, 2, 4, 6, 8],  # Adjust PCA components
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [5, 10, 15, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_model, X_train, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_model, X_train, y_train, cv=5, scoring='recall_weighted')

# Fit the best model on the full training set
best_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


Cross-Validation Accuracy: 0.7639477542316406
Cross-Validation Precision: 0.7661626218116598
Cross-Validation Recall: 0.7720645075303211
Test Accuracy: 0.7597402597402597
Test Precision: 0.7630080049261084
Test Recall: 0.7597402597402597


In [6]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Load the Pima Indians Diabetes dataset
diabetes = fetch_openml(name='diabetes')

# Split the dataset into features (X) and target labels (y)
X, y = diabetes.data, diabetes.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
rf_model = RandomForestClassifier()

# Define hyperparameters to tune
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train, y_train)

# Get the best model
best_rf_model = random_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='recall_weighted')

# Fit the best model on the full training set
best_rf_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred = best_rf_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

# Print evaluation metrics
print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


  warn(
  warn(


Cross-Validation Accuracy: 0.7720245235239238
Cross-Validation Precision: 0.7712167042252224
Cross-Validation Recall: 0.7752898840463814
Test Accuracy: 0.7597402597402597
Test Precision: 0.7607507288629737
Test Recall: 0.7597402597402597


In [7]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Fetch the Spambase dataset
spambase = fetch_openml(name='spambase')

# Split the dataset into features (X) and target labels (y)
X, y = spambase.data, spambase.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
rf_model = RandomForestClassifier()

# Define hyperparameters to tune
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1)
random_search.fit(X_train, y_train)

# Get the best model
best_rf_model = random_search.best_estimator_

# Evaluate the best model using cross-validation
cv_accuracy = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='recall_weighted')

# Fit the best model on the full training set
best_rf_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred = best_rf_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)


  warn(


Cross-Validation Accuracy: 0.9451086956521738
Cross-Validation Precision: 0.9453387901583167
Cross-Validation Recall: 0.9448369565217393
Test Accuracy: 0.9489685124864278
Test Precision: 0.9495696426302883
Test Recall: 0.9489685124864278


In [9]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import category_encoders as ce

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"
names = ["sex", "length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight", "rings"]
data = pd.read_csv(url, names=names)

# Convert categorical variable "sex" into numerical values
encoder = ce.OrdinalEncoder(cols=['sex'])
data = encoder.fit_transform(data)

# Split features and target label
X = data.drop(columns=['rings'])
y = data['rings']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
rf_model = RandomForestRegressor()

# Define hyperparameters to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model using cross-validation
cv_rmse = np.sqrt(-grid_search.best_score_)

# Fit the best model on the full training set
best_model.fit(X_train, y_train)

# Evaluate the best model on the test set

In [11]:
# prompt: cross validation accuracy, precision and recall

cv_accuracy = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='precision_weighted')
cv_recall = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='recall_weighted')

print("Cross-Validation Accuracy:", np.mean(cv_accuracy))
print("Cross-Validation Precision:", np.mean(cv_precision))
print("Cross-Validation Recall:", np.mean(cv_recall))


Cross-Validation Accuracy: 0.9451086956521738
Cross-Validation Precision: 0.9453387901583167
Cross-Validation Recall: 0.9448369565217393
