In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from imblearn.over_sampling import SMOTE

In [2]:
dataset = pd.read_csv("/content/X1_dataset.csv")

In [3]:
X = dataset.drop('formation', axis =1)

In [4]:
y = dataset['formation']

In [5]:
y

0       0
1       0
2       0
3       0
4       0
       ..
8546    4
8547    4
8548    4
8549    4
8550    4
Name: formation, Length: 8551, dtype: int64

In [6]:
y.value_counts()

formation
4    5393
0    1766
2     829
1     307
3     256
Name: count, dtype: int64

In [7]:
# Apply SMOTE for oversampling
oversample = SMOTE(random_state=42)
X_resampled, y_resampled = oversample.fit_resample(X, y)

In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score

# Define inner and outer cross-validation folds
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Normalize the data
    ('feature_selection', RFE(RandomForestClassifier(random_state=42), n_features_to_select=5)),  # Feature selection using RFE
    ('classifier', RandomForestClassifier(random_state=42))  # Classification using Random Forest
])

# Initialize lists to store nested cross-validation scores
nested_scores = []

# Outer cross-validation loop
for train_index, test_index in outer_cv.split(X_resampled):
    X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
    y_train, y_test = y_resampled.iloc[train_index], y_resampled.iloc[test_index]

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Predict on the test set
    y_pred = pipeline.predict(X_test)

    # Calculate accuracy and store the result
    accuracy = accuracy_score(y_test, y_pred)
    nested_scores.append(accuracy)

# Calculate mean accuracy score from nested CV
mean_nested_score = np.mean(nested_scores)
print("Mean Accuracy Score from Nested Cross-Validation:", mean_nested_score)


Mean Accuracy Score from Nested Cross-Validation: 0.9999629148896718


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import randint


# Define inner and outer cross-validation folds
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Normalize the data
    ('feature_selection', RFE(RandomForestClassifier(random_state=42), n_features_to_select=5)),  # Feature selection using RFE
    ('classifier', RandomForestClassifier(random_state=42))  # Classification using Random Forest
])

# Define the parameter distributions for hyperparameter tuning
param_dist = {
    'feature_selection__n_features_to_select': randint(5, 20),
    'classifier__n_estimators': randint(50, 200),
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': randint(2, 20)
}

# Initialize lists to store nested cross-validation scores
nested_scores = []

# Outer cross-validation loop (for nested resampling)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in outer_cv.split(X_resampled):
    X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
    y_train, y_test = y_resampled.iloc[train_index], y_resampled.iloc[test_index]

    # Inner cross-validation loop for hyperparameter tuning
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=inner_cv, scoring='accuracy', random_state=42)
    random_search.fit(X_train, y_train)

    # Get the best model from inner CV
    best_model = random_search.best_estimator_

    # Evaluate the best model on the test set
    y_pred = best_model.predict(X_test)

    # Calculate accuracy and store the result
    accuracy = accuracy_score(y_test, y_pred)
    nested_scores.append(accuracy)

# Calculate mean accuracy score from nested CV
mean_nested_score = np.mean(nested_scores)
print("Mean Accuracy Score from Nested Cross-Validation:", mean_nested_score)


Mean Accuracy Score from Nested Cross-Validation: 0.9999629148896718


In [3]:
data = pd.read_csv("/content/winequality-white.csv", sep=';')

In [4]:
data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [5]:
X1 = data.drop('quality', axis =1)

In [6]:
y1 = data['quality']

In [7]:
y1.value_counts()

quality
6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: count, dtype: int64

In [8]:
# Apply SMOTE for oversampling
oversample = SMOTE(random_state=42, k_neighbors=4)  # Specify a value for k_neighbors that is less than or equal to the number of samples
X1_resampled, y1_resampled = oversample.fit_resample(X1, y1)

In [19]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score

# Define inner and outer cross-validation folds
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Normalize the data
    ('feature_selection', RFE(RandomForestClassifier(random_state=42), n_features_to_select=10)),  # Feature selection using RFE
    ('classifier', RandomForestClassifier(random_state=42))  # Classification using Random Forest
])

# Initialize lists to store nested cross-validation scores
nested_scores = []

# Outer cross-validation loop
for train_index, test_index in outer_cv.split(X1_resampled):
    X1_train, X1_test = X1_resampled.iloc[train_index], X1_resampled.iloc[test_index]
    y1_train, y1_test = y1_resampled.iloc[train_index], y1_resampled.iloc[test_index]

    # Fit the pipeline on the training data
    pipeline.fit(X1_train, y1_train)

    # Predict on the test set
    y1_pred = pipeline.predict(X1_test)

    # Calculate accuracy and store the result
    accuracy = accuracy_score(y1_test, y1_pred)
    nested_scores.append(accuracy)

# Calculate mean accuracy score from nested CV
mean_nested_score = np.mean(nested_scores)
print("Mean Accuracy Score from Nested Cross-Validation:", mean_nested_score)


Mean Accuracy Score from Nested Cross-Validation: 0.887039518294044


In [9]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import randint


# Define inner and outer cross-validation folds
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Normalize the data
    ('feature_selection', RFE(RandomForestClassifier(random_state=42), n_features_to_select=10)),  # Feature selection using RFE
    ('classifier', RandomForestClassifier(random_state=42))  # Classification using Random Forest
])

# Define the parameter distributions for hyperparameter tuning
param_dist = {
    'feature_selection__n_features_to_select': randint(5, 20),
    'classifier__n_estimators': randint(50, 200),
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': randint(2, 20)
}

# Initialize lists to store nested cross-validation scores
nested_scores = []


# Outer cross-validation loop (for nested resampling)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in outer_cv.split(X1_resampled):
    X1_train, X1_test = X1_resampled.iloc[train_index], X1_resampled.iloc[test_index]
    y1_train, y1_test = y1_resampled.iloc[train_index], y1_resampled.iloc[test_index]

    # Inner cross-validation loop for hyperparameter tuning
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=50, cv=inner_cv, scoring='accuracy', random_state=42)
    random_search.fit(X1_train, y1_train)

    # Get the best model from inner CV
    best_model = random_search.best_estimator_

    # Evaluate the best model on the test set
    y1_pred = best_model.predict(X1_test)

    # Calculate accuracy and store the result
    accuracy = accuracy_score(y1_test, y1_pred)
    nested_scores.append(accuracy)

# Calculate mean accuracy score from nested CV
mean_nested_score = np.mean(nested_scores)
print("Mean Accuracy Score from Nested Cross-Validation:", mean_nested_score)

Mean Accuracy Score from Nested Cross-Validation: 0.8918490812908363
