<a href="https://colab.research.google.com/github/EnmaSantos/CSE450/blob/main/ice_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# Step 1: Load the Data
data = pd.read_csv("https://byui-cse.github.io/cse450-course/ice/wine/data/wine-training.csv")

# Step 2: Handle Missing Values
data.dropna(inplace=True)  # Drop rows with missing values

# Step 3: Separate Features and Target
X = data.drop(columns=["wine"])
y = data["wine"]

# Step 4: Split the Data (Use test_size=0.2 for a balanced split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Preprocessing and Model Pipeline
pipeline = Pipeline([
    ("scaler", StandardScaler()),  # Feature scaling
    ("classifier", RandomForestClassifier(random_state=42))
])


param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [10, 20, None],
    "classifier__min_samples_split": [2, 5]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)

# Step 7: Evaluate the Best Model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Step 8: Load Holdout Data and Make Predictions
holdout_data = pd.read_csv("https://byui-cse.github.io/cse450-course/ice/wine/data/wine-holdout.csv")  # Replace with actual holdout file path
holdout_predictions = best_model.predict(holdout_data)

# Save Predictions to CSV
output = pd.DataFrame({"Class": holdout_predictions})
output.to_csv("EnmanuelDeLosSantos-ice1-predictions.csv", index=False)

# Feature Importance (Optional)
feature_importances = best_model.named_steps["classifier"].feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({"Feature": feature_names, "Importance": feature_importances})
print(importance_df.sort_values(by="Importance", ascending=False))

Accuracy: 0.9166666666666666
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.89      0.89      0.89         9
           2       0.88      1.00      0.93         7

    accuracy                           0.92        24
   macro avg       0.92      0.92      0.92        24
weighted avg       0.92      0.92      0.92        24

                 Feature  Importance
9        color_intensity    0.207873
12               proline    0.152110
6             flavanoids    0.130997
11                    od    0.121029
0                alcohol    0.101475
10                   hue    0.083554
3      alcalinity_of_ash    0.050673
5          total_phenols    0.042990
4              magnesium    0.034809
8        proanthocyanins    0.030713
1             malic_acid    0.027944
2                    ash    0.010155
7   nonflavanoid_phenols    0.005679


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline


data = pd.read_csv("https://byui-cse.github.io/cse450-course/ice/wine/data/wine-training.csv")


data.dropna(inplace=True)  # Drop rows with missing values


X = data.drop(columns=["wine"])
y = data["wine"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pipeline = Pipeline([
    ("scaler", StandardScaler()),  # Feature scaling
    ("classifier", XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42))
])


param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [3, 5, 7],
    "classifier__learning_rate": [0.01, 0.1],
    "classifier__subsample": [0.8, 1.0]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy")
grid_search.fit(X_train, y_train)


best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


holdout_data = pd.read_csv("https://byui-cse.github.io/cse450-course/ice/wine/data/wine-holdout.csv")  # Replace with actual holdout file path
holdout_predictions = best_model.predict(holdout_data)


output = pd.DataFrame({"Class": holdout_predictions})
output.to_csv("EnmanuelDeLosSantos-ice1-predictions.csv", index=False)


feature_importances = best_model.named_steps["classifier"].feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({"Feature": feature_names, "Importance": feature_importances})
print(importance_df.sort_values(by="Importance", ascending=False))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Accuracy: 0.9166666666666666
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.89      0.89      0.89         9
           2       0.88      1.00      0.93         7

    accuracy                           0.92        24
   macro avg       0.92      0.92      0.92        24
weighted avg       0.92      0.92      0.92        24

                 Feature  Importance
11                    od    0.355234
9        color_intensity    0.169504
12               proline    0.140103
6             flavanoids    0.086282
5          total_phenols    0.068309
4              magnesium    0.054098
10                   hue    0.028345
0                alcohol    0.027942
2                    ash    0.017145
3      alcalinity_of_ash    0.017142
7   nonflavanoid_phenols    0.014102
1             malic_acid    0.013203
8        proanthocyanins    0.008590
