In [None]:
from google.colab import files

# This will prompt you to select the kaggle.json file.
# files.upload() returns a dict mapping each uploaded filename to its raw
# bytes. Bind it to a name instead of leaving it as the cell's last
# expression, otherwise the notebook echoes the credential file's contents
# (the Kaggle API key) into the saved cell output.
uploaded = files.upload()

In [None]:
# Place the uploaded kaggle.json under ~/.kaggle/ for the kaggle command used
# later in this notebook.
# -p: do not fail if the directory already exists (safe to re-run).
!mkdir -p ~/.kaggle
# Copy the credential file from the current working directory.
!cp kaggle.json ~/.kaggle/
# Restrict the credential file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the data for the 'multiclass-classification-jaro' competition.
# No output path is given, so the archive lands in the current working
# directory; the unzip step later in this notebook expects it under /content.
!kaggle competitions download -c multiclass-classification-jaro

In [None]:
from google.colab import drive

# Attach Google Drive to the runtime's filesystem so results can be copied
# there at the end of the notebook.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
!unzip '/content/multiclass-classification-jaro.zip' -d '/content/dataset'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, f1_score

# Load the training data extracted from the competition archive.
train_df = pd.read_csv('/content/dataset/train.csv')

# Feature engineering: combine the two acidity measurements into one feature.
train_df['total_acidity'] = train_df['fixed acidity'] + train_df['volatile acidity']

# Encode the 'wine type' column as integer codes. `le` stays at notebook scope
# so the same fitted mapping can be applied to the test set in a later cell.
le = LabelEncoder()
train_df['wine type'] = le.fit_transform(train_df['wine type'])

# Preparing the dataset: features are everything except 'ID' and the target.
X = train_df.drop(['ID', 'quality'], axis=1)
y = train_df['quality']


# Data preprocessing
# After label encoding, 'wine type' has an integer dtype, so a dtype-based
# selection alone would also classify it as numerical. Exclude the categorical
# columns explicitly; otherwise the ColumnTransformer would emit 'wine type'
# twice (once standardised, once passed through).
categorical_features = ['wine type']
numerical_features = [
    column
    for column in X.select_dtypes(include=['float64', 'int']).columns
    if column not in categorical_features
]

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', 'passthrough', categorical_features)
])

# Creating a pipeline: preprocessing followed by the classifier, so the scaler
# is re-fitted on the training part of every cross-validation fold.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning: the 'classifier__' prefix routes each parameter to
# the pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

# Using weighted F1 score as the metric for scoring
f1_scorer = make_scorer(f1_score, average='weighted')

# Exhaustive search over param_grid with 5-fold cross-validation, using all
# available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f1_scorer, verbose=2, n_jobs=-1)

grid_search.fit(X, y)

print("Best parameters found: ", grid_search.best_params_)
print("Best CV F1 score: ", grid_search.best_score_)


In [None]:
import pandas as pd


# Read the competition test set.
test_df = pd.read_csv('/content/dataset/test.csv')

# Mirror the training-time feature engineering: add the combined acidity
# column and encode 'wine type' with the LabelEncoder fitted on the training set.
test_df = test_df.assign(**{
    'total_acidity': test_df['fixed acidity'] + test_df['volatile acidity'],
    'wine type': le.transform(test_df['wine type']),
})

# Features for prediction are every column except the row identifier.
X_test = test_df.drop(columns='ID')

# Predict with the pipeline that GridSearchCV refitted using the best parameters.
predictions = grid_search.best_estimator_.predict(X_test)

# Pair each test ID with its predicted quality.
submission_df = test_df[['ID']].assign(quality=predictions)

# Write the submission without the DataFrame index.
submission_file_path = '/content/dataset/wine_quality_submission.csv'
submission_df.to_csv(submission_file_path, index=False)

# Show where the file was written as the cell's output.
submission_file_path


In [None]:
# Create the destination folder on the mounted Google Drive.
# -p: create missing parent directories and do not fail if it already exists.
!mkdir -p "/content/drive/My Drive/KaggleSubmissions"


In [None]:
# Copy the generated submission CSV into the Drive folder. The destination is
# quoted because the path contains a space.
!cp /content/dataset/wine_quality_submission.csv "/content/drive/My Drive/KaggleSubmissions"
