In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries;
# the download loop splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires query parameters,
# so it is presumably a time-limited signed link — confirm it is still valid.
DATA_SOURCE_MAPPING = 'playground-series-s4e8:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F76727%2F9045607%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240828%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240828T105824Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1854267c167371753d2537b0640a405362928b413dd93493e2a9f6fb9a242b0ec5a845977f19064d7d66cd482422db29e6ce0cbd9b81bb25a3b2dd0bfb00c44fc82aacc118fd004b01ff7bbbc6f516a40ff3d8ca18ba49c27f8bd4646652bb9b469cc20868971a0aae6f743c574cf8f63172bc33afdc73ed668e7121fdc2b2c24661f5a487daada36081065a40d692e9dbccfe3ceab7b76a565d86058d7e5446bd80912341eae1745e843c9745f4a9a5ceb529699092c3a14d0b2e6684802290448a66996aa711a73930dca890819991999be8d536e4893b9d9ab886810cf37ae174afe9f47c907c69ae60f76ef61f069c4e257912d2dffb9991de1d707d08e5'

# Directory under which each data source is extracted.
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it below KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is '<directory>:<percent-encoded URL>'. Split on the first
    # colon only so an entry whose URL part contains ':' still parses.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be absent; without a total the bar cannot be
            # scaled, so only the running byte count is shown in that case.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            # read() returns b'' at end of stream, which stops the iterator.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is
                # not shadowed for later iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the datasets
# Both CSVs are read from the directory the download cell extracts the
# 'playground-series-s4e8' data source into.
df = pd.read_csv('/kaggle/input/playground-series-s4e8/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv')

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Number of missing values in each training column.
missing_rows = df.isna().sum()
print(missing_rows)

In [None]:
# Number of missing values in each test column.
missing_rows_test = df_test.isna().sum()
print(missing_rows_test)

In [None]:
# Set the target and the row ids aside, then keep only feature columns.
# NOTE: this cell rebinds `df` / `df_test`, so it can only run once per load.
y_train = df[['class', 'id']]
df = df.drop(columns=['class', 'id'])

# `y_test` holds only the test ids (the test frame is read without a target here).
y_test = df_test[['id']]
df_test = df_test.drop(columns=['id'])

In [None]:
def advanced_impute(df: pd.DataFrame, columns=None, chunk_size: int = 10000) -> pd.DataFrame:
    """Fill missing values in ``columns`` of ``df`` and return those columns.

    Numeric columns are filled with the column median. Non-numeric columns are
    filled with the column mode when fewer than 10% of the rows are missing,
    and with the literal ``'Unknown'`` otherwise. If anything is still missing
    afterwards, a ``SimpleImputer`` fallback is applied in row chunks.

    ``df`` itself is modified; the return value is the ``df[columns]``
    selection taken after imputation.

    Args:
        df: Frame to impute.
        columns: Column labels to process; defaults to every column of ``df``.
        chunk_size: Number of rows written per step in the fallback pass.

    Returns:
        ``df[columns]`` with missing values filled.

    Raises:
        ValueError: If a requested column is not present in ``df``.
    """
    if columns is None:
        columns = df.columns.tolist()

    unknown_columns = [col for col in columns if col not in df.columns]
    if unknown_columns:
        raise ValueError(f"Column '{unknown_columns[0]}' not found in the dataframe")

    numeric_columns = df[columns].select_dtypes(include=[np.number]).columns
    categorical_columns = df[columns].select_dtypes(exclude=[np.number]).columns

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # `df[col]`: the chained in-place form acts on a temporary object and does
    # not update `df` when pandas Copy-on-Write is active.
    for col in numeric_columns:
        if df[col].isnull().any():
            # Median is more robust to outliers than the mean.
            df[col] = df[col].fillna(df[col].median())

    for col in categorical_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            if missing_count / len(df) < 0.1:
                fill_value = df[col].mode()[0]
            else:
                fill_value = 'Unknown'
            df[col] = df[col].fillna(fill_value)

    # Check for any remaining missing values
    remaining_missing = df[columns].isnull().sum()
    if remaining_missing.sum() > 0:
        print("Remaining missing values:")
        print(remaining_missing[remaining_missing > 0])

        fallback_plan = (
            (numeric_columns, 'median'),
            (categorical_columns, 'most_frequent'),
        )
        for group, strategy in fallback_plan:
            if len(group) == 0:
                continue
            # Fit once on the whole frame so every chunk is filled with the
            # same statistic; only the write-back is done chunk by chunk.
            imputer = SimpleImputer(strategy=strategy)
            imputer.fit(df[group])
            positions = df.columns.get_indexer(group)
            for start in range(0, len(df), chunk_size):
                end = start + chunk_size
                df.iloc[start:end, positions] = imputer.transform(df.iloc[start:end][group])

    return df[columns]

In [None]:
# Impute the training features (advanced_impute also fills `df` itself).
df_final = advanced_impute(df)

# Confirm nothing is left missing in df_final.
print("\nFinal check for missing values in df_final:", df_final.isna().sum(), sep="\n")

In [None]:
# Impute the test features (advanced_impute also fills `df_test` itself).
df_test_final = advanced_impute(df_test)

# Confirm nothing is left missing in df_test_final.
print("\nFinal check for missing values in df_test_final:", df_test_final.isna().sum(), sep="\n")

In [None]:
# Split the columns by dtype, based on the imputed training frame.
# NOTE: after this cell every column is numeric, so re-running it would select
# different column groups — run it once per kernel session.
numeric_columns = df_final.select_dtypes(include=[np.number]).columns
categorical_columns = df_final.select_dtypes(exclude=[np.number]).columns

# Standardise the numeric columns and ordinal-encode the rest. Categories that
# were not seen while fitting are encoded as -1.
scaler = StandardScaler()
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Each transformer is fitted on the training frame only and then applied to
# both the training and the test frame.
for transformer, cols in ((scaler, numeric_columns), (encoder, categorical_columns)):
    transformer.fit(df_final[cols])
    df_final[cols] = transformer.transform(df_final[cols])
    df_test_final[cols] = transformer.transform(df_test_final[cols])

In [None]:
# Absolute pairwise correlations between the training features
correlations = df_final.corr().abs()

# Keep only the strict upper triangle so that every unordered feature pair
# appears exactly once and self-correlations are excluded by position.
# Filtering by value (dropping duplicate correlation values and skipping the
# first entry) breaks when diagonal entries are not bit-identical, when two
# different features are perfectly correlated, or when two pairs tie.
pair_mask = np.triu(np.ones(correlations.shape, dtype=bool), k=1)
top_correlations = (
    correlations.where(pair_mask)
    .unstack()
    .dropna()  # removes the masked-out cells and undefined correlations
    .sort_values(ascending=False, kind='stable')
)
top_3_correlations = top_correlations.head(3)

print("Top 3 strongest correlations:")
print(top_3_correlations)

# Create one interaction feature (element-wise product) per selected pair,
# using the same pairs for the training and the test frame.
for i, (feature1, feature2) in enumerate(top_3_correlations.index, 1):
    new_feature_name = f'new_feature_{i}'
    df_final[new_feature_name] = df_final[feature1] * df_final[feature2]
    df_test_final[new_feature_name] = df_test_final[feature1] * df_test_final[feature2]
    print(f"Created {new_feature_name} from {feature1} and {feature2}")

print("\nShape of final training set:", df_final.shape)
print("Shape of final test set:", df_test_final.shape)

In [None]:
# Columns kept for modelling, in the order they will appear in the feature
# matrix. Any column of df_final / df_test_final that is not listed here is
# dropped when the frames are indexed with this list.
# NOTE(review): the ordering looks like a feature-importance ranking, and the
# new_feature_1..3 entries require the engineered interaction columns to
# exist — confirm where this list came from.
feature_names = [
    'stem-width',
    'gill-attachment',
    'cap-surface',
    'gill-spacing',
    'stem-color',
    'stem-height',
    'gill-color',
    'cap-diameter',
    'new_feature_1',
    'stem-surface',
    'cap-color',
    'cap-shape',
    'stem-root',
    'new_feature_3',
    'does-bruise-or-bleed',
    'ring-type',
    'habitat',
    'has-ring',
    'new_feature_2',
    'spore-print-color'
]

In [None]:
# Restrict both frames to the chosen features, in the listed column order.
df_final = df_final.loc[:, feature_names]
df_test_final = df_test_final.loc[:, feature_names]

In [None]:
# Split the training data 90/10 for validation.
# The split target gets its own name so that `y_train` (the class/id frame
# built earlier) is not overwritten; rebinding it made `y_train['class']` fail
# when this cell was run a second time.
X_train, X_val, y_train_split, y_val = train_test_split(
    df_final, y_train['class'], test_size=0.1, random_state=42
)

# Encode the target variable (fitted on the training part, reused for validation)
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train_split)
y_val_encoded = le.transform(y_val)

# Train Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train_encoded)

# Train XGBoost Classifier
xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train_encoded)

# Train CatBoost Classifier
cat_classifier = CatBoostClassifier(random_state=42, verbose=0)
cat_classifier.fit(X_train, y_train_encoded)

# Predict encoded class labels (not probabilities) on the validation set
y_pred_rf_val = rf_classifier.predict(X_val)
y_pred_xgb_val = xgb_classifier.predict(X_val)
y_pred_cat_val = cat_classifier.predict(X_val)

# Calculate individual model accuracies on validation set
acc_rf = accuracy_score(y_val_encoded, y_pred_rf_val)
acc_xgb = accuracy_score(y_val_encoded, y_pred_xgb_val)
acc_cat = accuracy_score(y_val_encoded, y_pred_cat_val)

print(f"Random Forest Classifier Accuracy: {acc_rf:.4f}")
print(f"XGBoost Classifier Accuracy: {acc_xgb:.4f}")
print(f"CatBoost Classifier Accuracy: {acc_cat:.4f}")

# Majority vote: the inputs are label predictions, so with 0/1 labels the mean
# of the three votes exceeds 0.5 exactly when at least two models predict 1.
y_pred_ensemble_val = (y_pred_rf_val + y_pred_xgb_val + y_pred_cat_val) / 3
y_pred_ensemble_val = (y_pred_ensemble_val > 0.5).astype(int)

# Calculate the ensemble accuracy on validation set
acc_ensemble = accuracy_score(y_val_encoded, y_pred_ensemble_val)
print(f"\nEnsemble Accuracy on Validation Set: {acc_ensemble:.4f}")

In [None]:
# Label predictions of each fitted model on the test features
y_pred_rf_test = rf_classifier.predict(df_test_final)
y_pred_xgb_test = xgb_classifier.predict(df_test_final)
y_pred_cat_test = cat_classifier.predict(df_test_final)

# Combine the three label predictions: average them and threshold at 0.5
test_vote_mean = (y_pred_rf_test + y_pred_xgb_test + y_pred_cat_test) / 3
y_pred_ensemble_test = (test_vote_mean > 0.5).astype(int)

# Map the encoded labels back to the original class values
y_pred_final = le.inverse_transform(y_pred_ensemble_test)

# Build the submission frame from the saved test ids plus the predicted class
submission = y_test[['id']].assign(**{'class': y_pred_final})

# Write the submission file without the index column
submission.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' has been created.")
