In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response on each read while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# Each entry's archive is downloaded and extracted into a sub-directory named
# <directory> under the Kaggle input path. The URL is a signed link carrying
# X-Goog-Date / X-Goog-Expires query parameters, so it is presumably only valid
# for a limited window after that date.
DATA_SOURCE_MAPPING = 'playground-series-s4e9:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F76728%2F9057646%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240904%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240904T025957Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D717de538a8442e27de26541732ba8d806340e5da2059e085bc229575cc27877543fcbf979a86bb021294e0ba204b4ee9911b9528c03fabda2578c3805548b6d4d1b4c65cdee9345b7906e89134c87deae6744c2a4c46dd570deaa7d1cc90accbb7afefdc9b6f893e352498b7becba9f64753116654e1bcf063a7dcdf3649067c3216b9befa3c4fb1c270cfc59e60f0b0bf8a0c4830d084542769907e3e4af4a0f8bcac1c8eb3dcab511baf534004933a415181a21f066a8abee5508e82cfa921d94798dba366e8457384a8deedeb7b67c230e582bffc0369521159ada41e8132f4ffa29a04ed1f7290ec3d0a1c054f1554fa42fa86058c9b668f18c4facf6849'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it below KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>". Split on the first
    # colon only, so a stray unencoded ':' in the URL cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the bar cannot be scaled,
            # but the download itself must still proceed.
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # Clamp to the bar width in case more bytes arrive than announced.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the temp file by name, so buffered bytes
            # have to be on disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # overwrite the imported module for the rest of the notebook.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Read the competition's training split and preview its first 100 rows.
train_csv_path = '/kaggle/input/playground-series-s4e9/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head(100)

In [None]:
# Read the test split, keep its ids aside for the submission, then preview it.
test_csv_path = '/kaggle/input/playground-series-s4e9/test.csv'
test_df = pd.read_csv(test_csv_path)
test_ids = test_df['id']
test_df.head(100)

In [None]:
# Count the missing entries in each column of the training frame.
missing_rows_train = train_df.isna().sum()
print(missing_rows_train)

In [None]:
# Count the missing entries in each column of the test frame.
missing_rows_test = test_df.isna().sum()
print(missing_rows_test)

In [None]:
# CHECK

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Function to train a Random Forest model and predict missing values
def train_and_predict(df: pd.DataFrame, target_column: str) -> None:
    """Fill the missing entries of ``target_column`` with classifier predictions.

    Rows where ``target_column`` is present are used to fit a pipeline
    (one-hot encoding of the object-typed columns, then a random forest) on
    all remaining columns. The fitted pipeline predicts the target for the rows
    where it is null, and those predictions are written into ``df`` in place.

    Args:
        df: Frame to impute; modified in place.
        target_column: Name of the column whose null entries are filled.

    Returns:
        None. When the column has no null entries the function returns
        immediately without fitting anything.
    """
    missing_mask = df[target_column].isnull()

    # Nothing to impute: skip the costly model fit entirely.
    if not missing_mask.any():
        return

    # Separate the rows with a known target from the rows to be filled
    non_missing = df[~missing_mask]
    missing = df[missing_mask]

    # Features and target for training
    X = non_missing.drop(columns=[target_column])
    y = non_missing[target_column]

    # Object-typed feature columns are one-hot encoded; the rest pass through
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()
    preprocessor = ColumnTransformer(transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ], remainder='passthrough')

    # Pipeline that first transforms the data, then fits the classifier
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # Hold out 20% of the labelled rows; the model is fitted on the other 80%
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)

    # Put the hold-out to use: report how well the imputer predicts known values
    validation_accuracy = model.score(X_valid, y_valid)
    print(f"{target_column}: imputation validation accuracy = {validation_accuracy:.3f}")

    # Predict the missing values and write them back through the row mask, so
    # the assignment does not depend on index labels being unique.
    missing_predictions = model.predict(missing.drop(columns=[target_column]))
    df.loc[missing_mask, target_column] = missing_predictions

# Impute each incomplete categorical column of the training frame in turn
for incomplete_column in ('fuel_type', 'accident', 'clean_title'):
    train_and_predict(train_df, incomplete_column)

# Show the remaining null count per column
print(train_df.isnull().sum())

In [None]:
# Impute each incomplete categorical column of the test frame in turn
for incomplete_column in ('fuel_type', 'accident', 'clean_title'):
    train_and_predict(test_df, incomplete_column)

# Show the remaining null count per column
print(test_df.isnull().sum())

# Remove the 'clean_title' column from the test frame
test_df = test_df.drop(columns=['clean_title'])

In [None]:
# Remove the 'clean_title' column from the training frame. errors='ignore'
# keeps this cell safe to re-run once the column is already gone; without it a
# second execution raises KeyError.
train_df = train_df.drop(columns=['clean_title'], errors='ignore')
train_df.head()

In [None]:
!pip install ray==2.10.0
!pip install autogluon.tabular
!pip install -U ipywidgets

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Summary statistics of the training frame.
print(train_df.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
train_df.info()

In [None]:
# Report how many distinct values each training column holds.
for column in train_df:
    distinct_count = train_df[column].nunique()
    print(f"{column}: {distinct_count} unique values")

In [None]:
# Column the model learns to predict
label = 'price'

# Configure an RMSE-scored regression predictor, then train it on the
# prepared training frame.
predictor = TabularPredictor(label=label, eval_metric='rmse', problem_type="regression")
predictor = predictor.fit(
    train_df,
    presets='medium_quality_faster_train',
    time_limit=3600,  # time budget handed to AutoGluon for training
    verbosity=2,
)

# Keep the training summary for later inspection
results = predictor.fit_summary()

In [None]:
# Display the leaderboard of the fitted predictor as this cell's output.
predictor.leaderboard()

In [None]:
# Run the fitted predictor on the prepared test frame; the result feeds the
# 'price' column of the submission built below.
y_pred = predictor.predict(test_df)

In [None]:
# Assemble the submission table: the saved test ids next to their predictions
submission_columns = {'id': test_ids, 'price': y_pred}
submission = pd.DataFrame(submission_columns)

# Write the table to disk without the index column
submission.to_csv('submission.csv', index=False)

# Preview the first ten rows of the submission
submission_preview = submission.head(10)
print(submission_preview)