In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from google.colab import drive

In [None]:
# Mount Google Drive first: every path below lives under /content/drive, which
# is only populated in a Colab runtime once the drive has been mounted.
drive.mount("/content/drive")

# Dataset folder on the mounted drive and the CSV files read from it.
folder_path = "/content/drive/My Drive/Backpack Prediction Challenge/"
train_path = folder_path + "train.csv"
train_extra_path = folder_path + "training_extra.csv"
test_path = folder_path + "test.csv"

In [None]:
# Read the three competition CSVs in one pass; the order of the path tuple
# determines which frame is bound to which name.
train, train_extra, test = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, train_extra_path, test_path)
)

In [None]:
# Stack the supplementary rows beneath the primary training rows and renumber
# the index from 0.
# NOTE: `train` is rebound here, so re-running this cell on its own appends
# `train_extra` a second time; re-run the loading cell first.
frames_to_stack = [train, train_extra]
train = pd.concat(frames_to_stack, ignore_index=True)

In [None]:
print("Dataset Overview:")
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() added a stray "None" line to the output.
train.info()
print("\nFirst Few Rows:")
print(train.head())

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3994318 entries, 0 to 3994317
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Brand                 object 
 2   Material              object 
 3   Size                  object 
 4   Compartments          float64
 5   Laptop Compartment    object 
 6   Waterproof            object 
 7   Style                 object 
 8   Color                 object 
 9   Weight Capacity (kg)  float64
 10  Price                 float64
dtypes: float64(3), int64(1), object(7)
memory usage: 335.2+ MB
None

First Few Rows:
   id         Brand Material    Size  Compartments Laptop Compartment  \
0   0      Jansport  Leather  Medium           7.0                Yes   
1   1      Jansport   Canvas   Small          10.0                Yes   
2   2  Under Armour  Leather   Small           2.0                Yes   
3   3          Nike    Nylon   Small

In [None]:
# Columns stored with object dtype are the ones treated as categorical below.
object_frame = train.select_dtypes(include=['object'])
categorical_cols = list(object_frame.columns)
print("\nCategorical Columns Detected:", categorical_cols)


Categorical Columns Detected: ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']


In [None]:
# Integer-encode every categorical column of train and test.
#
# * Missing categories are replaced by an explicit label first, so they are a
#   visible, deliberate class instead of whatever LabelEncoder does with NaN.
# * Each encoder is fitted on the labels found in train AND test, so encoding
#   test cannot raise on a label that never occurs in train.
# * One encoder per column is kept in `label_encoders`, so any column's codes
#   can still be mapped back with `inverse_transform` later.
MISSING_LABEL = "Missing"
label_encoders = {}
for col in categorical_cols:
    train_values = train[col].fillna(MISSING_LABEL)
    test_values = test[col].fillna(MISSING_LABEL)

    # Only the distinct labels are needed to fit the encoder.
    known_labels = pd.concat([train_values, test_values], ignore_index=True).unique()

    encoder = LabelEncoder()
    encoder.fit(known_labels)
    train[col] = encoder.transform(train_values)
    test[col] = encoder.transform(test_values)
    label_encoders[col] = encoder

In [None]:
# Count NaNs per column. The categorical columns were already replaced by
# integer codes in the encoding step, so they cannot show missing values here.
missing_per_column = train.isna().sum()
print("\nMissing Values Before Handling:")
print(missing_per_column)


Missing Values Before Handling:
id                         0
Brand                      0
Material                   0
Size                       0
Compartments               0
Laptop Compartment         0
Waterproof                 0
Style                      0
Color                      0
Weight Capacity (kg)    1808
Price                      0
dtype: int64


In [None]:
# Impute remaining NaNs with column means learned from the training frame.
# The same training statistics are applied to test so both frames receive
# identical preprocessing (test was previously filled with its own means).
train_means = train.mean(numeric_only=True)

# Rebinding instead of inplace=True keeps the cell safe to re-run.
train = train.fillna(train_means)
# Only feature columns are imputed in test: `id` is an identifier and `Price`
# is the target, which test does not contain.
test = test.fillna(train_means.drop(labels=["id", "Price"], errors="ignore"))

# NOTE(review): the means are computed before the train/validation split, so
# validation rows contribute to them — move after the split if that matters.

print("\nMissing Values After Handling:")
print(train.isnull().sum())


Missing Values After Handling:
id                      0
Brand                   0
Material                0
Size                    0
Compartments            0
Laptop Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight Capacity (kg)    0
Price                   0
dtype: int64


In [None]:
# Target: the price column.
y = train['Price']
# Features: everything except the target and the row identifier.
X = train.drop(['Price', 'id'], axis=1)
# Test features: the test frame has no target, so only the identifier goes.
X_test = test.drop(['id'], axis=1)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to all three feature sets.
# NOTE: the names are rebound to the scaled arrays, so re-running this cell on
# its own would scale already-scaled data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(features) for features in (X_train, X_val, X_test)
)

In [None]:
# Random forest with 100 trees; the fixed seed keeps training reproducible.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out validation split.
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
print(f"\nMean Absolute Error (Validation Set): {mae:.2f}")


Mean Absolute Error (Validation Set): 34.41


In [None]:
# Predict prices for the scaled test features; these go into the submission.
test_predictions = model.predict(X=X_test)

In [None]:
# Load the sample submission as a template. This cell must run AFTER the test
# predictions have been computed, because it writes `test_predictions` into it.
submission_path = folder_path + "sample_submission.csv"
submission = pd.read_csv(submission_path)

# Predictions are attached by row position, so the template rows must be in the
# same order as the test rows. Fail loudly instead of writing a misaligned file.
if "id" in submission.columns and "id" in test.columns:
    if not np.array_equal(submission["id"].to_numpy(), test["id"].to_numpy()):
        raise ValueError(
            "sample_submission.csv ids do not match test.csv ids row-for-row; "
            "predictions would be assigned to the wrong rows."
        )

# Add predictions and save file
submission['Price'] = test_predictions
output_file_path = folder_path + "my_submission.csv"
submission.to_csv(output_file_path, index=False)

print(f"\n✅ Submission file saved at: {output_file_path}")



✅ Submission file saved at: /content/drive/My Drive/Backpack Prediction Challenge/my_submission.csv


In [None]:
# Pairwise correlations of every column in the processed training frame,
# drawn as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 8))
correlation_matrix = train.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()