In [5]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import warnings

### Load the data

In [7]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test_Vges7qu.csv')
sample_submission = pd.read_csv('sample_submission_V9Inaty.csv')

In [11]:
print("Training Data Head:")
print(train_df.head())

# Display the information about the training data, including data types and missing values
print("\nTraining Data Info:")
print(train_df.info())

# Display the information about the test data
print("\nTest Data Info:")
print(test_df.info())

Training Data Head:
   User_ID Product_ID Gender   Age  Occupation City_Category  \
0  1000001  P00069042      F  0-17          10             A   
1  1000001  P00248942      F  0-17          10             A   
2  1000001  P00087842      F  0-17          10             A   
3  1000001  P00085442      F  0-17          10             A   
4  1000002  P00285442      M   55+          16             C   

  Stay_In_Current_City_Years  Marital_Status  Product_Category_1  \
0                          2               0                   3   
1                          2               0                   1   
2                          2               0                  12   
3                          2               0                  12   
4                         4+               0                   8   

   Product_Category_2  Product_Category_3  Purchase  
0                 NaN                 NaN      8370  
1                 6.0                14.0     15200  
2                 NaN   

### Data Preprocessing and Feature Engineering

In [14]:
# Combine train and test data for consistent preprocessing
train_purchase = train_df['Purchase']
train_df.drop('Purchase', axis=1, inplace=True)

# Align columns before concatenation
test_df.drop('Comb', axis=1, inplace=True)

# Concatenate for preprocessing
combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)

# Handle the 'Stay_In_Current_City_Years' column
# Replace '4+' with 4 and convert to integer type
combined_df['Stay_In_Current_City_Years'] = combined_df['Stay_In_Current_City_Years'].str.replace('+', '').astype(int)

# Fill missing values in Product_Category_2 and Product_Category_3 with a placeholder (e.g., 0)
# This is a common practice when missing values represent the absence of a feature
combined_df['Product_Category_2'].fillna(0, inplace=True)
combined_df['Product_Category_3'].fillna(0, inplace=True)

# Convert `Product_ID` and `User_ID` to categorical type for LightGBM
# This is an efficient way to handle high-cardinality categorical features
categorical_features = ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years', 'Occupation', 'Marital_Status', 'Product_ID', 'User_ID']

for feature in categorical_features:
    combined_df[feature] = combined_df[feature].astype('category')
    
# Separate the combined data back into training and test sets
train_processed = combined_df[:len(train_df)]
test_processed = combined_df[len(train_df):]

# Add the 'Purchase' column back to the training data
train_processed['Purchase'] = train_purchase

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Product_Category_2'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Product_Category_3'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value i

### Model Building 

In [17]:
pip install lightgbm




In [19]:
import lightgbm as lgb

In [21]:
# Define features (X) and target (y)
X = train_processed.drop(['Purchase'], axis=1)
y = train_processed['Purchase']

# Define categorical features for LightGBM
cat_features = [col for col in X.columns if X[col].dtype.name == 'category']

# Initialize the LightGBM Regressor model
params = {
    'objective': 'regression_l1', # MAE objective, good for robustness to outliers
    'metric': 'rmse',
    'n_estimators': 1500,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42,
}

model = lgb.LGBMRegressor(**params)

# Train the model
print("\nTraining the model...")
model.fit(X, y, categorical_feature=cat_features)
print("Model training complete.")


Training the model...
Model training complete.


### Prediction and Submission

In [24]:
# Make predictions on the preprocessed test data
test_predictions = model.predict(test_processed)

# Ensure predictions are positive since purchase amounts cannot be negative
test_predictions[test_predictions < 0] = 0

# Create the submission DataFrame in the required format
submission_df = pd.DataFrame({
    'User_ID': test_processed['User_ID'].astype(int),
    'Product_ID': test_processed['Product_ID'].astype(str),
    'Purchase': test_predictions
})

# Save the submission file to a CSV
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully.")
print("\nSample of the submission file:")
print(submission_df.head())


Submission file 'submission.csv' created successfully.

Sample of the submission file:
        User_ID Product_ID      Purchase
550068  1000004  P00128942  16349.017264
550069  1000009  P00113442  11526.149683
550070  1000010  P00288442   6105.360659
550071  1000010  P00145342   4133.195845
550072  1000011  P00053842   2609.572352
