<a href="https://colab.research.google.com/github/ChiefGupta/Project1/blob/main/Project1_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Essential libraries
import gc
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For data processing and machine learning
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# If using Google Colab to download files
from google.colab import files

# Additional tools
import os

In [None]:
# Mount Google Drive into the Colab runtime; later cells read the dataset
# and saved artifacts from paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [4]:
# Read the tab-separated test set from the mounted Drive folder
TEST_DATA_PATH = '/content/drive/My Drive/database/test.tsv'
test_df = pd.read_csv(TEST_DATA_PATH, sep='\t')

# Preview the first rows as a sanity check on the parsed columns
test_df.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


In [5]:
# Define preprocessing functions (same as used during training)
def handle_missing_inplace(dataset):
    """Fill missing text fields of ``dataset`` with the placeholder 'missing'.

    The frame is modified in place (its columns are reassigned) and nothing
    is returned.

    - ``category_name`` and ``brand_name``: NaN becomes 'missing'.
    - ``item_description``: the literal 'No description yet' and NaN both
      become 'missing'.

    Args:
        dataset: DataFrame holding the ``category_name``, ``brand_name`` and
            ``item_description`` columns.
    """
    # Assign each filled column back instead of calling
    # ``dataset[col].fillna(..., inplace=True)``: an in-place call on a
    # selected column is chained assignment, which pandas deprecates and
    # which leaves ``dataset`` untouched once Copy-on-Write is active.
    dataset['category_name'] = dataset['category_name'].fillna('missing')
    dataset['brand_name'] = dataset['brand_name'].fillna('missing')
    dataset['item_description'] = (
        dataset['item_description']
        .replace('No description yet', 'missing')
        .fillna('missing')
    )

def to_categorical(dataset):
    """Cast the categorical feature columns of ``dataset`` to category dtype.

    Reassigns ``category_name``, ``brand_name`` and ``item_condition_id`` on
    the given frame; nothing is returned.
    """
    for column in ('category_name', 'brand_name', 'item_condition_id'):
        dataset[column] = dataset[column].astype('category')

In [6]:
# Run the preprocessing steps on the test frame in sequence:
# fill missing values first, then cast the categorical columns.
for preprocess_step in (handle_missing_inplace, to_categorical):
    preprocess_step(test_df)

In [None]:
# joblib ships as a dependency of scikit-learn (imported at the top of the
# notebook), so no unpinned `!pip install` is needed in the middle of the run.
import warnings

import joblib

ARTIFACT_DIR = '/content/drive/My Drive/database'

# Load the transformers that were fitted on the training data.
# NOTE: joblib.load unpickles the file and can execute arbitrary code, so only
# load artifacts that you produced yourself.
cv = joblib.load(f'{ARTIFACT_DIR}/cv.pkl')  # Assuming you saved your CountVectorizer as 'cv.pkl'
tv = joblib.load(f'{ARTIFACT_DIR}/tv.pkl')  # Assuming you saved your TfidfVectorizer as 'tv.pkl'
lb = joblib.load(f'{ARTIFACT_DIR}/lb.pkl')  # Assuming you saved your LabelBinarizer as 'lb.pkl'

# NOTE(review): the same `cv` encodes both `name` and `category_name`. If
# training fitted a separate CountVectorizer per column, each one must be
# saved and loaded separately; otherwise one of these two matrices is built
# with the wrong vocabulary. Confirm against the training notebook.
X_name_test = cv.transform(test_df['name'])
X_category_test = cv.transform(test_df['category_name'])
X_description_test = tv.transform(test_df['item_description'])
X_brand_test = lb.transform(test_df['brand_name'])
X_dummies_test = csr_matrix(pd.get_dummies(test_df[['item_condition_id', 'shipping']], sparse=True).values)

# Create a sparse matrix for the test data (column order must match training)
sparse_merge_test = hstack((X_dummies_test, X_description_test, X_brand_test, X_category_test, X_name_test)).tocsr()

# Drop the low-document-frequency columns that were dropped during training.
# The column selection has to be the one computed at training time: a mask
# derived from the test matrix keeps a different subset of columns, which
# shifts every feature index relative to what the model learned.
# Save it from the training notebook with `joblib.dump(mask, TRAIN_MASK_PATH)`.
TRAIN_MASK_PATH = f'{ARTIFACT_DIR}/mask.pkl'
if os.path.exists(TRAIN_MASK_PATH):
    mask = np.asarray(joblib.load(TRAIN_MASK_PATH), dtype=bool)
    if mask.shape[0] != sparse_merge_test.shape[1]:
        raise ValueError(
            f'Training mask covers {mask.shape[0]} columns but the test matrix '
            f'has {sparse_merge_test.shape[1]}; the feature pipeline does not '
            'match the one used for training.'
        )
else:
    warnings.warn(
        f'{TRAIN_MASK_PATH} not found: falling back to a mask computed from '
        'the test data, which is unlikely to select the same columns as '
        'training. Predictions may be unreliable.',
        RuntimeWarning,
    )
    # Fallback (original behaviour): keep columns non-zero in at least 2 rows.
    mask = np.asarray(sparse_merge_test.getnnz(axis=0) > 1)
sparse_merge_test = sparse_merge_test[:, mask]

In [13]:
# Restore the trained LightGBM booster from its saved model file
MODEL_PATH = '/content/drive/My Drive/trained_lightgbm_model.txt'
model = lgb.Booster(model_file=MODEL_PATH)

In [27]:
import warnings

# `predict_disable_shape_check=True` (used below) makes LightGBM score the
# matrix even when its column count differs from the training data, so a
# feature mismatch would otherwise go unnoticed. Surface it explicitly.
n_model_features = model.num_feature()
n_test_features = sparse_merge_test.shape[1]
if n_test_features != n_model_features:
    warnings.warn(
        f'Test matrix has {n_test_features} features but the model was '
        f'trained on {n_model_features}; predictions may be incorrect. '
        'Rebuild the test features with the exact training pipeline.',
        RuntimeWarning,
    )

# Make predictions on the test data
y_pred_test = model.predict(sparse_merge_test, num_iteration=model.best_iteration, predict_disable_shape_check=True)

# If the model was trained on log1p-transformed prices, expm1 maps the
# predictions back to actual prices
y_pred_test = np.expm1(y_pred_test)

In [28]:
# Build the results table: identifying columns from the test set plus the
# predicted price for each row
output = test_df[['test_id', 'name', 'category_name']].assign(predicted_price=y_pred_test)

# Write the predictions to a CSV file without the index column
PREDICTIONS_CSV = 'price_predictions.csv'
output.to_csv(PREDICTIONS_CSV, index=False)

# Trigger a browser download of the file (Google Colab only)
files.download(PREDICTIONS_CSV)

# Show the first rows of the predictions as a quick check
output.head()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,test_id,name,category_name,predicted_price
0,0,"Breast cancer ""I fight like a girl"" ring",Women/Jewelry/Rings,14.996726
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",Other/Office supplies/Shipping Supplies,11.239409
2,2,Coach bag,Vintage & Collectibles/Bags and Purses/Handbag,24.59712
3,3,Floral Kimono,Women/Sweaters/Cardigan,16.476449
4,4,Life after Death,Other/Books/Religion & Spirituality,10.354098
