In [None]:
# Mount Google Drive at /content/drive; every dataset and output path used in
# this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Load the training data from the mounted Drive folder
data = pd.read_csv('/content/drive/MyDrive/dataset/train.csv')  # Load product data from CSV file

# Preview the first rows to check which columns were read
data.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [None]:

# Separate the regression target from the remaining columns.
y = data['PRODUCT_LENGTH']
X = data.drop(columns='PRODUCT_LENGTH')

# Number of rows per batch
batch_size = 1000

# Parallel lists: X_batches[k] and y_batches[k] cover the same rows.
X_batches = []
y_batches = []

for i in range(0, len(X), batch_size):
    # One slice object is shared so features and target are cut identically;
    # the last batch is simply shorter when len(X) is not a multiple of batch_size.
    rows = slice(i, i + batch_size)
    X_batch = X[rows]
    y_batch = y[rows]
    X_batches += [X_batch]
    y_batches += [y_batch]

In [None]:
# Display the first batch as a sanity check that there are no issues in the batch creation
X_batches[0]

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112
...,...,...,...,...,...
995,1997414,"Green India Napier grass seeds, Elephant grass...",NAPIER GRASS (Pennisetum purpureum) is popular...,Green India - Napier Elephant Pennisetum Purpu...,6503
996,219646,Carranza and His Bolshevik Regime,,,6108
997,2824714,HiEnd Accents Stella Faux Silk Velvet Western ...,[DECORATIVE PILLOW: (1) Floral Embroidered Lum...,,7258
998,1563344,Lilis Women's Slim 2 Piece Suits Lady Blazer P...,[MATERIAL: Polyester.No iron fabric.The good q...,,3063


In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder object, re-fitted for every (batch, column) pair below.
# NOTE(review): because it is re-fitted per batch, the integer assigned to a
# given string is only meaningful inside that batch.
encoder = LabelEncoder()

# Integer-encode the object-dtype columns of each batch; PRODUCT_ID is never encoded.
X_batches_encoded = []
for X_batch in X_batches:
    X_batch_encoded = X_batch.copy()  # leave the raw batch untouched
    for column in X_batch.columns:
        if column == 'PRODUCT_ID' or X_batch[column].dtype != 'object':
            continue
        X_batch_encoded[column] = encoder.fit_transform(X_batch[column])
    X_batches_encoded += [X_batch_encoded]

In [None]:
# Inspect the fifth encoded batch
X_batches_encoded[4]

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
4000,2101897,993,404,442,2211
4001,501600,212,595,457,80
4002,980181,975,595,457,2277
4003,673986,519,595,457,146
4004,2952238,445,382,181,12990
...,...,...,...,...,...
4995,631832,194,595,457,1
4996,1216168,835,315,255,3299
4997,586891,431,595,457,152
4998,163109,810,595,457,120


In [None]:
# noise removal

import numpy as np
from scipy.signal import savgol_filter

# Savitzky-Golay settings
SAVGOL_WINDOW = 5
SAVGOL_POLYORDER = 2

# Loop through all the batches of data
for i, batch in enumerate(X_batches_encoded):
    # Locate the identifier column: by name while the batch is still a DataFrame,
    # otherwise (cell re-run on already-smoothed arrays) it is the first column.
    if isinstance(batch, pd.DataFrame):
        id_pos = list(batch.columns).index('PRODUCT_ID')
    else:
        id_pos = 0

    values = np.asarray(batch, dtype=float)
    feature_pos = [pos for pos in range(values.shape[1]) if pos != id_pos]

    smoothed_batch = values.copy()
    # savgol_filter defaults to axis=-1, which for a (rows, columns) batch fits a
    # polynomial across the different features of a single row and overwrites
    # PRODUCT_ID as well. Filter each feature column along the row axis instead
    # and leave the identifier untouched.
    # NOTE(review): smoothing along rows assumes the row order carries signal - confirm.
    if values.shape[0] >= SAVGOL_WINDOW:
        smoothed_batch[:, feature_pos] = savgol_filter(
            values[:, feature_pos],
            window_length=SAVGOL_WINDOW,
            polyorder=SAVGOL_POLYORDER,
            axis=0,
        )
    # Batches shorter than the window are stored unsmoothed (the filter would raise).

    # Replace the original batch with the smoothed batch (a float NumPy array)
    X_batches_encoded[i] = smoothed_batch

In [None]:
import numpy as np

lower_percentile = 1  # Example: 1st percentile
upper_percentile = 99  # Example: 99th percentile

# Column layout of every batch in X_batches_encoded
batch_columns = ['PRODUCT_ID', 'TITLE', 'BULLET_POINTS', 'DESCRIPTION', 'PRODUCT_TYPE_ID']

# Loop through all the batches of X
for i, X_batch in enumerate(X_batches_encoded):
    # Works for both NumPy arrays and DataFrames, so the cell can be re-run safely
    X_batch_df = pd.DataFrame(X_batch, columns=batch_columns).copy()
    # Batches containing nulls are stored back unchanged (percentiles would be NaN)
    if X_batch_df.isnull().sum().sum() == 0:
        # PRODUCT_ID is an identifier, not a measurement: clipping it would
        # replace the lowest/highest ids with the threshold values.
        numeric_cols = X_batch_df.select_dtypes(include=np.number).columns.drop(
            'PRODUCT_ID', errors='ignore'
        )
        for col in numeric_cols:
            column_values = X_batch_df[col].to_numpy(dtype=float)
            lower_threshold, upper_threshold = np.percentile(
                column_values, [lower_percentile, upper_percentile]
            )
            # Winsorize: pull values outside [lower, upper] onto the thresholds
            X_batch_df[col] = np.clip(column_values, lower_threshold, upper_threshold)
    # Save the winsorized batch back to the list of batches as NumPy array
    X_batches_encoded[i] = X_batch_df.values

In [None]:
# Report any batch that has missing TITLE or DESCRIPTION values.
# The batches are stored as NumPy arrays at this point, which have no
# .isnull(); wrap each one in a DataFrame with the batch column names first.
batch_columns = ['PRODUCT_ID', 'TITLE', 'BULLET_POINTS', 'DESCRIPTION', 'PRODUCT_TYPE_ID']

for i, X_batch in enumerate(X_batches_encoded):
    null_values = pd.DataFrame(X_batch, columns=batch_columns).isnull().sum()
    if null_values.TITLE > 0 or null_values.DESCRIPTION > 0:
        print(f"Null Values in X_batch {i}:\n", null_values)

AttributeError: ignored

In [None]:
import matplotlib.pyplot as plt

batch_columns = ['PRODUCT_ID', 'TITLE', 'BULLET_POINTS', 'DESCRIPTION', 'PRODUCT_TYPE_ID']

# Check for outliers in batches 1-9 of X using box plots.
# start=1 keeps the number in the title equal to the batch's position in
# X_batches_encoded (the slice skips batch 0).
for i, X_batch in enumerate(X_batches_encoded[1:10], start=1):
    # Batches are NumPy arrays here; a DataFrame is needed for .boxplot().
    # PRODUCT_ID is an identifier, so it is left out of the outlier plot.
    X_batch_df = pd.DataFrame(X_batch, columns=batch_columns).drop(columns='PRODUCT_ID')
    fig, ax = plt.subplots(figsize=(8, 6))
    X_batch_df.boxplot(ax=ax)
    ax.set_title(f"Outliers in X_batch {i}")
    plt.show()
    plt.close(fig)  # release the figure so the loop does not accumulate open figures

AttributeError: ignored

<Figure size 800x600 with 0 Axes>

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate MinMaxScaler (re-fitted on every batch below)
scaler = MinMaxScaler()

batch_columns = ['PRODUCT_ID', 'TITLE', 'BULLET_POINTS', 'DESCRIPTION', 'PRODUCT_TYPE_ID']
# PRODUCT_ID is an identifier and is carried through unscaled
feature_columns = [name for name in batch_columns if name != 'PRODUCT_ID']

# Scale the data in each batch using MinMaxScaler.
# Results go into a NEW list: appending to X_batches_encoded while iterating
# over it would never terminate, and the later cells read X_batches_scaled.
X_batches_scaled = []
for X_batch in X_batches_encoded:
    # Accepts NumPy arrays as well as DataFrames; .copy() protects the input batch
    X_batch_scaled = pd.DataFrame(X_batch, columns=batch_columns).copy()
    X_batch_scaled[feature_columns] = scaler.fit_transform(X_batch_scaled[feature_columns])
    X_batches_scaled.append(X_batch_scaled)
    

TypeError: ignored

In [None]:
# Inspect the second scaled training batch (a one-element list slice)
X_batches_scaled[1:2]

[      PRODUCT_ID     TITLE  BULLET_POINTS  DESCRIPTION  PRODUCT_TYPE_ID
 1000     1133536  0.434434       0.066343     0.912951         0.403956
 1001      782318  0.936937       1.000000     1.000000         0.063799
 1002     1901609  0.320320       0.532362     1.000000         0.224411
 1003      117538  0.634635       1.000000     1.000000         0.956134
 1004     2747804  0.994995       0.860841     1.000000         0.246037
 ...          ...       ...            ...          ...              ...
 1995     2275233  0.837838       1.000000     1.000000         0.007619
 1996     1966639  0.628629       0.561489     0.626327         0.619440
 1997     2220731  0.691692       0.686084     0.380042         0.107434
 1998     2757494  0.260260       0.006472     0.397028         0.253732
 1999     2424570  0.480480       0.720065     0.779193         0.219409
 
 [1000 rows x 5 columns]]

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Create an instance of Random Forest regressor
# (n_jobs=-1 only parallelises tree building; with a fixed random_state the
# fitted forest is the same)
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Keep only the (X, y) batch pairs that contain no null values
valid_X_batches = []
valid_y_batches = []
for X_batch, y_batch in zip(X_batches_scaled, y_batches):
    if X_batch.isnull().sum().sum() == 0 and y_batch.isnull().sum() == 0:
        valid_X_batches.append(X_batch)
        valid_y_batches.append(y_batch)

if not valid_X_batches:
    raise ValueError("No batch without null values is available to train the model")

# RandomForestRegressor.fit() discards any previous fit, so calling it once per
# batch leaves a model trained on the last batch only. Stack the valid batches
# and fit once instead. ignore_index keeps X and y aligned by position.
rf.fit(
    pd.concat(valid_X_batches, ignore_index=True),
    pd.concat(valid_y_batches, ignore_index=True),
)


In [None]:
import pickle

# Persist the random forest object `rf` to Drive in pickle format
filename = '/content/drive/MyDrive/Colab Notebooks/random_forest_model.pkl'
with open(filename, 'wb') as file:  # binary mode is required by pickle
    pickle.dump(rf, file)

In [None]:
# Load the test set from the mounted Drive folder
test_data = pd.read_csv('/content/drive/MyDrive/dataset/test.csv')  # Load product data from CSV file

# Preview the first rows to check which columns were read
test_data.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
0,604373,Manuel d'Héliogravure Et de Photogravure En Re...,,,6142
1,1729783,DCGARING Microfiber Throw Blanket Warm Fuzzy P...,[QUALITY GUARANTEED: Luxury cozy plush polyest...,<b>DCGARING Throw Blanket</b><br><br> <b>Size ...,1622
2,1871949,I-Match Auto Parts Front License Plate Bracket...,"[Front License Plate Bracket Made Of Plastic,D...",Replacement for The Following Vehicles:2020 LE...,7540
3,1107571,PinMart Gold Plated Excellence in Service 1 Ye...,[Available as a single item or bulk packed. Se...,Our Excellence in Service Lapel Pins feature a...,12442
4,624253,"Visual Mathematics, Illustrated by the TI-92 a...",,,6318


In [None]:
X = test_data

# Set the batch size
batch_size = 1000

# Split the dataset into batches
X_batches_test = []
# The test set has no PRODUCT_LENGTH column, so there are no target batches.
# The name is kept (as an empty list) for any cell that still refers to it;
# previously it was filled with a stale `y_batch` left over from the training cells.
y_batches_test = []

for i in range(0, len(X), batch_size):
    X_batch = X.iloc[i:i + batch_size]  # explicit positional row slice
    X_batches_test.append(X_batch)

In [None]:
# Fresh encoder for the test set; it is re-fitted for every (batch, column) pair.
# NOTE(review): these codes are fitted on test batches only, independently of
# the encoder used for the training batches.
encoder = LabelEncoder()

# Integer-encode every object-dtype column of each test batch
X_batches_encoded_test = []
for X_batch in X_batches_test:
    X_batch_encoded = X_batch.copy()  # leave the raw batch untouched
    for column in X_batch.columns:
        if X_batch[column].dtype != 'object':
            continue
        X_batch_encoded[column] = encoder.fit_transform(X_batch[column])
    X_batches_encoded_test += [X_batch_encoded]

In [None]:
# Inspect the second encoded test batch (a one-element list slice)
X_batches_encoded_test[1:2]

[      PRODUCT_ID  TITLE  BULLET_POINTS  DESCRIPTION  PRODUCT_TYPE_ID
 1000     1899013    506            579          375            10350
 1001     2391123    285             90          181             3019
 1002      255202      6            610          470             6104
 1003     2623569    644            415          290              612
 1004     2836036    435            108          241             5989
 ...          ...    ...            ...          ...              ...
 1995     2777178    953            162           54             2986
 1996      577548    903            610          470              123
 1997     1127970     63            489          219             6003
 1998     1740584    570             92          470             8084
 1999     1018896     41            243          162             3484
 
 [1000 rows x 5 columns]]

In [None]:
# Instantiate MinMaxScaler (re-fitted on every batch below)
scaler = MinMaxScaler()

# Scale the data in each batch using MinMaxScaler
X_batches_scaled_test = []
for X_batch in X_batches_encoded_test:
    X_batch_scaled = X_batch.copy()
    # PRODUCT_ID is an identifier: scaling it would replace the real ids with
    # values in [0, 1], and the prediction cells read the id from this frame.
    # The scaled training batches also keep PRODUCT_ID unscaled.
    feature_columns = [name for name in X_batch.columns if name != 'PRODUCT_ID']
    X_batch_scaled[feature_columns] = scaler.fit_transform(X_batch[feature_columns])
    X_batches_scaled_test.append(X_batch_scaled)

In [None]:
# Inspect the third scaled test batch (a one-element list slice)
X_batches_scaled_test[2:3]

[      PRODUCT_ID     TITLE  BULLET_POINTS  DESCRIPTION  PRODUCT_TYPE_ID
 2000    0.705159  0.383383       0.018779     1.000000         0.250819
 2001    0.481793  0.127127       1.000000     0.305155         0.017285
 2002    0.948843  0.590591       0.306729     0.709278         0.168355
 2003    0.961839  0.158158       0.921753     0.373196         0.728013
 2004    0.983032  0.958959       0.989045     0.230928         0.435925
 ...          ...       ...            ...          ...              ...
 2995    0.523631  0.157157       0.236307     1.000000         0.233686
 2996    0.851330  0.397397       1.000000     1.000000         0.000381
 2997    0.555597  0.506507       0.802817     0.360825         0.054291
 2998    0.241776  0.036036       1.000000     1.000000         0.471256
 2999    0.729869  0.791792       0.810642     1.000000         0.251428
 
 [1000 rows x 5 columns]]

# Predictions




In [None]:
# Read the test CSV a second time into a separate frame; the prediction cell
# below refers to it by the name test_data_for_id.
test_data_for_id = pd.read_csv('/content/drive/MyDrive/dataset/test.csv')  # Load product data from CSV file

# Preview the first rows
test_data_for_id.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
0,604373,Manuel d'Héliogravure Et de Photogravure En Re...,,,6142
1,1729783,DCGARING Microfiber Throw Blanket Warm Fuzzy P...,[QUALITY GUARANTEED: Luxury cozy plush polyest...,<b>DCGARING Throw Blanket</b><br><br> <b>Size ...,1622
2,1871949,I-Match Auto Parts Front License Plate Bracket...,"[Front License Plate Bracket Made Of Plastic,D...",Replacement for The Following Vehicles:2020 LE...,7540
3,1107571,PinMart Gold Plated Excellence in Service 1 Ye...,[Available as a single item or bulk packed. Se...,Our Excellence in Service Lapel Pins feature a...,12442
4,624253,"Visual Mathematics, Illustrated by the TI-92 a...",,,6318


In [None]:
# Collect one small DataFrame per batch and concatenate once at the end.
# DataFrame.append is deprecated (and removed in pandas 2.0), and growing a
# frame inside the loop copies all earlier rows on every iteration.
batch_predictions = []

for i, X_batch in enumerate(X_batches_scaled_test):
    # Check for null values in X_batch
    if X_batch.isnull().sum().sum() == 0:
        # Make predictions on the current batch
        y_pred_batch = rf.predict(X_batch)
        # Take the ids from the untouched test frame via the batch's row labels,
        # so they are the real PRODUCT_IDs even if the batch column was transformed
        product_ids = test_data.loc[X_batch.index, 'PRODUCT_ID'].values
        # Create a DataFrame with PRODUCT_ID and PRODUCT_LENGTH columns
        batch_predictions.append(
            pd.DataFrame({'PRODUCT_ID': product_ids, 'PRODUCT_LENGTH': y_pred_batch})
        )
    else:
        # Make skipped batches visible instead of silently dropping their rows
        print(f"Skipping test batch {i}: it contains null values")

# The output has exactly the two columns produced per batch; the old
# 'product_id'/'prediction' header added two extra all-NaN columns.
if batch_predictions:
    predictions_df = pd.concat(batch_predictions, ignore_index=True)
else:
    predictions_df = pd.DataFrame(columns=['PRODUCT_ID', 'PRODUCT_LENGTH'])

# Save the predictions to a CSV file
predictions_df.to_csv('/content/drive/MyDrive/Colab Notebooks/predictions.csv', index=False)

  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = predictions_df.append(batch_df, ignore_index=True)
  predictions_df = p

In [None]:
# Second version of the prediction step: product ids come from test_data_for_id.
# Per-batch frames are collected in a list and concatenated once, because
# DataFrame.append is deprecated (and removed in pandas 2.0).
batch_predictions = []

for i, X_batch in enumerate(X_batches_scaled_test):
    # Check for null values in X_batch
    if X_batch.isnull().sum().sum() == 0:
        # Make predictions on the current batch
        y_pred_batch = rf.predict(X_batch)
        # test_data_for_id[i] would look up a COLUMN named i and raise KeyError;
        # select the PRODUCT_ID values of this batch's rows by row label instead.
        product_ids = test_data_for_id.loc[X_batch.index, 'PRODUCT_ID'].values
        # Create a DataFrame with PRODUCT_ID and PRODUCT_LENGTH columns
        batch_predictions.append(
            pd.DataFrame({'PRODUCT_ID': product_ids, 'PRODUCT_LENGTH': y_pred_batch})
        )
    else:
        # Make skipped batches visible instead of silently dropping their rows
        print(f"Skipping test batch {i}: it contains null values")

# Only the two columns produced per batch are written out
if batch_predictions:
    predictions_df = pd.concat(batch_predictions, ignore_index=True)
else:
    predictions_df = pd.DataFrame(columns=['PRODUCT_ID', 'PRODUCT_LENGTH'])

# Save the predictions to a CSV file
predictions_df.to_csv('/content/drive/MyDrive/Colab Notebooks/predictions.csv', index=False)

In [None]:
# vectorizing the data

# Convert text data into numerical features
vectorizer = TfidfVectorizer(max_features=5000)  # Instantiate a TF-IDF vectorizer with maximum 5000 features

# Fill missing values BEFORE concatenating: string + NaN is NaN, so filling
# afterwards blanked the whole text (title included) of every product that
# lacked a description or bullet points.
text_columns = data[['TITLE', 'DESCRIPTION', 'BULLET_POINTS']].fillna('')
X_text = text_columns['TITLE'] + ' ' + text_columns['DESCRIPTION'] + ' ' + text_columns['BULLET_POINTS']  # Concatenate product title, description, and bullet points
X_text_features = vectorizer.fit_transform(X_text)  # Convert text data into numerical features (sparse matrix)


In [None]:
# Show the summary (shape, dtype, stored elements) of the TF-IDF matrix
X_text_features

<2249698x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 103093021 stored elements in Compressed Sparse Row format>

In [None]:
# Perform data cleaning and feature engineering, e.g., handling missing values
# For example, you can use pandas functions to preprocess the data, such as:
# data = data.dropna()  # Remove rows with missing values

from scipy import sparse

# Combine numerical features with other metadata
X_numerical = data.drop(['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'PRODUCT_LENGTH'], axis=1)  # Drop text columns and product length column

# Keep the feature matrix sparse. Densifying the 2249698 x 5000 float64 TF-IDF
# matrix with .toarray() needs roughly 90 GB of memory. The numerical columns
# are converted to a sparse block and stacked to the left of the text features,
# which keeps the same column order as before (numerical first, then text).
X = sparse.hstack(
    [sparse.csr_matrix(X_numerical.to_numpy(dtype=float)), X_text_features],
    format='csr',
)

# Split data into training and testing sets
y = data['PRODUCT_LENGTH']  # Target variable (product length)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Split data into 80% training and 20% testing sets


In [None]:

# Train a machine learning model
# random_state=42 matches the seed used for the train/test split, so the
# reported metrics are reproducible on a re-run. n_jobs=-1 builds the trees on
# all cores and does not change the fitted model for a fixed seed.
model = RandomForestRegressor(random_state=42, n_jobs=-1)  # Instantiate a random forest regressor model
model.fit(X_train, y_train)  # Train the model on the training data

# Evaluate the model
y_pred_train = model.predict(X_train)  # Predict product lengths on training data
train_mape = mean_absolute_percentage_error(y_train, y_pred_train)  # Calculate mean absolute percentage error on training data
y_pred_test = model.predict(X_test)  # Predict product lengths on testing data
test_mape = mean_absolute_percentage_error(y_test, y_pred_test)  # Calculate mean absolute percentage error on testing data
print(f'Train MAPE: {train_mape:.2%}')
print(f'Test MAPE: {test_mape:.2%}')

# Calculate score using the provided formula (floored at 0 when MAPE exceeds 100%)
score = max(0, 100 * (1 - test_mape))
print(f'Score: {score:.2f}%')

# Model deployment and maintenance
# Once the model is deployed in a production environment, it can be used to make real-time predictions on new data. The model should be monitored and maintained periodically to ensure its accuracy and reliability, and updated as needed to account for changes in data or business requirements.
