In [1]:
import datasets
import numpy as np
import os
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.optim as optim

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.linear_model import LinearRegression

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def group_by_product_id(df):
    """
    Group a dataframe by PRODUCT_TYPE_ID and return a dictionary with PRODUCT_TYPE_ID as key and an
    array of the dataframe with that product_id as value.
    
    Parameters:
        df (pandas.DataFrame): the dataframe to group
        
    Returns:
        dict: a dictionary with product_id as key and an array of the dataframe with that product_id as value
    """
    result = {}
    for product_id in df['PRODUCT_TYPE_ID'].unique():
        result[product_id] = df[df['PRODUCT_TYPE_ID'] == product_id]
    return result

def clean_data(df):
    # remove HTML tags from the text
    df['TEXT'] = df['TEXT'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

    # remove any non-alphanumeric characters from the text
    df['TEXT'] = df['TEXT'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\*\'\"\s]', '', x))

    # convert all text to lowercase
    df['TEXT'] = df['TEXT'].apply(lambda x: x.lower())

    # tokenize the text
    df['TEXT'] = df['TEXT'].apply(lambda x: x.split())

    # remove stop words using NLTK
    stop_words = set(stopwords.words('english'))
    df['TEXT'] = df['TEXT'].apply(lambda x: [word for word in x if word not in stop_words])

    # remove words with fewer than three characters
    df['TEXT'] = df['TEXT'].apply(lambda x: [word for word in x if len(word) > 2])

    # join the tokens back into a single string
    df['TEXT'] = df['TEXT'].apply(lambda x: ' '.join(x))

    return df

def preprocess(df):
    df = df.fillna("")
    df["BULLET_POINTS"] = df["BULLET_POINTS"].apply(lambda x: " ".join(str(x).strip()[1 : -1].split(",")))
    df["TEXT"] = df["TITLE"] + " " + df["BULLET_POINTS"] + " " + df["DESCRIPTION"]
#     df = clean_data(df)
    return df

In [3]:
DATASET_FOLDER = "/Users/software/Desktop/amazon-ml-challenge/dataset"
train_file = os.path.join(DATASET_FOLDER, "train.csv")
test_file = os.path.join(DATASET_FOLDER, "test.csv")
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

In [4]:
train_df = preprocess(train_df)
test_df = preprocess(test_df)


In [5]:
train_df

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH,TEXT
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,LUXURIOUS & APPEALING: Beautiful custom-made c...,,1650,2125.980000,ArtzFolio Tulip Flowers Blackout Curtain for D...
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,Harry Potter Hedwig Pyjamas (6-16 Yrs) 100% co...,,2755,393.700000,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,Loud Dual Tone Trumpet Horn Compatible With S...,"Specifications: Color: Red, Material: Aluminiu...",7537,748.031495,PRIKNIK Horn Red Electric Air Horn Compressor ...
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,Made By 95%cotton and 5% Lycra which gives you...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574,ALISHAH Women's Cotton Ankle Length Leggings C...
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424000,The United Empire Loyalists: A Chronicle of th...
...,...,...,...,...,...,...,...
2249693,2422167,Nike Women's As W Ny Df Swsh Hn Kh Bra (CZ7610...,aterial : Polyeste,,3009,1181.100000,Nike Women's As W Ny Df Swsh Hn Kh Bra (CZ7610...
2249694,2766635,"(3PCS) Goose Game Cute Cartoon Enamel Pins, Fu...",❤ [Inspiration] Inspired by the Untitled Goose...,<p><b>[Brand]: </b>XVIEONR</p> <p><br></p> <p>...,3413,125.984252,"(3PCS) Goose Game Cute Cartoon Enamel Pins, Fu..."
2249695,1987786,Kangroo Sweep Movement Printed Wooden Wall Clo...,Dial size: 12 inches in diameter Big clear re...,Wall Clocks Are Very Attractive In Looks And E...,1574,1200.000000,Kangroo Sweep Movement Printed Wooden Wall Clo...
2249696,1165754,Electro Voice EKX-BRKT15 | Wall Mount Bracket ...,,,592,2900.000000,Electro Voice EKX-BRKT15 | Wall Mount Bracket ...


In [8]:
train_df.to_csv("csv.csv")

In [7]:
df = train_df
matches = df['BULLET_POINTS'].str.extract(dimension_pattern)

NameError: name 'dimension_pattern' is not defined

In [None]:
matches

In [None]:
df = df.join(matches)
df

In [None]:
df[8] = df[8].fillna('in')
df.loc[df[8].str.contains('in'), 0] = df.loc[df[8].str.contains('in'), 0].apply(lambda x: float(x) * 100)
df.loc[df[8].str.contains('cm'), 0] = df.loc[df[8].str.contains('cm'), 0].apply(lambda x: float(x) / 2.54 * 100)
df.loc[df[8].str.contains('mm'), 0] = df.loc[df[8].str.contains('mm'), 0].apply(lambda x: float(x) / 25.4 * 100)

In [None]:
groups = group_by_product_id(train_df)
groups = list(groups.items())
groups.sort(key = lambda x: len(x[1]), reverse = True)
type_to_mean = {}
for key, value in groups:
    type_to_mean[str(key)] = value["PRODUCT_LENGTH"].median()
answer = {}
for i in range(len(test_df)):
    data = test_df.iloc[i]
    pid = data["PRODUCT_ID"]
    key = str(data["PRODUCT_TYPE_ID"])
    value = 1
    if key in type_to_mean.keys():
        value = type_to_mean[key]
    answer[pid] = value

In [None]:
answer_df = pd.DataFrame(answer.items(), columns = ["PRODUCT_ID", "PRODUCT_LENGTH"])

In [None]:
# Merge df1 and df2 on the "PRODUCT_ID" column
merged_df = pd.merge(answer_df[["PRODUCT_ID", "PRODUCT_LENGTH"]], df[["PRODUCT_ID", 0]], on="PRODUCT_ID", how="left")

# Replace missing values in "PRODUCT_LENGTH_x" with the corresponding values from "PRODUCT_LENGTH_y"
merged_df[0].fillna(merged_df["PRODUCT_LENGTH"], inplace=True)

# Drop the "PRODUCT_LENGTH_y" column
merged_df.drop(columns=["PRODUCT_LENGTH"], inplace=True)

# Rename the "PRODUCT_LENGTH_x" column to "PRODUCT_LENGTH"
merged_df.rename(columns={0: "PRODUCT_LENGTH"}, inplace=True)

# Update the "PRODUCT_LENGTH" column in df1 with the values from the merged dataframe
answer_df["PRODUCT_LENGTH"] = merged_df["PRODUCT_LENGTH"]

In [None]:
answer_df

In [None]:
sample_df = pd.read_csv("../dataset/sample_submission.csv")
real_answer = {}
factor = 0.95

for i in range(len(answer_df)):
    from_sample = sample_df.iloc[i]
    from_ours = answer_df.iloc[i]
    v1 = from_sample["PRODUCT_LENGTH"]
    v2 = float(from_ours["PRODUCT_LENGTH"])
    pid = int(from_sample["PRODUCT_ID"])
    if v2 >= v1 + factor * v1 or v2 <= v1 - factor * v1:
        real_answer[pid] = v1
    else:
        real_answer[pid] = v2

real_answer_df = pd.DataFrame(real_answer.items(), columns = ["PRODUCT_ID", "PRODUCT_LENGTH"])
real_answer_df.to_csv("submission.csv", index = False)

In [None]:
real_answer_df

In [None]:
text = list(train_df["TEXT"])
print(text)


In [None]:
text = list(train_df["TEXT"])
import spacy
nlp = spacy.load("en_core_web_sm")
ents=[]
for string in text:
    doc = nlp(string)
