In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from transformers import pipeline
from gensim.models import Word2Vec
from catboost import CatBoostRegressor

In [2]:
# Load the pre-cleaned train and test splits from the working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('cleaned_train.csv', 'cleaned_test.csv')
)

In [3]:
# Handle missing values BEFORE casting to str: `astype(str)` turns NaN into the
# literal string 'nan', which `dropna()` can no longer detect.
train_df = train_df.dropna()
train_df = train_df.assign(text=train_df['text'].astype(str))

# Every test row must be kept (one prediction per PRODUCT_ID), so missing text
# becomes an empty string instead of the fake token 'nan'.
test_df = test_df.assign(text=test_df['text'].fillna('').astype(str))

train_df.head()

Unnamed: 0,PRODUCT_TYPE_ID,PRODUCT_LENGTH,text
0,1650,2125.98,opaque eyelets fading office curtainluxurious ...
1,2755,393.7,hedwig spencer 98 marks elastane harry exclusi...
2,7537,748.031495,pump 12v v compatible aluminum 130db vehicles ...
3,2996,787.401574,chart fabric lycra ankel may 2aishah pack legg...
4,5725,950.0,like 02cm home makes better holder planter all...


In [4]:
# Keep the product IDs aside: the column is dropped from test_df below, but the
# IDs are needed again when the submission frame is built.
prod_id=test_df['PRODUCT_ID']

In [5]:
# Preview the first test rows (PRODUCT_ID is still present at this point).
test_df.head()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,text
0,604373,6142,hliogravure relief traditions d1890 savoirs de...
1,1729783,1622,fleece low chart hand fabric microfiber medita...
2,1871949,7540,2018 2020 auto holder nx300 textured fit numbe...
3,1107571,12442,3mm includes lapel award andgold longtime pin ...
4,624253,6318,ti89 mathematics visual ti92 illustrated


In [6]:
# PRODUCT_ID is an identifier, not a feature; it was saved in `prod_id` above.
test_df = test_df.drop(columns=['PRODUCT_ID'])

In [7]:
# Preview the first training rows before any text processing.
train_df.head()

Unnamed: 0,PRODUCT_TYPE_ID,PRODUCT_LENGTH,text
0,1650,2125.98,opaque eyelets fading office curtainluxurious ...
1,2755,393.7,hedwig spencer 98 marks elastane harry exclusi...
2,7537,748.031495,pump 12v v compatible aluminum 130db vehicles ...
3,2996,787.401574,chart fabric lycra ankel may 2aishah pack legg...
4,5725,950.0,like 02cm home makes better holder planter all...


In [8]:
import re
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

# Make sure the NLTK stop-word corpus is available locally (downloads it if needed).
nltk.download('stopwords')

# English stop words used by preprocess() to filter tokens.
stop_words = stopwords.words('english')

# Cached set view of `stop_words`, built on the first preprocess() call.
# Re-run this cell after changing `stop_words` so the cache is rebuilt.
_stop_word_lookup = None


def preprocess(text):
    """Lower-case ``text``, strip punctuation and drop stop words.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    list of str
        Whitespace-separated tokens in their original order, excluding every
        token found in the module-level ``stop_words`` collection.
    """
    global _stop_word_lookup
    if _stop_word_lookup is None:
        # `stop_words` is a list; scanning it for every token of every row is
        # needlessly slow, so convert it once to a frozenset for O(1) lookups.
        _stop_word_lookup = frozenset(stop_words)

    text = text.lower()
    # Remove every character that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    return [token for token in text.split() if token not in _stop_word_lookup]

# Tokenise every training document into a list of cleaned tokens.
processed_docs_train = train_df['text'].apply(preprocess)
# The test split is tokenised the same way in the next cell.

[nltk_data] Downloading package stopwords to C:\Users\Antony
[nltk_data]     Joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Tokenise the test documents with the same preprocess() used for training.
processed_docs_test = test_df['text'].apply(preprocess)

In [10]:
# Confirm the container type of the tokenised documents.
type(processed_docs_train)

pandas.core.series.Series

In [11]:
import time

# Train 100-dimensional Word2Vec embeddings on the tokenised training text.
# min_count=1 keeps every token, including ones that occur only once.
# NOTE(review): with workers=4 the run is presumably not bit-for-bit
# reproducible between executions -- confirm against the gensim docs if exact
# reproducibility matters.
#
# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during this long-running cell.
start_time = time.perf_counter()
model = Word2Vec(processed_docs_train, vector_size=100, window=5, min_count=1, workers=4)
end_time = time.perf_counter()
time_taken = end_time - start_time

print("Time taken:", time_taken, "seconds")

Time taken: 827.8860096931458 seconds


In [12]:
# Re-join each token list into a single space-separated string per row.
train_df['PROCESSED_TITLE'] = [' '.join(tokens) for tokens in processed_docs_train]
test_df['PROCESSED_TITLE'] = [' '.join(tokens) for tokens in processed_docs_test]

In [13]:
# Preview the training rows with the new PROCESSED_TITLE column.
train_df.head()

Unnamed: 0,PRODUCT_TYPE_ID,PRODUCT_LENGTH,text,PROCESSED_TITLE
0,1650,2125.98,opaque eyelets fading office curtainluxurious ...,opaque eyelets fading office curtainluxurious ...
1,2755,393.7,hedwig spencer 98 marks elastane harry exclusi...,hedwig spencer 98 marks elastane harry exclusi...
2,7537,748.031495,pump 12v v compatible aluminum 130db vehicles ...,pump 12v v compatible aluminum 130db vehicles ...
3,2996,787.401574,chart fabric lycra ankel may 2aishah pack legg...,chart fabric lycra ankel may 2aishah pack legg...
4,5725,950.0,like 02cm home makes better holder planter all...,like 02cm home makes better holder planter all...


In [14]:
# Preview the test rows with the new PROCESSED_TITLE column.
test_df.head()

Unnamed: 0,PRODUCT_TYPE_ID,text,PROCESSED_TITLE
0,6142,hliogravure relief traditions d1890 savoirs de...,hliogravure relief traditions d1890 savoirs de...
1,1622,fleece low chart hand fabric microfiber medita...,fleece low chart hand fabric microfiber medita...
2,7540,2018 2020 auto holder nx300 textured fit numbe...,2018 2020 auto holder nx300 textured fit numbe...
3,12442,3mm includes lapel award andgold longtime pin ...,3mm includes lapel award andgold longtime pin ...
4,6318,ti89 mathematics visual ti92 illustrated,ti89 mathematics visual ti92 illustrated


In [15]:
# The raw text is no longer needed now that PROCESSED_TITLE holds the cleaned form.
train_df = train_df.drop(columns=['text'])
test_df = test_df.drop(columns=['text'])

In [16]:
import numpy as np
def sentence_embedding(sentence):
    """Average the Word2Vec vectors of the known words in ``sentence``.

    Parameters
    ----------
    sentence : str
        Whitespace-separated tokens. Leading/trailing ``,`` and ``.`` are
        stripped from each token before the vocabulary lookup.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of the
        vectors of all in-vocabulary tokens, or all zeros when none is known.

    Notes
    -----
    Reads the module-level ``model`` (the trained Word2Vec instance).
    """
    keyed_vectors = model.wv  # look the attribute up once, not once per token
    vocabulary = keyed_vectors.key_to_index

    stripped = (word.strip(",.") for word in sentence.split())
    vectors = [keyed_vectors[word] for word in stripped if word in vocabulary]

    if not vectors:
        # float32 keeps the fallback consistent with the averaged vectors
        # (assumes gensim stores word vectors as float32 -- its default).
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [17]:
# Embed every processed title; each cell of TITLE_EMBED holds one vector.
for frame in (train_df, test_df):
    frame['TITLE_EMBED'] = frame['PROCESSED_TITLE'].apply(sentence_embedding)

In [18]:
# Inspect the first 100 training rows with their TITLE_EMBED vectors.
train_df.head(100)

Unnamed: 0,PRODUCT_TYPE_ID,PRODUCT_LENGTH,PROCESSED_TITLE,TITLE_EMBED
0,1650,2125.980000,opaque eyelets fading office curtainluxurious ...,"[0.3200476, -0.37408897, -0.21521586, 0.453634..."
1,2755,393.700000,hedwig spencer 98 marks elastane harry exclusi...,"[0.84529656, -0.47621465, -0.6264052, -1.06366..."
2,7537,748.031495,pump 12v v compatible aluminum 130db vehicles ...,"[-0.4407191, 0.4794889, 0.12400802, 0.75253576..."
3,2996,787.401574,chart fabric lycra ankel may 2aishah pack legg...,"[-0.093169995, -1.0705742, 0.034363437, 0.0701..."
4,5725,950.000000,like 02cm home makes better holder planter all...,"[-0.7195843, -0.39701036, -0.16367403, 0.29022..."
...,...,...,...,...
95,13121,118.110236,safety safety safe citrus art prolonged mandol...,"[-0.6474767, -0.051752068, -0.33241257, -0.326..."
96,5904,1023.622046,pool holder planter hall floor pegs art powder...,"[-0.20969716, -0.18236229, -0.22812472, 0.2845..."
97,12064,393.700787,stylish mobile cases non toxic case environmen...,"[-0.83922184, -1.2259954, 0.18773763, 0.631459..."
98,2703,472.440944,shoulders regular wear free tees crew comforta...,"[-0.2297708, -2.0455234, -0.27429676, -0.30159..."


In [21]:
# PRODUCT_LENGTH is fractional (e.g. 2125.98, 393.7). A bare `astype(int)`
# truncates toward zero (2125.98 -> 2125), biasing every label downward, so
# round to the nearest integer before casting.
y_train = train_df['PRODUCT_LENGTH'].round().astype(int)

# Features: everything except the target and the raw processed text.
X_train = train_df.drop(columns=['PROCESSED_TITLE', 'PRODUCT_LENGTH'])

In [22]:
# Preview the test rows with their TITLE_EMBED vectors.
test_df.head()

Unnamed: 0,PRODUCT_TYPE_ID,PROCESSED_TITLE,TITLE_EMBED
0,6142,hliogravure relief traditions d1890 savoirs de...,"[0.17557926, 0.54150605, -0.29175466, -0.19448..."
1,1622,fleece low chart hand fabric microfiber medita...,"[-0.43800655, -1.3380965, 0.47476885, -0.45230..."
2,7540,2018 2020 auto holder nx300 textured fit numbe...,"[0.017787194, 0.3894162, -0.08090764, 1.026442..."
3,12442,3mm includes lapel award andgold longtime pin ...,"[-0.39270768, -0.41911897, 0.13909887, 0.61163..."
4,6318,ti89 mathematics visual ti92 illustrated,"[0.5423145, 0.8735342, 0.017060176, -0.6309894..."


In [23]:
# The text column has been converted to TITLE_EMBED; drop it from the test features.
test_df = test_df.drop(columns=['PROCESSED_TITLE'])

In [24]:
def append_func(row):
    """Return the row's TITLE_EMBED vector with its PRODUCT_TYPE_ID appended as the last element."""
    embedding = row['TITLE_EMBED']
    type_id = row['PRODUCT_TYPE_ID']
    return np.append(embedding, type_id)

# Apply append_func to every row (axis=1) to build one combined feature array
# per row: the TITLE_EMBED vector followed by PRODUCT_TYPE_ID.
# The training arrays are computed from train_df and stored on X_train.
X_train['appended_array'] = train_df.apply(append_func, axis=1)
test_df['appended_array'] = test_df.apply(append_func, axis=1)

In [28]:
# Check column dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1184066 entries, 0 to 1184065
Data columns (total 3 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   PRODUCT_TYPE_ID  1184066 non-null  int64 
 1   TITLE_EMBED      1184066 non-null  object
 2   appended_array   1184066 non-null  object
dtypes: int64(1), object(2)
memory usage: 36.1+ MB


In [29]:
# Check column dtypes and non-null counts of the test features.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734736 entries, 0 to 734735
Data columns (total 3 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   PRODUCT_TYPE_ID  734736 non-null  int64 
 1   TITLE_EMBED      734736 non-null  object
 2   appended_array   734736 non-null  object
dtypes: int64(1), object(2)
memory usage: 16.8+ MB


In [30]:
# Pull out the column of per-row feature arrays as a NumPy array.
# NOTE: X_train and test_df are rebound from DataFrames to arrays here, so this
# cell cannot be re-run on its own without re-running the cells that build the frames.
X_train=X_train['appended_array'].to_numpy()
test_df=test_df['appended_array'].to_numpy()

In [31]:
# Stack the per-row arrays into a single 2-D matrix (one row per sample).
X_train=np.stack(X_train)
test_df=np.stack(test_df)

In [32]:
# Each feature row should hold the 100 embedding values plus PRODUCT_TYPE_ID.
X_train[0].size

101

In [33]:
# Shape of the stacked test feature matrix: (rows, features).
test_df.shape

(734736, 101)

In [34]:
# The number of saved product IDs must match the number of test rows.
len(prod_id)

734736

In [35]:
from sklearn.neighbors import KNeighborsClassifier

# Each integer length in y_train is treated as a class label.
# n_jobs=-1 runs the neighbour search on all CPU cores; it does not change the
# predictions, only how fast predict() gets through the large test matrix.
# NOTE(review): PRODUCT_TYPE_ID (values in the thousands) sits unscaled next to
# embedding values of roughly unit magnitude, so it presumably dominates the
# distance metric -- confirm this is intended.
knn = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [None]:
# Predict a length label for every row of the test feature matrix.
y_pred = knn.predict(test_df)

In [None]:
# Pair each saved PRODUCT_ID with its predicted length; both are in test-row order.
submission_df = pd.DataFrame({'PRODUCT_ID': prod_id, 'PRODUCT_LENGTH': y_pred})

In [None]:
# Use PRODUCT_ID as the index so it is written as the first CSV column.
# Guarded so that re-running this cell does not raise KeyError once PRODUCT_ID
# has already been moved into the index.
if submission_df.index.name != 'PRODUCT_ID':
    submission_df = submission_df.set_index('PRODUCT_ID')
submission_df.to_csv('sample_submission10.csv')