In [1]:
import pandas as pd
import numpy as np
import nltk

# Loading Data

In [2]:
train=pd.read_csv("dataset/train.csv")
test=pd.read_csv("dataset/test.csv")
print(train.shape)
print(test.shape)

(2249698, 6)
(734736, 5)


In [3]:
train=train.iloc[:1000000,:]
test=test.iloc[:30000,:]

In [4]:
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   PRODUCT_ID       1000000 non-null  int64  
 1   TITLE            999995 non-null   object 
 2   BULLET_POINTS    627877 non-null   object 
 3   DESCRIPTION      485344 non-null   object 
 4   PRODUCT_TYPE_ID  1000000 non-null  int64  
 5   PRODUCT_LENGTH   1000000 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 45.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   PRODUCT_ID       30000 non-null  int64 
 1   TITLE            30000 non-null  object
 2   BULLET_POINTS    18765 non-null  object
 3   DESCRIPTION      14500 non-null  object
 4   PRODUCT_TYPE_ID  30000 non-null  int64 
dtypes: int64(

In [5]:
print(train.isna().sum())
print(test.isna().sum())

PRODUCT_ID              0
TITLE                   5
BULLET_POINTS      372123
DESCRIPTION        514656
PRODUCT_TYPE_ID         0
PRODUCT_LENGTH          0
dtype: int64
PRODUCT_ID             0
TITLE                  0
BULLET_POINTS      11235
DESCRIPTION        15500
PRODUCT_TYPE_ID        0
dtype: int64


# Data Cleaning

In [6]:
train=train.fillna("")
test=test.fillna("")
print(train.isna().sum())
print(test.isna().sum())

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
PRODUCT_LENGTH     0
dtype: int64
PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
dtype: int64


In [7]:
train2=train.drop("PRODUCT_TYPE_ID", axis=1)
train2=train2.drop("PRODUCT_LENGTH", axis=1)
test2=test.drop("PRODUCT_TYPE_ID", axis=1)
print(train2.shape)
print(test2.shape)

(1000000, 4)
(30000, 4)


In [8]:
target=train.PRODUCT_LENGTH
print(target)

0         2125.980000
1          393.700000
2          748.031495
3          787.401574
4          598.424000
             ...     
999995     225.000000
999996     650.000000
999997     118.750000
999998    5905.511805
999999      13.779500
Name: PRODUCT_LENGTH, Length: 1000000, dtype: float64


In [9]:
import string
import re
def clean(text):
    
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text=re.sub('[^a-zA-Z\s\d]','', text)
    text = re.sub('\n', '', text)
    text = re.sub('<p><strong>', '', text)
    text = re.sub('<break>', '', text)
    #print(text)
    return text

def clean_bullets(text):
    text = str(text).lower()
    new_text=re.sub('[^a-zA-Z\s\d]','', text)
    #print(new_text)
    return new_text
#train3=pd.DataFrame({})
#test3=pd.DataFrame({})

train2["TITLE"]=train2["TITLE"].apply(clean)
train2["BULLET_POINTS"]=train2["BULLET_POINTS"].apply(clean_bullets)
train2["DESCRIPTION"]=train2["DESCRIPTION"].apply(clean)

test2["TITLE"]=test2["TITLE"].apply(clean)
test2["BULLET_POINTS"]=test2["BULLET_POINTS"].apply(clean_bullets)
test2["DESCRIPTION"]=test2["DESCRIPTION"].apply(clean)

In [10]:
train2

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION
0,1925202,artzfolio tulip flowers blackout curtain for d...,luxurious appealing beautiful custommade curt...,
1,2673191,marks spencer girls pyjama sets t862561cnavy ...,harry potter hedwig pyjamas 616 yrs100 cotton ...,
2,2765088,priknik horn red electric air horn compressor ...,loud dual tone trumpet horn compatible with sx...,specifications color red material aluminium vo...
3,1594019,alishah womens cotton ankle length leggings co...,made by 95cotton and 5 lycra which gives you 4...,aishah womens lycra cotton ankel leggings bran...
4,283658,the united empire loyalists a chronicle of the...,,
...,...,...,...,...
999995,949151,alltrade 948002 power steering pump pulley kit...,made to be the most reliable tools in the mark...,
999996,157728,the graduates predictionary a safedeposit box ...,,
999997,1152059,lechat 3 rolls holographic nail striping tapes...,,lechat dare to wear nail polish fizzy apple 05...
999998,2871659,sweet homes 25 inchess luxury shag collection ...,25 inchess pile height and ultra fluffy thickn...,super soft fluffy rug uses high density piles ...


In [11]:
test2

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION
0,604373,manuel dhliogravure et de photogravure en reli...,,
1,1729783,dcgaring microfiber throw blanket warm fuzzy p...,quality guaranteed luxury cozy plush polyester...,dcgaring throw blanket size chart w x l twin...
2,1871949,imatch auto parts front license plate bracket ...,front license plate bracket made of plasticdir...,replacement for the following vehicles2020 lex...
3,1107571,pinmart gold plated excellence in service 1 ye...,available as a single item or bulk packed sele...,our excellence in service lapel pins feature a...
4,624253,visual mathematics illustrated by the ti92 and...,,
...,...,...,...,...
29995,1604180,good cushion 4202 bamboo utensil drawer organi...,get organized with this perfect home solutionn...,
29996,89079,life of josiah quincy of massachusetts,,
29997,1173553,m10x55mm thread 80mm long lever lathe adjustab...,weight 328g product name adjustable clamping h...,features spring loaded bolt highimpact antiaci...
29998,532202,angie lewin wall calendar 2017 art calendar,,


In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

def tokenize(text):
    new_text = ''.join([i for i in text if not i.isdigit()])
    #new_text = ' '.join([word for word in new_text if len(word) > 10])
    word_tokens=word_tokenize(new_text)
    #print(word_tokens)
    return word_tokens

stop_words = set(stopwords.words('english')) 
def stopword_removal(text):
    filtered_sentence = [] 
    for w in text:
        if w not in stop_words: 
            filtered_sentence.append(w)
    #print(filtered_sentence)
    return filtered_sentence
#train4=pd.DataFrame({})
#test4=pd.DataFrame({})
train2["TITLE"]=train2["TITLE"].apply(tokenize)
train2["TITLE"]=train2["TITLE"].apply(stopword_removal)
train2["BULLET_POINTS"]=train2["BULLET_POINTS"].apply(tokenize)
train2["BULLET_POINTS"]=train2["BULLET_POINTS"].apply(stopword_removal)
train2["DESCRIPTION"]=train2["DESCRIPTION"].apply(tokenize)
train2["DESCRIPTION"]=train2["DESCRIPTION"].apply(stopword_removal)

test2["TITLE"]=test2["TITLE"].apply(tokenize)
test2["TITLE"]=test2["TITLE"].apply(stopword_removal)
test2["BULLET_POINTS"]=test2["BULLET_POINTS"].apply(tokenize)
test2["BULLET_POINTS"]=test2["BULLET_POINTS"].apply(stopword_removal)
test2["DESCRIPTION"]=test2["DESCRIPTION"].apply(tokenize)
test2["DESCRIPTION"]=test2["DESCRIPTION"].apply(stopword_removal)


In [13]:
train2.TITLE

0         [artzfolio, tulip, flowers, blackout, curtain,...
1         [marks, spencer, girls, pyjama, sets, tcnavy, ...
2         [priknik, horn, red, electric, air, horn, comp...
3         [alishah, womens, cotton, ankle, length, leggi...
4         [united, empire, loyalists, chronicle, great, ...
                                ...                        
999995    [alltrade, power, steering, pump, pulley, kit,...
999996    [graduates, predictionary, safedeposit, box, h...
999997    [lechat, rolls, holographic, nail, striping, t...
999998    [sweet, homes, inchess, luxury, shag, collecti...
999999    [diamond, wish, k, yellow, gold, single, princ...
Name: TITLE, Length: 1000000, dtype: object

In [14]:
test2

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION
0,604373,"[manuel, dhliogravure, et, de, photogravure, e...",[],[]
1,1729783,"[dcgaring, microfiber, throw, blanket, warm, f...","[quality, guaranteed, luxury, cozy, plush, pol...","[dcgaring, throw, blanket, size, chart, w, x, ..."
2,1871949,"[imatch, auto, parts, front, license, plate, b...","[front, license, plate, bracket, made, plastic...","[replacement, following, vehicles, lexus, nxh,..."
3,1107571,"[pinmart, gold, plated, excellence, service, y...","[available, single, item, bulk, packed, select...","[excellence, service, lapel, pins, feature, mm..."
4,624253,"[visual, mathematics, illustrated, ti, ti]",[],[]
...,...,...,...,...
29995,1604180,"[good, cushion, bamboo, utensil, drawer, organ...","[get, organized, perfect, home, solutionnatura...",[]
29996,89079,"[life, josiah, quincy, massachusetts]",[],[]
29997,1173553,"[mxmm, thread, mm, long, lever, lathe, adjusta...","[weight, g, product, name, adjustable, clampin...","[features, spring, loaded, bolt, highimpact, a..."
29998,532202,"[angie, lewin, wall, calendar, art, calendar]",[],[]


In [15]:
import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
    lemma_word = ""
    wordnet_lemmatizer = WordNetLemmatizer()
    for w in text:
        word1 = wordnet_lemmatizer.lemmatize(w, pos = "n")
        word2 = wordnet_lemmatizer.lemmatize(word1, pos = "v")
        word3 = wordnet_lemmatizer.lemmatize(word2, pos = ("a"))
        lemma_word=" ".join([lemma_word,word3])
    #print(lemma_word)
    return lemma_word

train2.TITLE=train2.TITLE.apply(lemmatize_text)
train2.BULLET_POINTS=train2.BULLET_POINTS.apply(lemmatize_text)
train2.DESCRIPTION=train2.DESCRIPTION.apply(lemmatize_text)

test2.TITLE=test2.TITLE.apply(lemmatize_text)
test2.BULLET_POINTS=test2.BULLET_POINTS.apply(lemmatize_text)
test2.DESCRIPTION=test2.DESCRIPTION.apply(lemmatize_text)


In [16]:
print(train2.shape)
print(test2.shape)

(1000000, 4)
(30000, 4)


In [17]:
train2=train2.TITLE+train2.BULLET_POINTS+train2.DESCRIPTION
test2=test2.TITLE+test2.BULLET_POINTS+test2.DESCRIPTION
print(train2.shape)
print(test2.shape)

(1000000,)
(30000,)


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
transformed_train=TfidfVectorizer(max_df=0.7).fit_transform(train2)
transformed_test=TfidfVectorizer(max_df=0.7).fit_transform(test2)


In [19]:
print(transformed_train.shape)
print(transformed_test.shape)

(1000000, 1615799)
(30000, 135318)


In [20]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()



In [None]:
model.fit(transformed_train,target)
print(model.score(transformed_train,target))
import pickle
pickle.dump(model, open("amlp.sav", 'wb'))

In [None]:
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt

train_pred=model.predict(transformed_train)
train_errors = mean_squared_error(target, train_pred)
'''for i in range(len(target)):
    # calculate error
    err = (target[i] - y_pred[i])**2
    # store error
    errors.append(err)
    # report error
    #print('>%.1f, %.1f = %.3f' % (target[i], y_pred[i], err))
# plot errors'''
plt.plot(train2.index,train_pred)

#plt.xticks(ticks=[i for i in range(len(errors))], labels=y_pred)
plt.xlabel('Product_Index')
plt.ylabel('Predicted_Value')
plt.show()

In [None]:
test_pred=model.predict(transformed_test)
test_errors=mean_squared_error(target, test_pred)

plt.plot(test2.index,test_pred)
#plt.xticks(ticks=[i for i in range(len(errors))], labels=y_pred)
plt.xlabel('Product_Index')
plt.ylabel('Predicted_Value')
plt.show()

In [None]:
y_pred

In [None]:
target