In [4]:
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from scipy.stats import uniform
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from wordcloud import  STOPWORDS
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords

In [5]:
# LOADING THE  TRAIN DATASET
df =  pd.read_csv(r"C:\Users\Msys\Desktop\Data Science-GSG\train.tsv",index_col=["train_id"],sep='\t')
#df.head()

In [6]:
# LOADING THE TEST DATASET
df_test  =pd.read_csv(r"C:\Users\Msys\Desktop\Data Science-GSG\test.tsv",index_col=["test_id"],sep='\t')
#df_test

# EDA

## Basic Statictics

In [7]:
#GETTING THE SHAPE OF THE DATASET
df.shape

(1482535, 7)

In [8]:
#GETTING THE BASIC INFO ABOUT THE DATASET
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482535 entries, 0 to 1482534
Data columns (total 7 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   name               1482535 non-null  object 
 1   item_condition_id  1482535 non-null  int64  
 2   category_name      1476208 non-null  object 
 3   brand_name         849853 non-null   object 
 4   price              1482535 non-null  float64
 5   shipping           1482535 non-null  int64  
 6   item_description   1482531 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 90.5+ MB


## Preprocessing

### 'price'

In [9]:
#THIS CELL CREATES NEW COLOMN WITH LOG(PRICE+1)
df["log_price"] = df.price.apply(lambda x:np.log(x+1))
#df.head()

### 'name'

In [10]:
#THIS CELL PREPROCESSES THE NAME FEATURE

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

st_words = stopwords.words('english')

def name_process(text):
    '''THIS FUNCTION IS USED TO PREPROCESS THE NAME FEATURE'''
    text = decontracted(text)
    text = re.sub("[^A-Za-z0-9 ]","",text) # REMOVE EVERYTHING EXCEPT THE PROVIDED CHARACTERS
    text = text.lower() # CONVERT TO LOWER CASE
    text =  " ".join([i for i in text.split() if i not in st_words])
    if len(text)==0:
        text = "missing"
    return text # RETURN THE OUTPUT TEXT

In [11]:
# APPLYING THE "preprocessing" FUNCTION ON THE FEAUTRE "name"
df["name_processed"] = df.name.apply(name_process)
df[df.name_processed.isnull()].name_processed ="missing" 

# APPLYING THE "preprocessing" FUNCTION ON THE FEAUTRE "name" on test data
df_test["name_processed"] = df_test.name.apply(name_process)
df_test[df_test.name_processed.isnull()].name_processed ="missing" 

### brand_name

In [12]:
'''CREATING PREPROCESSING FUNCTION FOR BRAND NAME'''
def brand_process(text):
    text = re.sub("[^A-Za-z0-9 ]","",text)# REMOVE EVERYTHING EXCEPT THE PROVIDED CHARACTERS
    text = text.lower()  # CONVERT TO LOWER CASE
    return text

In [13]:
# here we are assigning score correrponding to each brand_name
# the easiest way is to assign the score equals to number of occurences for that brand and store in the form of dict.
brand_score = dict(df[df.brand_name.notnull()]["brand_name"].apply(brand_process).value_counts())

processed_brand_name = [] #storing the barand name after preprocessing
for index,i in tqdm(df.iterrows()) : # for each row in the dataset
    
    if  pd.isnull(i.brand_name): #if the brand name isnull we follow this
        
        words = i.name_processed.split() # we will split the name for that datapoint
        score  = [] # this variable stores the score for each word that we calculated above
        for j in words: # for each word 
            if j in brand_score.keys(): #if the words in name is present in the keys of brand score dict
                score.append(brand_score[j]) # take the score from the dict and append in the score variable
            else: #if the word is not a brand name append -1
                score.append(-1)
        # once we get the scores for all the words in the name the word with maximum score woulb be the brand name
        if max(score) > 0: #if the maximum score is greater than 0 then it contains a brand name so we append the brand name
            processed_brand_name.append(words[score.index(max(score))])
        else: # if maximum value is less than 0 then it means no brand name was found so "missing" is appended
            processed_brand_name.append("missing")
            
    else: # if the brand_name is not null we follow this
        processed_brand_name.append(brand_process(i.brand_name))

1482535it [02:27, 10075.94it/s]


In [14]:
#CREATING NEW COLUMN WITH PROCESSED BRAND NAMES
df["brand_name_processed"] = processed_brand_name

In [15]:
#APPLYING PROCESSING TO TEST DATASET BRAND NAME
processed_brand_name_test = []
for index,i in tqdm(df_test.iterrows()) :
    
    if  pd.isnull(i.brand_name):
        
        words = i.name_processed.split()
        score  = []
        for j in words:
            if j in brand_score.keys():
                score.append(brand_score[j])
            else:
                score.append(-1)
        
        if max(score) > 0:
            processed_brand_name_test.append(words[score.index(max(score))])
        else:
            processed_brand_name_test.append("missing")
    else:
        processed_brand_name_test.append(brand_process(i.brand_name))

693359it [01:08, 10056.83it/s]


In [16]:
#CREATING NEW COLUMN WITH PROCESSED BRAND NAMES
df_test["brand_name_processed"] = processed_brand_name_test

### category_name

In [17]:
# PREPROCESS THE  "category_name" 

def category_name_preprocessing(text):
    #THIS FUNCTION PREPROCESSES THE TEXT IN "category_name" FEATURE
    text = re.sub("[^A-Za-z0-9/ ]","",text)# REMOVING ALL THE TEXT EXCEPT THE GIVEN CHARACTERS
    text = re.sub("s "," ",text) # REMOVING  "s" AT THE END OF THE WORD
    text = re.sub("s/","/",text) # REMOVING  "s" AT THE END OF THE WORD
    text = re.sub("  "," ",text) # REMOVING ONE SPACE WHERE TWO SPACES ARE PRESENT
    text = text.lower() # CONVERTING THE TEXT TO LOWER CASE
    return text # RETURNING THE PROCESSED TEXT

In [18]:
#THIS CELL REPLACE THE NULL VALUES WITH WORD "missing" AND PREPROCESSES THE category_name FEATURE

# HERE WE ARE REPLACING THE NULL VALUES IN "category_name" WITH WORD "missing"
df.category_name[df.category_name.isnull()] = "missing"
# HERE WE ARE PREPROCESSING THE TEXT IN "category_name"
df["category_name_preprocessed"] = df.category_name.apply(category_name_preprocessing)

In [19]:
# PREPROCESSING ON TEST DATA
df_test.category_name[df_test.category_name.isnull()] = "missing"
df_test["category_name_preprocessed"] = df_test.category_name.apply(category_name_preprocessing)

## Division of category_name
### 'Tier_1'


In [20]:
#THIS CELL SPLITS THE TOTAL CATEGORY NAME WITH "/" AND WHERE NULL IS PRESENT "missing" IS USED

# FORMING A COLUMN "Tier_1"
df["Tier_1"] = df.category_name_preprocessed.apply(lambda x:   x.split("/")[0] if len(x.split("/"))>=1 else "missing")

# PREPROCESSING ON TEST DATA
df_test["Tier_1"] = df_test.category_name_preprocessed.apply(lambda x:   x.split("/")[0] if len(x.split("/"))>=1 else "missing")


### 'Tier_2'

In [21]:
# FORMING A COLUMN "Tier_2"
df["Tier_2"] = df.category_name_preprocessed.apply(lambda x:   x.split("/")[1] if len(x.split("/"))>1 else "missing")
# PREPROCESSING ON TEST DATA
df_test["Tier_2"] = df_test.category_name_preprocessed.apply(lambda x:   x.split("/")[1] if len(x.split("/"))>1 else "missing")

### 'Tier_3'

In [22]:
# FORMING A COLUMN "Tier_3"
df["Tier_3"] = df.category_name_preprocessed.apply(lambda x:   x.split("/")[2] if len(x.split("/"))>1 else "missing")
# PREPROCESSING ON TEST DATA
df_test["Tier_3"] = df_test.category_name_preprocessed.apply(lambda x:   x.split("/")[2] if len(x.split("/"))>1 else "missing")

## 'item_description'

In [23]:
#PREPROCESSING FUNCTION FOR ITEM DESCRIPTION
def processing_item_description(text):
    '''THIS FUNCTION PREPROCESSES THE TEXT IN "item_description"'''
    text = re.sub("\[rm\] ","",str(text))
    text = decontracted(text)
    text = re.sub("[^A-Za-z0-9 ]","",str(text))
    text = str(text).lower()
    text =  " ".join([i for i in text.split() if i not in st_words])
    if len(text)==0:
        text = "missing"
    return text

In [24]:
#REPLACING THE NULL VALUVE WITH WORD "missing"
df.item_description[df.item_description.isnull()]="missing"

#HERE WE ARE PREPROCESSING THE TEXT IN FEATURE "item_description" '''
df["processed_item_description"] = df.item_description.apply(processing_item_description)

In [25]:
# PREPROCESSING FOR TEST DATA
df_test.item_description[df_test.item_description.isnull()]="missing"
df_test["processed_item_description"] = df_test.item_description.apply(processing_item_description)

### PREPROCESSING FOR WORD CLOUD

In [23]:
#THIS CELL JOINS ALL THE NAMES OF SPECIFIC ITEM CONDITON TOGETHER
# PLACEHOLDERS
id_1=""
id_2=""
id_3=""
id_4=""
id_5=""
# FOR EACH ROW IN DATASET
for index ,i in tqdm(df[["name_processed","item_condition_id"]].iterrows()):
    # JOIN THE STRIG TO SPECIFIC ITEM CONDITION ID
    if i.item_condition_id==1:
        id_1+= i.name_processed
    if i.item_condition_id==2:
        id_2+= i.name_processed
    if i.item_condition_id==3:
        id_3+= i.name_processed
    if i.item_condition_id==4:
        id_4+= i.name_processed
    if i.item_condition_id==5:
        id_5+= i.name_processed
    

1482535it [03:25, 7223.69it/s]


In [24]:
# STORING THE STOPWORDS IN A VARIABLE
stopword = stopwords.words('english')

#FORMING WORDCLOUDS
wordcloud1 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_1)
wordcloud2 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_2)
wordcloud3 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_3)
wordcloud4 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_4)
wordcloud5 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_5)

### 10. item_description

### World cloud Analysis For Item Description 

In [25]:
#JOINS ALL THE ITEM DESCRIPTION OF SPECIFIC ITEM CONDITON TOGETHER

# PLACEHOLDER 
id_1_desc=""
id_2_desc=""
id_3_desc=""
id_4_desc=""
id_5_desc=""
# FOR EACH POINTS IN THE DATASET
for index ,i in tqdm(df[["processed_item_description","item_condition_id"]].iterrows()):
    # JOINGING THE SRTING BASED ON ITEM CONDITON ID
    if i.item_condition_id==1:
        id_1_desc += i.processed_item_description
    if i.item_condition_id==2:
        id_2_desc  += i.processed_item_description
    if i.item_condition_id==3:
        id_3_desc  += i.processed_item_description
    if i.item_condition_id==4:
        id_4_desc  += i.processed_item_description
    if i.item_condition_id==5:
        id_5_desc  += i.processed_item_description
    

1482535it [13:27, 1836.02it/s]


In [26]:
#FORMING THE WORD CLOUDS

wordcloud1_desc = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_1_desc)
wordcloud2_desc = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_2_desc)
wordcloud3_desc = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_3_desc)
wordcloud4_desc = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_4_desc)
wordcloud5_desc = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_5_desc)

In [27]:
# STORING THE CSV FILE 
df.to_csv("train_processed.csv")

In [38]:
# STORING THE CSV FILE 
df_test.to_csv("test_processed.csv")

# Splitting

In [26]:
df_train,df_val = train_test_split(df,test_size=0.1,random_state = 3) 

In [27]:
print("size of df_train : ",df_train.shape)
print("size of df_val :",df_val.shape)
print("size of df_test :",df_test.shape)

size of df_train :  (1334281, 15)
size of df_val : (148254, 15)
size of df_test : (693359, 13)


### Value of Target variables

In [28]:
y_train = df_train.log_price
y_val = df_val.log_price

### 1.  item_condition_id

In [29]:
#ITEM CONDITION ID ONE HOT ENDODING
train_vec_item_con = pd.get_dummies(df_train.item_condition_id).values
val_vec_item_con = pd.get_dummies(df_val.item_condition_id).values
test_vec_item_con = pd.get_dummies(df_test.item_condition_id).values

print(train_vec_item_con.shape)
print(val_vec_item_con.shape)
print(test_vec_item_con.shape)

(1334281, 5)
(148254, 5)
(693359, 5)


In [32]:
joblib.dump(train_vec_item_con, 'item_enc.pkl')

['item_enc.pkl']

In [31]:
import joblib

### 2.  shipping

In [33]:
#SHIPPING ONE HOT ENCODING
train_vec_shipping = pd.get_dummies(df_train.shipping).values
val_vec_shipping = pd.get_dummies(df_val.shipping).values
test_vec_shipping = pd.get_dummies(df_test.shipping).values


print(train_vec_shipping.shape)
print(val_vec_shipping.shape)
print(test_vec_shipping.shape)

(1334281, 2)
(148254, 2)
(693359, 2)


In [34]:
joblib.dump(train_vec_shipping, 'shipping_enc.pkl')

['shipping_enc.pkl']

### 3.  brand_name

In [35]:
#BRAND NAME ONE HOT ENCODING
label_brand = OneHotEncoder(handle_unknown="ignore")
label_brand.fit(df_train.brand_name_processed.values.reshape(-1,1))

train_vec_brand =label_brand.transform(df_train.brand_name_processed.values.reshape(-1,1))
val_vec_brand = label_brand.transform(df_val.brand_name_processed.values.reshape(-1,1))
test_vec_brand = label_brand.transform(df_test.brand_name_processed.values.reshape(-1,1))

print(train_vec_brand.shape)
print(val_vec_brand.shape)
print(test_vec_brand.shape)

(1334281, 4702)
(148254, 4702)
(693359, 4702)


In [36]:
joblib.dump(label_brand, 'brand_enc.pkl')

['brand_enc.pkl']

### 4.  Tier_1

In [37]:
#TIER1 ONE HOT ENCODING
label_t1 = OneHotEncoder(handle_unknown='ignore')
label_t1.fit(df_train.Tier_1.values.reshape(-1,1))

train_vec_t1 = label_t1.transform(df_train.Tier_1.values.reshape(-1,1))
val_vec_t1 =   label_t1.transform(df_val.Tier_1.values.reshape(-1,1))
test_vec_t1 =   label_t1.transform(df_test.Tier_1.values.reshape(-1,1))


print(train_vec_t1.shape)
print(val_vec_t1.shape)
print(test_vec_t1.shape)

(1334281, 11)
(148254, 11)
(693359, 11)


In [38]:
joblib.dump(label_t1, 'tier_enc.pkl')

['tier_enc.pkl']

### 5.  Tier_2

In [39]:
'''TIER2 ONE HOT ENCODING'''

label_t2 = OneHotEncoder(handle_unknown='ignore')
label_t2.fit(df_train.Tier_2.values.reshape(-1,1))

train_vec_t2 =label_t2.transform(df_train.Tier_2.values.reshape(-1,1))
val_vec_t2 = label_t2.transform(df_val.Tier_2.values.reshape(-1,1))
test_vec_t2 = label_t2.transform(df_test.Tier_2.values.reshape(-1,1))

print(train_vec_t2.shape)
print(val_vec_t2.shape)
print(test_vec_t2.shape)

(1334281, 111)
(148254, 111)
(693359, 111)


In [40]:
joblib.dump(label_t2, 'tier2_enc.pkl')

['tier2_enc.pkl']

### 6. Tier_3

In [41]:
#TIER 3 ONE HOT ENDODING

label_t3 = OneHotEncoder(handle_unknown='ignore')
label_t3.fit(df_train.Tier_3.values.reshape(-1,1))

train_vec_t3 =label_t3.transform(df_train.Tier_3.values.reshape(-1,1))
val_vec_t3 = label_t3.transform(df_val.Tier_3.values.reshape(-1,1))
test_vec_t3 = label_t3.transform(df_test.Tier_3.values.reshape(-1,1))

print(train_vec_t3.shape)
print(train_vec_t3.shape)
print(test_vec_t3.shape)

(1334281, 863)
(1334281, 863)
(693359, 863)


In [42]:
joblib.dump(label_t3, 'tier3_enc.pkl')

['tier3_enc.pkl']

## Text Data - Tfidf
### 7. name_processed

In [43]:
#PROCESSED NAME TFIDF VECTORIZER

tfidf = TfidfVectorizer(ngram_range=(1,2),max_features=50000)
tfidf.fit(df_train.name_processed)

train_vec_name = tfidf.transform(df_train.name_processed)
val_vec_name = tfidf.transform(df_val.name_processed)
test_vec_name = tfidf.transform(df_test.name_processed) 


print(train_vec_name.shape)
print(val_vec_name.shape)
print(test_vec_name.shape)

(1334281, 50000)
(148254, 50000)
(693359, 50000)


In [44]:
joblib.dump(tfidf, 'name_tfidf_enc.pkl')

['name_tfidf_enc.pkl']

### 8. processed_item_description

In [45]:
'''PROCESSED ITEM DESCRIPTION TFIDF VECTORIZATION'''

tfidf_desc = TfidfVectorizer(max_features=50000,ngram_range=(1, 2))
tfidf_desc.fit(df_train.processed_item_description)

train_vec_desc = tfidf_desc.transform(df_train.processed_item_description)
val_vec_desc   = tfidf_desc.transform(df_val.processed_item_description)
test_vec_desc   = tfidf_desc.transform(df_test.processed_item_description) 

print(train_vec_desc.shape)
print(val_vec_desc.shape)
print(test_vec_desc.shape)

(1334281, 50000)
(148254, 50000)
(693359, 50000)


In [46]:
joblib.dump(tfidf_desc, 'desc_tfidf_enc.pkl')

['desc_tfidf_enc.pkl']

### 9. is_missing

In [47]:
#IS MISSIN FEATURE FOR TRAIN DATASET

df_train["is_missing"]  =  (df_train.brand_name_processed=="missing") | (df_train.name_processed =="missing")| (df_train.processed_item_description=="missing")
df_train["is_missing"]  = df_train["is_missing"].astype(int)

In [48]:
#IS MISSING FEATURE FOR VALIDATION DATASET

df_val["is_missing"]  =  (df_val.brand_name_processed=="missing") | (df_val.name_processed =="missing")| (df_val.processed_item_description=="missing")
df_val["is_missing"]  = df_val["is_missing"].astype(int)

In [49]:
#IS MISSING FEATURE FOR TEST DATASET

df_test["is_missing"]  =  (df_test.brand_name_processed=="missing") | (df_test.name_processed =="missing")| (df_test.processed_item_description=="missing")
df_test["is_missing"]  = df_test["is_missing"].astype(int)

## Concatenating

In [50]:
#STACKING ALL THE FEATURES


# STACKING TRAIN FEATURES
x_train = hstack((train_vec_item_con,train_vec_shipping,
                  train_vec_name,train_vec_brand,train_vec_t1,
                   train_vec_t2,
                    train_vec_t3,
                  df_train.is_missing.values.reshape(-1,1)
                   ,train_vec_desc))


# STACKING VALIDATION FEATURES
x_val = hstack((val_vec_item_con,val_vec_shipping
                ,val_vec_name ,val_vec_brand ,val_vec_t1 ,\
                  val_vec_t2 ,val_vec_t3 ,
                df_val.is_missing.values.reshape(-1,1)
                ,val_vec_desc))

# STACKING TEST FEATURES
x_test = hstack((test_vec_item_con,test_vec_shipping
                ,test_vec_name ,test_vec_brand ,test_vec_t1 ,\
                  test_vec_t2 ,test_vec_t3 ,
                df_test.is_missing.values.reshape(-1,1)
                ,test_vec_desc))

print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(1334281, 105695)
(148254, 105695)
(693359, 105695)


## C. Models

## Ridge

In [51]:
l2_best = Ridge(alpha=10)
l2_best.fit(x_train,y_train)

Ridge(alpha=10)

In [52]:
# TRAIN PREDICTION
train_pred = l2_best.predict(x_train)
# VALIDATION PREDICTION
val_pred = l2_best.predict(x_val)

In [53]:
# TRAIN ERROR
train_error = np.sqrt(mean_squared_error(y_train,train_pred))
print("Train Error = ",train_error)
# VALIDATION ERROR
val_error = np.sqrt(mean_squared_error(y_val,val_pred))
print("Validation Error = ",val_error)

Train Error =  0.4414232731512226
Validation Error =  0.458116634015336


## Save and Load the Model

In [54]:
import pickle
import joblib


In [55]:
filename = "model.pkl"

In [43]:
# save model
pickle.dump(l2_best, open(filename, "wb"))

In [44]:
# load model
loaded_model = pickle.load(open(filename, "rb"))

In [48]:
loaded_model.predict(x_val)

array([2.33799407, 2.80459348, 3.42413218, ..., 2.63513667, 2.98044463,
       2.40071126])

In [56]:
joblib.dump(l2_best, 'model.pkl')

['model.pkl']

In [69]:
import sys
print(sys.version)

3.10.9 | packaged by Anaconda, Inc. | (main, Mar  1 2023, 18:18:15) [MSC v.1916 64 bit (AMD64)]


In [57]:
input_data = {
                'name': "AVA-VIV Blouse",
                'item_condition_id': 1,
                'category_name': "Women/Tops & Blouses/Blouse	",
                'brand_name': "Target",
                'shipping': 1,
                'item_description': "Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!"
            }


In [66]:
l2_best.predict(x_train)

array([2.22584058, 3.27447237, 3.53139173, ..., 2.64531353, 2.90090842,
       4.89792012])