In [1]:
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from scipy.stats import uniform
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xg
from gensim.models import Word2Vec
from lightgbm import LGBMRegressor
from wordcloud import WordCloud, STOPWORDS
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords

In [2]:
# LOADING THE  TRAIN DATASET
df =  pd.read_csv(r"C:\Users\Msys\Desktop\Data Science-GSG\train.tsv",index_col=["train_id"],sep='\t')
df.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [3]:
# LOADING THE TEST DATASET
df_test  =pd.read_csv(r"C:\Users\Msys\Desktop\Data Science-GSG\test.tsv",index_col=["test_id"],sep='\t')
df_test

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,shipping,item_description
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...
...,...,...,...,...,...,...
693354,Quartz crystal on Flint stone,1,Home/Home Décor/Home Décor Accents,,0,Flint/Quartz cluster. Self mined ✨measures 3x2...
693355,It Cosmetics - Travel Bundle,1,Beauty/Makeup/Makeup Sets,IT Cosmetics,1,It Cosmetics travel bundle. Includes: Brow pow...
693356,Galaxy S8 hard shell case,1,"Electronics/Cell Phones & Accessories/Cases, C...",,1,New. Free shipping Basstop case
693357,Hi low floral kimono,2,Women/Swimwear/Cover-Ups,,0,Floral kimono. Tropical print. Open front. Hi ...


## Basic Statictics

In [4]:
#GETTING THE SHAPE OF THE DATASET
df.shape

(1482535, 7)

In [5]:
#GETTING THE BASIC INFO ABOUT THE DATASET
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482535 entries, 0 to 1482534
Data columns (total 7 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   name               1482535 non-null  object 
 1   item_condition_id  1482535 non-null  int64  
 2   category_name      1476208 non-null  object 
 3   brand_name         849853 non-null   object 
 4   price              1482535 non-null  float64
 5   shipping           1482535 non-null  int64  
 6   item_description   1482531 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 90.5+ MB


##### Observation :
* We can observe that out of all the features present category_name,"brand_name" and "item_description" contains some null values.

## Preprocessing

### 'price'

In [6]:
#THIS CELL CREATES NEW COLOMN WITH LOG(PRICE+1)
df["log_price"] = df.price.apply(lambda x:np.log(x+1))
df.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,log_price
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,2.397895
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,3.970292
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,2.397895
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,3.583519
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,3.806662


### 'name'

For text features, there are many techniques that are used for data preprocessing. Here we have done very simply preprocessing which is decontraction, removing special characters, stopword removal, and lowering the text.

In [7]:
#THIS CELL PREPROCESSES THE NAME FEATURE

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

st_words = stopwords.words('english')

def name_process(text):
    #THIS FUNCTION IS USED TO PREPROCESS THE NAME FEATURE
    text = decontracted(text)
    text = re.sub("[^A-Za-z0-9 ]","",text) # REMOVE EVERYTHING EXCEPT THE PROVIDED CHARACTERS
    text = text.lower() # CONVERT TO LOWER CASE
    text =  " ".join([i for i in text.split() if i not in st_words])
    if len(text)==0:
        text = "missing"
    return text # RETURN THE OUTPUT TEXT

In [8]:
# APPLYING THE "preprocessing" FUNCTION ON THE FEAUTRE "name"
df["name_processed"] = df.name.apply(name_process)
df[df.name_processed.isnull()].name_processed ="missing" 

# APPLYING THE "preprocessing" FUNCTION ON THE FEAUTRE "name" on test data
df_test["name_processed"] = df_test.name.apply(name_process)
df_test[df_test.name_processed.isnull()].name_processed ="missing" 

### brand_name

In [9]:
#CREATING PREPROCESSING FUNCTION FOR BRAND NAME
def brand_process(text):
    text = re.sub("[^A-Za-z0-9 ]","",text)# REMOVE EVERYTHING EXCEPT THE PROVIDED CHARACTERS
    text = text.lower()  # CONVERT TO LOWER CASE
    return text

In [10]:
# here we are assigning score correrponding to each brand_name
# the easiest way is to assign the score equals to number of occurences for that brand and store in the form of dict.
brand_score = dict(df[df.brand_name.notnull()]["brand_name"].apply(brand_process).value_counts())

processed_brand_name = [] #storing the barand name after preprocessing
for index,i in tqdm(df.iterrows()) : # for each row in the dataset
    
    if  pd.isnull(i.brand_name): #if the brand name isnull we follow this
        
        words = i.name_processed.split() # we will split the name for that datapoint
        score  = [] # this variable stores the score for each word that we calculated above
        for j in words: # for each word 
            if j in brand_score.keys(): #if the words in name is present in the keys of brand score dict
                score.append(brand_score[j]) # take the score from the dict and append in the score variable
            else: #if the word is not a brand name append -1
                score.append(-1)
        # once we get the scores for all the words in the name the word with maximum score woulb be the brand name
        if max(score) > 0: #if the maximum score is greater than 0 then it contains a brand name so we append the brand name
            processed_brand_name.append(words[score.index(max(score))])
        else: # if maximum value is less than 0 then it means no brand name was found so "missing" is appended
            processed_brand_name.append("missing")
            
    else: # if the brand_name is not null we follow this
        processed_brand_name.append(brand_process(i.brand_name))

1482535it [04:44, 5210.72it/s] 


In [11]:
#CREATING NEW COLUMN WITH PROCESSED BRAND NAMES
df["brand_name_processed"] = processed_brand_name

In [12]:
#APPLYING PROCESSING TO TEST DATASET BRAND NAME
processed_brand_name_test = []
for index,i in tqdm(df_test.iterrows()) :
    
    if  pd.isnull(i.brand_name):
        
        words = i.name_processed.split()
        score  = []
        for j in words:
            if j in brand_score.keys():
                score.append(brand_score[j])
            else:
                score.append(-1)
        
        if max(score) > 0:
            processed_brand_name_test.append(words[score.index(max(score))])
        else:
            processed_brand_name_test.append("missing")
    else:
        processed_brand_name_test.append(brand_process(i.brand_name))

693359it [02:27, 4702.71it/s]


In [13]:
#CREATING NEW COLUMN WITH PROCESSED BRAND NAMES
df_test["brand_name_processed"] = processed_brand_name_test

### category_name

In [14]:
# PREPROCESS THE  "category_name" 

def category_name_preprocessing(text):
    #THIS FUNCTION PREPROCESSES THE TEXT IN "category_name" FEATURE
    text = re.sub("[^A-Za-z0-9/ ]","",text)# REMOVING ALL THE TEXT EXCEPT THE GIVEN CHARACTERS
    text = re.sub("s "," ",text) # REMOVING  "s" AT THE END OF THE WORD
    text = re.sub("s/","/",text) # REMOVING  "s" AT THE END OF THE WORD
    text = re.sub("  "," ",text) # REMOVING ONE SPACE WHERE TWO SPACES ARE PRESENT
    text = text.lower() # CONVERTING THE TEXT TO LOWER CASE
    return text # RETURNING THE PROCESSED TEXT

In [15]:
#THIS CELL REPLACE THE NULL VALUES WITH WORD "missing" AND PREPROCESSES THE category_name FEATURE

# HERE WE ARE REPLACING THE NULL VALUES IN "category_name" WITH WORD "missing"
df.category_name[df.category_name.isnull()] = "missing"
# HERE WE ARE PREPROCESSING THE TEXT IN "category_name"
df["category_name_preprocessed"] = df.category_name.apply(category_name_preprocessing)

In [16]:
# PREPROCESSING ON TEST DATA
df_test.category_name[df_test.category_name.isnull()] = "missing"
df_test["category_name_preprocessed"] = df_test.category_name.apply(category_name_preprocessing)

## Division of category_name
### 'Tier_1'


In [17]:
#THIS CELL SPLITS THE TOTAL CATEGORY NAME WITH "/" AND WHERE NULL IS PRESENT "missing" IS USED

# FORMING A COLUMN "Tier_1"
df["Tier_1"] = df.category_name_preprocessed.apply(lambda x:   x.split("/")[0] if len(x.split("/"))>=1 else "missing")

# PREPROCESSING ON TEST DATA
df_test["Tier_1"] = df_test.category_name_preprocessed.apply(lambda x:   x.split("/")[0] if len(x.split("/"))>=1 else "missing")


### 'Tier_2'

In [18]:
# FORMING A COLUMN "Tier_2"
df["Tier_2"] = df.category_name_preprocessed.apply(lambda x:   x.split("/")[1] if len(x.split("/"))>1 else "missing")
# PREPROCESSING ON TEST DATA
df_test["Tier_2"] = df_test.category_name_preprocessed.apply(lambda x:   x.split("/")[1] if len(x.split("/"))>1 else "missing")

### 'Tier_3'

In [19]:
# FORMING A COLUMN "Tier_3"
df["Tier_3"] = df.category_name_preprocessed.apply(lambda x:   x.split("/")[2] if len(x.split("/"))>1 else "missing")
# PREPROCESSING ON TEST DATA
df_test["Tier_3"] = df_test.category_name_preprocessed.apply(lambda x:   x.split("/")[2] if len(x.split("/"))>1 else "missing")

## 'item_description'

In [20]:
#PREPROCESSING FUNCTION FOR ITEM DESCRIPTION
def processing_item_description(text):
    '''THIS FUNCTION PREPROCESSES THE TEXT IN "item_description"'''
    text = re.sub("\[rm\] ","",str(text))
    text = decontracted(text)
    text = re.sub("[^A-Za-z0-9 ]","",str(text))
    text = str(text).lower()
    text =  " ".join([i for i in text.split() if i not in st_words])
    if len(text)==0:
        text = "missing"
    return text

In [21]:
#REPLACING THE NULL VALUVE WITH WORD "missing"
df.item_description[df.item_description.isnull()]="missing"

#HERE WE ARE PREPROCESSING THE TEXT IN FEATURE "item_description" '''
df["processed_item_description"] = df.item_description.apply(processing_item_description)

In [22]:
# PREPROCESSING FOR TEST DATA
df_test.item_description[df_test.item_description.isnull()]="missing"
df_test["processed_item_description"] = df_test.item_description.apply(processing_item_description)

### PREPROCESSING FOR WORD CLOUD

In [30]:
#THIS CELL JOINS ALL THE NAMES OF SPECIFIC ITEM CONDITON TOGETHER
# PLACEHOLDERS
id_1=""
id_2=""
id_3=""
id_4=""
id_5=""
# FOR EACH ROW IN DATASET
for index ,i in tqdm(df[["name_processed","item_condition_id"]].iterrows()):
    # JOIN THE STRIG TO SPECIFIC ITEM CONDITION ID
    if i.item_condition_id==1:
        id_1+= i.name_processed
    if i.item_condition_id==2:
        id_2+= i.name_processed
    if i.item_condition_id==3:
        id_3+= i.name_processed
    if i.item_condition_id==4:
        id_4+= i.name_processed
    if i.item_condition_id==5:
        id_5+= i.name_processed
    

1482535it [06:43, 3674.46it/s]


In [31]:
# STORING THE STOPWORDS IN A VARIABLE
stopword = stopwords.words('english')

#FORMING WORDCLOUDS
wordcloud1 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_1)
wordcloud2 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_2)
wordcloud3 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_3)
wordcloud4 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_4)
wordcloud5 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_5)

### 10. item_description

### World cloud Analysis For Item Description 

In [33]:
#JOINS ALL THE ITEM DESCRIPTION OF SPECIFIC ITEM CONDITON TOGETHER

# PLACEHOLDER 
id_1_desc=""
id_2_desc=""
id_3_desc=""
id_4_desc=""
id_5_desc=""
# FOR EACH POINTS IN THE DATASET
for index ,i in tqdm(df[["processed_item_description","item_condition_id"]].iterrows()):
    # JOINGING THE SRTING BASED ON ITEM CONDITON ID
    if i.item_condition_id==1:
        id_1_desc += i.processed_item_description
    if i.item_condition_id==2:
        id_2_desc  += i.processed_item_description
    if i.item_condition_id==3:
        id_3_desc  += i.processed_item_description
    if i.item_condition_id==4:
        id_4_desc  += i.processed_item_description
    if i.item_condition_id==5:
        id_5_desc  += i.processed_item_description
    

1482535it [19:58, 1236.90it/s]


In [35]:
#FORMING THE WORD CLOUDS

wordcloud1_desc = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_1_desc)
wordcloud2_desc = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_2_desc)
wordcloud3_desc = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_3_desc)
wordcloud4_desc = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_4_desc)
wordcloud5_desc = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopword,
                min_font_size = 10).generate(id_5_desc)

# Splitting

In [23]:
df_train,df_val = train_test_split(df,test_size=0.1,random_state = 3) 

In [24]:
print("size of df_train : ",df_train.shape)
print("size of df_val :",df_val.shape)
print("size of df_test :",df_test.shape)

size of df_train :  (1334281, 15)
size of df_val : (148254, 15)
size of df_test : (693359, 13)


### Value of Target variables

In [25]:
y_train = df_train.log_price
y_val = df_val.log_price

#### For all the Categorical Data which includes shipping, item condition id, Tier1, Tier2, Tier3 are One hot Encoded

### 1.  item_condition_id

In [26]:
#ITEM CONDITION ID ONE HOT ENDODING
train_vec_item_con = pd.get_dummies(df_train.item_condition_id).values
val_vec_item_con = pd.get_dummies(df_val.item_condition_id).values
test_vec_item_con = pd.get_dummies(df_test.item_condition_id).values

print(train_vec_item_con.shape)
print(val_vec_item_con.shape)
print(test_vec_item_con.shape)

(1334281, 5)
(148254, 5)
(693359, 5)


### 2.  shipping

In [27]:
#SHIPPING ONE HOT ENCODING
train_vec_shipping = pd.get_dummies(df_train.shipping).values
val_vec_shipping = pd.get_dummies(df_val.shipping).values
test_vec_shipping = pd.get_dummies(df_test.shipping).values


print(train_vec_shipping.shape)
print(val_vec_shipping.shape)
print(test_vec_shipping.shape)

(1334281, 2)
(148254, 2)
(693359, 2)


### 3.  brand_name

In [28]:
#BRAND NAME ONE HOT ENCODING
label_brand = OneHotEncoder(handle_unknown="ignore")
label_brand.fit(df_train.brand_name_processed.values.reshape(-1,1))

train_vec_brand =label_brand.transform(df_train.brand_name_processed.values.reshape(-1,1))
val_vec_brand = label_brand.transform(df_val.brand_name_processed.values.reshape(-1,1))
test_vec_brand = label_brand.transform(df_test.brand_name_processed.values.reshape(-1,1))

print(train_vec_brand.shape)
print(val_vec_brand.shape)
print(test_vec_brand.shape)

(1334281, 4702)
(148254, 4702)
(693359, 4702)


### 4.  Tier_1

In [29]:
#TIER 1 ONE HOT ENCODING
label_t1 = OneHotEncoder(handle_unknown='ignore')
label_t1.fit(df_train.Tier_1.values.reshape(-1,1))

train_vec_t1 = label_t1.transform(df_train.Tier_1.values.reshape(-1,1))
val_vec_t1 =   label_t1.transform(df_val.Tier_1.values.reshape(-1,1))
test_vec_t1 =   label_t1.transform(df_test.Tier_1.values.reshape(-1,1))


print(train_vec_t1.shape)
print(val_vec_t1.shape)
print(test_vec_t1.shape)

(1334281, 11)
(148254, 11)
(693359, 11)


### 5.  Tier_2

In [30]:
#TIER 2 ONE HOT ENCODING

label_t2 = OneHotEncoder(handle_unknown='ignore')
label_t2.fit(df_train.Tier_2.values.reshape(-1,1))

train_vec_t2 =label_t2.transform(df_train.Tier_2.values.reshape(-1,1))
val_vec_t2 = label_t2.transform(df_val.Tier_2.values.reshape(-1,1))
test_vec_t2 = label_t2.transform(df_test.Tier_2.values.reshape(-1,1))

print(train_vec_t2.shape)
print(val_vec_t2.shape)
print(test_vec_t2.shape)

(1334281, 111)
(148254, 111)
(693359, 111)


### 6. Tier_3

In [31]:
#TIER 3 ONE HOT ENDODING

label_t3 = OneHotEncoder(handle_unknown='ignore')
label_t3.fit(df_train.Tier_3.values.reshape(-1,1))

train_vec_t3 =label_t3.transform(df_train.Tier_3.values.reshape(-1,1))
val_vec_t3 = label_t3.transform(df_val.Tier_3.values.reshape(-1,1))
test_vec_t3 = label_t3.transform(df_test.Tier_3.values.reshape(-1,1))

print(train_vec_t3.shape)
print(train_vec_t3.shape)
print(test_vec_t3.shape)

(1334281, 863)
(1334281, 863)
(693359, 863)


## Text Data - Tfidf
\
Tfidf states “Term Frequency -Inverse Document Frequency”. It has two parts first part is “Term Frequency” which is a simple ratio of a word to the total number of words in the sentence which gives more value to more occurring words in a sentence. “Inverse Document Frequency” is the second part which is the ratio of total documents and the number of documents in which the wors occurs. IDF gives higher value to the rarer words in the documents. 
### 7. name_processed

In [33]:
#PROCESSED NAME TFIDF VECTORIZER

tfidf = TfidfVectorizer(ngram_range=(1,2),max_features=50000)
tfidf.fit(df_train.name_processed)

train_vec_name = tfidf.transform(df_train.name_processed)
val_vec_name = tfidf.transform(df_val.name_processed)
test_vec_name = tfidf.transform(df_test.name_processed)


print(train_vec_name.shape)
print(val_vec_name.shape)
print(test_vec_name.shape)

(1334281, 50000)
(148254, 50000)
(693359, 50000)


### 8. processed_item_description

In [34]:
# PROCESSED ITEM DESCRIPTION TFIDF VECTORIZATION

tfidf_desc = TfidfVectorizer(max_features=50000,ngram_range=(1, 2))
tfidf_desc.fit(df_train.processed_item_description)

train_vec_desc = tfidf_desc.transform(df_train.processed_item_description)
val_vec_desc   = tfidf_desc.transform(df_val.processed_item_description)
test_vec_desc   = tfidf_desc.transform(df_test.processed_item_description) 

print(train_vec_desc.shape)
print(val_vec_desc.shape)
print(test_vec_desc.shape)

(1334281, 50000)
(148254, 50000)
(693359, 50000)


### 9. is_missing
This feature gives 1 if a value is missing in brand name or name or item description otherwise it is 0.

In [35]:
#IS MISSIN FEATURE FOR TRAIN DATASET

df_train["is_missing"]  =  (df_train.brand_name_processed=="missing") | (df_train.name_processed =="missing")| (df_train.processed_item_description=="missing")
df_train["is_missing"]  = df_train["is_missing"].astype(int)

In [36]:
#IS MISSING FEATURE FOR VALIDATION DATASET

df_val["is_missing"]  =  (df_val.brand_name_processed=="missing") | (df_val.name_processed =="missing")| (df_val.processed_item_description=="missing")
df_val["is_missing"]  = df_val["is_missing"].astype(int)

In [37]:
#IS MISSING FEATURE FOR TEST DATASET

df_test["is_missing"]  =  (df_test.brand_name_processed=="missing") | (df_test.name_processed =="missing")| (df_test.processed_item_description=="missing")
df_test["is_missing"]  = df_test["is_missing"].astype(int)

## Concatenating

In [38]:
#STACKING ALL THE FEATURES


# STACKING TRAIN FEATURES
x_train = hstack((train_vec_item_con,train_vec_shipping,
                  train_vec_name,train_vec_brand,train_vec_t1,
                   train_vec_t2,
                    train_vec_t3,
                  df_train.is_missing.values.reshape(-1,1)
                   ,train_vec_desc))


# STACKING VALIDATION FEATURES
x_val = hstack((val_vec_item_con,val_vec_shipping
                ,val_vec_name ,val_vec_brand ,val_vec_t1 ,\
                  val_vec_t2 ,val_vec_t3 ,
                df_val.is_missing.values.reshape(-1,1)
                ,val_vec_desc))

# STACKING TEST FEATURES
x_test = hstack((test_vec_item_con,test_vec_shipping
                ,test_vec_name ,test_vec_brand ,test_vec_t1 ,\
                  test_vec_t2 ,test_vec_t3 ,
                df_test.is_missing.values.reshape(-1,1)
                ,test_vec_desc))

print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(1334281, 105695)
(148254, 105695)
(693359, 105695)


## C. Models

### Linear Regression

In [39]:
#TRAINING LINEAR REGERSSION
LR = LinearRegression()
LR.fit(x_train,y_train)

In [40]:
# TRAIN PREDICTION
train_pred_lr = LR.predict(x_train)
# VALIDATION PREDICTION
val_pred_lr = LR.predict(x_val)

In [45]:
# TRAIN ERROR
train_rmse = np.sqrt(mean_squared_error(y_train,train_pred_lr))
print("Train RMSE = ",train_rmse)
# TEST ERROR
val_rmse = np.sqrt(mean_squared_error(y_val,val_pred_lr))
print("Validation RMSE = ",val_rmse)

Train RMSE =  0.4241131941562906
Validation RMSE =  0.46178733658920706


## Ridge

In [44]:
l2_best = Ridge(alpha=10)
l2_best.fit(x_train,y_train)

In [46]:
# TRAIN PREDICTION
train_pred_rdg = l2_best.predict(x_train)
# VALIDATION PREDICTION
val_pred_rdg = l2_best.predict(x_val)

In [47]:
# TRAIN ERROR
train_rmse_rdg = np.sqrt(mean_squared_error(y_train,train_pred_rdg))
print("Train Error = ",train_rmse_rdg)
# VALIDATION ERROR
val_rmse_rdg = np.sqrt(mean_squared_error(y_val,val_pred_rdg))
print("Validation Error = ",val_rmse_rdg)

Train Error =  0.44111429205313984
Validation Error =  0.45793584188055164


## Lasso

In [51]:
from sklearn.linear_model import Lasso

In [52]:
# Create a Lasso regression model
lasso_model = Lasso(alpha=1.0)  

In [54]:
# Train the model on the training data
lasso_model.fit(x_train, y_train)

In [55]:
# TRAIN PREDICTION
train_pred_lasso = lasso_model.predict(x_train)
# VALIDATION PREDICTION
val_pred_lasso = lasso_model.predict(x_val)

In [56]:
# TRAIN ERROR
train_rmse_lasso = np.sqrt(mean_squared_error(y_train,train_pred_lasso))
print("Train Error = ",train_rmse_lasso)
# VALIDATION ERROR
val_rmse_lasso = np.sqrt(mean_squared_error(y_val,val_pred_lasso))
print("Validation Error = ",val_rmse_lasso)

Train Error =  0.7493290724323619
Validation Error =  0.7481290183704847


## Decision Tree

In [57]:
from sklearn.tree import DecisionTreeRegressor

In [58]:
tree_model = DecisionTreeRegressor(max_depth=3)

In [59]:
# Train the model on the training data
tree_model.fit(x_train, y_train)

In [60]:
# TRAIN PREDICTION
train_pred_tree = tree_model.predict(x_train)
# VALIDATION PREDICTION
val_pred_tree = tree_model.predict(x_val)

In [61]:
# TRAIN ERROR
train_rmse_tree = np.sqrt(mean_squared_error(y_train,train_pred_tree))
print("Train Error = ",train_rmse_tree)
# VALIDATION ERROR
val_rmse_tree = np.sqrt(mean_squared_error(y_val,val_pred_tree))
print("Validation Error = ",val_rmse_tree)

Train Error =  0.7031424790439699
Validation Error =  0.7014345144027383


## Summary of all Models:

In [3]:
from prettytable import PrettyTable

In [4]:
table = PrettyTable(["Model", "Train RMSE", "CV RMSE"])
table.add_row(["Linear Regression",  "0.4241", "0.4617"])
table.add_row(["Ridge", "0.4411", "0.4579"])
table.add_row(["Lasso", "0.7493", "0.7481"])
table.add_row(["Decision Tree ","0.7031", "0.7014"])

print(table)

+-------------------+------------+---------+
|       Model       | Train RMSE | CV RMSE |
+-------------------+------------+---------+
| Linear Regression |   0.4241   |  0.4617 |
|       Ridge       |   0.4411   |  0.4579 |
|       Lasso       |   0.7493   |  0.7481 |
|   Decision Tree   |   0.7031   |  0.7014 |
+-------------------+------------+---------+


- As the main constraint of the given problem statement is to reduce rmsle metric. After training different ML models on the data we find a least RMSLE of 0.4579 on cv data.(Ridge Model)