In [1]:
# Downloading the Dataset Stored in Google Drive
!gdown --id 1gJl34v3SzTbWTtMs0uu2BgsFpnESLezx
!gdown --id 1-7yi62S6FAMadRhNF6fvVL916lssHNSP

Downloading...
From: https://drive.google.com/uc?id=1gJl34v3SzTbWTtMs0uu2BgsFpnESLezx
To: /content/Train_For_Pipeline.csv
100% 1.09G/1.09G [00:09<00:00, 117MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1-7yi62S6FAMadRhNF6fvVL916lssHNSP
To: /content/Test_For_Pipeline.csv
100% 379M/379M [00:04<00:00, 84.7MB/s]


In [2]:
# Importing Required Libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from string import punctuation
from tqdm import tqdm
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import one_hot
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import argparse

In [3]:
!pip install simplemma==0.5.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simplemma==0.5.0
  Downloading simplemma-0.5.0-py3-none-any.whl (63.7 MB)
[K     |████████████████████████████████| 63.7 MB 1.2 MB/s 
[?25hInstalling collected packages: simplemma
Successfully installed simplemma-0.5.0


In [4]:
import nltk
import re
import string
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
# Fetch the NLTK stopword corpus; the text-cleaning step later reads
# stopwords.words('russian') from it.
nltk.download('stopwords')
# Lemmatization helpers used by the dataset preprocessing step.
from simplemma import text_lemmatizer
import simplemma
# Load simplemma's language data for 'ru' once; it is passed to
# text_lemmatizer(...) when titles and descriptions are cleaned.
langdata = simplemma.load_data('ru')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Read the two CSV files fetched by the download cell into DataFrames.
# tr_df is later passed to the hold-out evaluation, te_df to the single-row prediction.
tr_df=pd.read_csv("Train_For_Pipeline.csv")
te_df=pd.read_csv("Test_For_Pipeline.csv")

In [6]:
from tensorflow.keras import backend as kb
def rmse(y_true,y_pred):
  """Root-mean-square error between ``y_pred`` and ``y_true`` using Keras backend ops.

  Also passed to ``load_model`` as the ``'rmse'`` custom object.
  """
  residual = y_pred-y_true
  mean_squared_residual = kb.mean(kb.square(residual))
  return kb.sqrt(mean_squared_residual)

In [None]:
def final_pred(data, random_state=None):
  """Predict the deal probability for one randomly sampled row of ``data``.

  Pipeline: sample one row -> engineer date/length/punctuation features ->
  apply the pickled encoders and scalers -> clean and hash-encode the title
  and description -> run the saved Keras model.

  NOTE: a later cell in this notebook defines another function with the same
  name (a hold-out evaluation variant); whichever cell ran last wins.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame holding the raw columns read below: ``activation_date``,
      ``price``, ``title``, ``description``, ``param_1``..``param_3``,
      ``region``, ``city``, ``parent_category_name``, ``category_name``,
      ``user_type``, ``image_top_1``, ``item_seq_number``, ``average_red``,
      ``average_green``, ``average_blue`` and ``img_blur``.
  random_state : int, optional
      Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
      original behaviour (a different row on every call); pass an int to make
      the call reproducible.

  Returns
  -------
  tuple
      ``(yp, samp)``: the model output for the sampled row, and the sampled
      row itself. ``samp`` is modified in place by the pipeline, so it also
      carries the engineered/encoded columns and a log-transformed ``price``.
  """
  def load_pickle(path):
    """Unpickle one fitted transformer, closing the file handle afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load trusted artifacts.
    with open(path,"rb") as fh:
      return pickle.load(fh)

  def load_sample(df):
    """Draw a single random row as a new frame."""
    return df.sample(n=1,axis=0,random_state=random_state)

  def feature_engg(df):
    """Add date, length and punctuation features to ``df`` in place and return it."""
    df['activation_date'] = pd.to_datetime(df['activation_date'], errors = 'coerce')
    # The small offset keeps the log finite for a price of 0.
    df['price']=np.log(df['price']+0.001)
    df['day'] = df['activation_date'].dt.day
    # Fix: day_name is a method. Without the call the column held the bound
    # method object, so the isin() test below was always False and
    # is_weekend was always 0.
    # NOTE(review): confirm the training pipeline computed is_weekend the same
    # way; if it had the same slip the model only ever saw is_weekend == 0.
    df['dayofweek_name'] = df['activation_date'].dt.day_name()
    df['is_weekend'] = np.where(df['dayofweek_name'].isin(['Sunday','Saturday']),1,0)
    df['weekday'] = df['activation_date'].dt.weekday
    df['description_len'] = df['description'].apply(lambda x : len(x.split()))
    df['title_len'] = df['title'].apply(lambda x : len(x.split()))
    df['param_combined'] = df.apply(lambda row: ' '.join([str(row['param_1']), str(row['param_2']),  str(row['param_3'])]), axis=1)
    df['param_combined_len'] = df['param_combined'].apply(lambda x : len(x.split()))
    df['description_char'] = df['description'].apply(len)
    df['title_char'] = df['title'].apply(len)
    df['param_char'] = df['param_combined'].apply(len)
    # Number of characters of the description that are ASCII punctuation.
    df['punctuation_count'] = df['description'].apply(lambda x: sum(ch in punctuation for ch in x))
    return df

  def data_encoding(df):
    """Apply every pickled encoder/scaler to its source column, adding ``*_enc`` columns."""
    # (output column, source column, pickled fitted transformer)
    transforms = [
      # Categorical features
      ('region_enc','region',"/content/Region_Encoder.pkl"),
      ('city_enc','city',"/content/City_Encoder.pkl"),
      ('pcn_enc','parent_category_name',"/content/parent_category_name_Encoder.pkl"),
      ('cn_enc','category_name',"/content/category_name_Encoder.pkl"),
      ('ut_enc','user_type',"/content/user_type_Encoder.pkl"),
      ('p1_enc','param_1',"/content/param_1_Encoder.pkl"),
      ('p2_enc','param_2',"/content/param_2_Encoder.pkl"),
      ('p3_enc','param_3',"/content/param_3_Encoder.pkl"),
      # Numerical features
      ('price_enc','price',"/content/Scaled_price.pkl"),
      ('des_len_enc','description_len',"/content/Scaled_description_len.pkl"),
      ('tit_len_enc','title_len',"/content/Scaled_title_len.pkl"),
      ('pc_len_enc','param_combined_len',"/content/Scaled_param_combined_len.pkl"),
      ('des_char_enc','description_char',"/content/Scaled_description_char.pkl"),
      ('tit_char_enc','title_char',"/content/Scaled_title_char.pkl"),
      ('param_char_enc','param_char',"/content/Scaled_param_char.pkl"),
      ('punc_enc','punctuation_count',"/content/Scaled_punctuation_count.pkl"),
      ('avg_red_enc','average_red',"/content/Scaled_average_red.pkl"),
      ('avg_green_enc','average_green',"/content/Scaled_average_green.pkl"),
      ('avg_blue_enc','average_blue',"/content/Scaled_average_blue.pkl"),
      ('blur_enc','img_blur',"/content/Scaled_img_blur.pkl"),
    ]
    for out_col,src_col,path in transforms:
      df[out_col]=load_pickle(path).transform(df[src_col].values.reshape(-1,1))
    return df

  def text_preprocess(df,col):
    """Clean the text of the first row of ``df[col]``; returns a one-element list."""
    # Build the stopword set once instead of re-reading the corpus per word.
    russian_stopwords=set(stopwords.words('russian'))
    # The order of the steps is kept exactly as before (e.g. lower-casing
    # happens after stopword removal) so the text matches what the saved
    # model was presumably trained on -- TODO confirm against training code.
    review=re.sub(r'[^\w\s]'," ",df[col].values[0])
    review=re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", review)
    review = re.sub(r"http\S+", " ", review)
    review = BeautifulSoup(review, 'lxml').get_text()
    review = re.sub('_',' ',review)
    review = ' '.join(text_lemmatizer(review, langdata))
    review = ' '.join(word for word in review.split() if word not in russian_stopwords)
    # Substring test against the punctuation string, as in the original code.
    review = ' '.join(word for word in review.split() if word not in punctuation)
    review = review.lower()
    return [review]

  def text_generate(corpus,voc_size,maxleng):
    """Hash every document into integer ids and post-pad/truncate to ``maxleng``."""
    # NOTE(review): Keras documents that one_hot's default hash function is
    # Python's built-in hash(), which is not stable across runs. Unless the
    # model was trained in this same process (or with a fixed PYTHONHASHSEED),
    # the ids produced here may not match the ones seen during training.
    onehot_repr=[one_hot(words,voc_size) for words in corpus]
    return pad_sequences(onehot_repr,padding='post',maxlen=maxleng)

  def predict(df,title,desc):
    """Load the saved model and run it on the encoded features and padded texts."""
    num_feat=['price_enc', 'des_len_enc', 'tit_len_enc', 'pc_len_enc', 'des_char_enc',
       'tit_char_enc', 'param_char_enc', 'punc_enc', 'avg_red_enc',
       'avg_green_enc', 'avg_blue_enc', 'blur_enc']
    cat_feat=['day', 'is_weekend', 'weekday','image_top_1','item_seq_number','region_enc', 'city_enc', 'pcn_enc', 'cn_enc',
       'ut_enc', 'p1_enc', 'p2_enc', 'p3_enc']
    model=load_model("/content/LSTM_with_Images.hdf5",custom_objects={'rmse':rmse})
    # Input order: categorical, numerical, description, title.
    return model.predict([df[cat_feat],
            df[num_feat],
             desc,
             title])

  samp=load_sample(data)
  fe_samp=feature_engg(samp)
  enc_samp=data_encoding(fe_samp)
  title_prepro=text_preprocess(fe_samp,'title')
  des_prepro=text_preprocess(fe_samp,'description')
  ttl_pad=text_generate(title_prepro,25000,7)
  des_pad=text_generate(des_prepro,25000,250)
  yp=predict(enc_samp,ttl_pad,des_pad)
  return yp,samp

In [None]:
# Run the single-row pipeline on the test frame: ypred is the model output,
# sa is the sampled row it was computed for.
ypred,sa=final_pred(te_df)

  y = column_or_1d(y, warn=True)


In [None]:
# Show the model output for the sampled listing.
print(f"Predicted Deal Probability Value is : {ypred}")

Predicted Deal Probability Value is : [[0.2613433]]


In [37]:
def final_pred(data, n_samples=3):
  """Score the saved model on the first rows of an unshuffled hold-out split.

  Pipeline: select feature/target columns -> take the last 3% of rows as the
  hold-out set (``shuffle=False``) -> keep its first ``n_samples`` rows ->
  engineer features -> apply the pickled encoders and scalers -> clean and
  hash-encode title and description -> run the saved Keras model.

  NOTE: this definition replaces the single-row ``final_pred`` defined in an
  earlier cell of this notebook.

  Parameters
  ----------
  data : pandas.DataFrame
      Labelled frame containing the columns listed in ``select_data`` plus
      ``deal_probability``.
  n_samples : int, optional
      Number of hold-out rows to score. Defaults to 3, the value that was
      previously hard-coded.

  Returns
  -------
  tuple
      ``(ycv, ypred)``: the ``deal_probability`` values of the scored rows
      and the model output for them.
  """
  def load_pickle(path):
    """Unpickle one fitted transformer, closing the file handle afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load trusted artifacts.
    with open(path,"rb") as fh:
      return pickle.load(fh)

  def select_data(df):
    """Split ``df`` into the raw feature columns and the target column."""
    x=df[['region', 'city', 'parent_category_name',
       'category_name', 'param_1', 'param_2', 'param_3', 'title',
       'description', 'price', 'item_seq_number', 'activation_date',
       'user_type', 'image', 'image_top_1', 'width',
       'height', 'average_red', 'average_green', 'average_blue', 'image_size',
       'img_blur']]
    y=df['deal_probability']
    return x,y

  def data_split(x,y):
    """Return only the hold-out part (last 3% of rows, order preserved)."""
    _,xcv,_,ycv=train_test_split(x,y,test_size=0.03,shuffle=False)
    return xcv,ycv

  def data_sample(x,y):
    """Keep the first ``n_samples`` hold-out rows."""
    # head() returns a slice of the split frame; copy it so the feature
    # columns added later land on an independent frame instead of triggering
    # SettingWithCopyWarning. iloc makes the positional slice of y explicit.
    return x.head(n_samples).copy(),y.iloc[:n_samples]

  def feature_engg(df):
    """Add date, length and punctuation features to ``df`` in place and return it."""
    df['activation_date'] = pd.to_datetime(df['activation_date'], errors = 'coerce')
    # The small offset keeps the log finite for a price of 0.
    df['price']=np.log(df['price']+0.001)
    df['day'] = df['activation_date'].dt.day
    # Fix: day_name is a method. Without the call the column held the bound
    # method object, so the isin() test below was always False and
    # is_weekend was always 0.
    # NOTE(review): confirm the training pipeline computed is_weekend the same
    # way; if it had the same slip the model only ever saw is_weekend == 0.
    df['dayofweek_name'] = df['activation_date'].dt.day_name()
    df['is_weekend'] = np.where(df['dayofweek_name'].isin(['Sunday','Saturday']),1,0)
    df['weekday'] = df['activation_date'].dt.weekday
    df['description_len'] = df['description'].apply(lambda x : len(x.split()))
    df['title_len'] = df['title'].apply(lambda x : len(x.split()))
    df['param_combined'] = df.apply(lambda row: ' '.join([str(row['param_1']), str(row['param_2']),  str(row['param_3'])]), axis=1)
    df['param_combined_len'] = df['param_combined'].apply(lambda x : len(x.split()))
    df['description_char'] = df['description'].apply(len)
    df['title_char'] = df['title'].apply(len)
    df['param_char'] = df['param_combined'].apply(len)
    # Number of characters of the description that are ASCII punctuation.
    df['punctuation_count'] = df['description'].apply(lambda x: sum(ch in punctuation for ch in x))
    return df

  def data_encoding(df):
    """Apply every pickled encoder/scaler to its source column, adding ``*_enc`` columns."""
    # (output column, source column, pickled fitted transformer)
    transforms = [
      # Categorical features
      ('region_enc','region',"/content/Region_Encoder.pkl"),
      ('city_enc','city',"/content/City_Encoder.pkl"),
      ('pcn_enc','parent_category_name',"/content/parent_category_name_Encoder.pkl"),
      ('cn_enc','category_name',"/content/category_name_Encoder.pkl"),
      ('ut_enc','user_type',"/content/user_type_Encoder.pkl"),
      ('p1_enc','param_1',"/content/param_1_Encoder.pkl"),
      ('p2_enc','param_2',"/content/param_2_Encoder.pkl"),
      ('p3_enc','param_3',"/content/param_3_Encoder.pkl"),
      # Numerical features
      ('price_enc','price',"/content/Scaled_price.pkl"),
      ('des_len_enc','description_len',"/content/Scaled_description_len.pkl"),
      ('tit_len_enc','title_len',"/content/Scaled_title_len.pkl"),
      ('pc_len_enc','param_combined_len',"/content/Scaled_param_combined_len.pkl"),
      ('des_char_enc','description_char',"/content/Scaled_description_char.pkl"),
      ('tit_char_enc','title_char',"/content/Scaled_title_char.pkl"),
      ('param_char_enc','param_char',"/content/Scaled_param_char.pkl"),
      ('punc_enc','punctuation_count',"/content/Scaled_punctuation_count.pkl"),
      ('avg_red_enc','average_red',"/content/Scaled_average_red.pkl"),
      ('avg_green_enc','average_green',"/content/Scaled_average_green.pkl"),
      ('avg_blue_enc','average_blue',"/content/Scaled_average_blue.pkl"),
      ('blur_enc','img_blur',"/content/Scaled_img_blur.pkl"),
    ]
    for out_col,src_col,path in transforms:
      df[out_col]=load_pickle(path).transform(df[src_col].values.reshape(-1,1))
    return df

  def text_preprocess(df,col):
    """Clean every text in ``df[col]``; returns one cleaned string per row."""
    # Build the stopword set once instead of re-reading the corpus per word.
    russian_stopwords=set(stopwords.words('russian'))
    corpus=[]
    # The order of the steps is kept exactly as before (e.g. lower-casing
    # happens after stopword removal) so the text matches what the saved
    # model was presumably trained on -- TODO confirm against training code.
    for raw_text in tqdm(df[col].values):
      review=re.sub(r'[^\w\s]'," ",raw_text)
      review=re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", review)
      review = re.sub(r"http\S+", " ", review)
      review = BeautifulSoup(review, 'lxml').get_text()
      review = re.sub('_',' ',review)
      review = ' '.join(text_lemmatizer(review, langdata))
      review = ' '.join(word for word in review.split() if word not in russian_stopwords)
      # Substring test against the punctuation string, as in the original code.
      review = ' '.join(word for word in review.split() if word not in punctuation)
      corpus.append(review.lower())
    return corpus

  def text_generate(corpus,voc_size,maxleng):
    """Hash every document into integer ids and post-pad/truncate to ``maxleng``."""
    # NOTE(review): Keras documents that one_hot's default hash function is
    # Python's built-in hash(), which is not stable across runs. Unless the
    # model was trained in this same process (or with a fixed PYTHONHASHSEED),
    # the ids produced here may not match the ones seen during training.
    onehot_repr=[one_hot(words,voc_size) for words in corpus]
    return pad_sequences(onehot_repr,padding='post',maxlen=maxleng)

  def predict(xcv,ycv,title,desc):
    """Load the saved model, score ``xcv`` and return ``(ycv, ypred)``."""
    num_feat=['price_enc', 'des_len_enc', 'tit_len_enc', 'pc_len_enc', 'des_char_enc',
       'tit_char_enc', 'param_char_enc', 'punc_enc', 'avg_red_enc',
       'avg_green_enc', 'avg_blue_enc', 'blur_enc']
    cat_feat=['day', 'is_weekend', 'weekday','image_top_1','item_seq_number','region_enc', 'city_enc', 'pcn_enc', 'cn_enc',
       'ut_enc', 'p1_enc', 'p2_enc', 'p3_enc']
    model=load_model("/content/LSTM_with_Images.hdf5",custom_objects={'rmse':rmse})
    # Input order: categorical, numerical, description, title.
    ypred=model.predict([xcv[cat_feat],
            xcv[num_feat],
             desc,
             title])
    return ycv,ypred

  x,y=select_data(data)
  xcv,ycv=data_split(x,y)
  xcv,ycv=data_sample(xcv,ycv)
  xcv=feature_engg(xcv)
  xcv=data_encoding(xcv)
  title_prepro=text_preprocess(xcv,'title')
  des_prepro=text_preprocess(xcv,'description')
  ttl_pad=text_generate(title_prepro,25000,7)
  des_pad=text_generate(des_prepro,25000,250)
  ycv,ypred=predict(xcv,ycv,ttl_pad,des_pad)
  return ycv,ypred

In [38]:
# Score the hold-out rows of the training frame: yact holds the actual
# deal_probability values, ypred the model output for the same rows.
yact,ypred=final_pred(tr_df)

  y = column_or_1d(y, warn=True)
100%|██████████| 3/3 [00:00<00:00, 651.76it/s]
100%|██████████| 3/3 [00:00<00:00, 422.80it/s]


In [39]:
# Compare shapes: the targets are 1-D while the model output has a trailing axis.
print(yact.shape,ypred.shape)

(3,) (3, 1)


In [40]:
# Convert the actual targets to a NumPy array so they can be reshaped below.
ya=np.array(yact)

In [41]:
# Shape of the targets before reshaping.
ya.shape

(3,)

In [42]:
# Make the targets a column vector so they line up element-wise with ypred;
# without this, subtracting (n,) from (n, 1) would broadcast to (n, n).
ya=ya.reshape(-1,1)
ya.shape

(3, 1)

In [43]:
# RMSE between actual and predicted values; rmse returns a backend tensor,
# so the printed value is the tensor's repr.
print(rmse(ya,ypred))

tf.Tensor(0.10544068104368096, shape=(), dtype=float64)
