In [1]:
import nltk
import numpy as np
import pandas as pd
import Levenshtein
from scipy import spatial
import pyspark.sql.functions as F

from pyspark.sql import SparkSession, DataFrame
from gensim import utils, corpora, models, similarities

In [None]:
# English Snowball stemmer, used by the get_stem UDF to normalise words.
stemmer = nltk.stem.SnowballStemmer("english")

# Local Spark session using 8 threads. getOrCreate() hands back a session that
# is already running in this kernel instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("DSSM_features")
    .master("local[8]")
    .getOrCreate()
)

In [None]:
@F.udf
def get_stem(row: str) -> str:
    """Spark UDF: stem every whitespace-separated word of ``row``.

    Args:
        row: Text of one cell; SQL NULL arrives here as ``None``.

    Returns:
        The stemmed words joined by single spaces, or ``None`` when the input
        is NULL so that missing text stays missing instead of failing the job
        with ``AttributeError`` on ``None.split()``.
    """
    if row is None:
        return None
    return " ".join(stemmer.stem(word) for word in row.split())

@F.udf(returnType="double")
def get_dist(col1: str, col2: str) -> float:
    """Spark UDF: Levenshtein similarity ratio between two strings.

    The UDF is registered with an explicit ``double`` return type. A bare
    ``@F.udf`` defaults to a string return type, which turned the numeric
    ratio into text in the resulting column.

    Args:
        col1: First string (SQL NULL arrives as ``None``).
        col2: Second string (SQL NULL arrives as ``None``).

    Returns:
        ``Levenshtein.ratio(col1, col2)`` as a float. When either side is
        NULL the ratio cannot be computed and 0.0 is returned, the same
        fallback ``w2v_similarity`` uses for rows it cannot score.
    """
    if col1 is None or col2 is None:
        return 0.0
    return float(Levenshtein.ratio(col1, col2))

In [None]:
# Build the full working set in one frame:
#   1. train rows,
#   2. test rows, given a constant relevance of 0 so both inputs have the same
#      number of columns (DataFrame.union matches columns by position),
#   3. each product's description, attached via product_uid.
df: DataFrame = (
    spark.read.option("header", True).csv("data/train.csv")
    .union(
        spark.read.option("header", True).csv("data/test.csv")
        .withColumn("relevance", F.lit(0))
    )
    .join(
        spark.read.option("header", True).csv("data/product_descriptions.csv"),
        on="product_uid",
    )
)

# Normalise each free-text column the same way: lower-case it, then run it
# through the get_stem UDF. withColumn with an existing name replaces that
# column in place, so the schema keeps its shape.
terms: DataFrame = df
for _text_col in ("search_term", "product_title", "product_description"):
    terms = terms.withColumn(_text_col, get_stem(F.lower(F.col(_text_col))))

# Edit-distance features between the query and each product text, plus `con`:
# title and description glued together with "." separators. `con` is the text
# corpus collected to the driver for the gensim models.
dist: DataFrame = (
    terms
    .withColumn(
        "title_dist",
        get_dist(terms["search_term"], terms["product_title"]),
    )
    .withColumn(
        "desc_dist",
        get_dist(terms["search_term"], terms["product_description"]),
    )
    .withColumn(
        "con",
        F.concat(
            terms["product_title"],
            F.lit("."),
            terms["product_description"],
            F.lit("."),
        ),
    )
)

In [None]:
# Bring the `con` strings onto the driver as a plain Python list; every Row of
# the single-column select unpacks to a 1-tuple.
lexems: list = [con for (con,) in dist.select("con").collect()]

# Token -> id dictionary over the tokenised corpus.
# NOTE: the name `dict` shadows the builtin; it is kept because other cells
# look the dictionary up under exactly this name.
dict: corpora.Dictionary = corpora.Dictionary(
    map(lambda text: list(utils.tokenize(text)), lexems)
)

In [None]:
def get_term():
    """Lazily yield the bag-of-words vector of every document in ``lexems``.

    Each document is tokenised with ``utils.tokenize`` and converted with the
    module-level dictionary's ``doc2bow``. The generator is single-pass: call
    ``get_term()`` again for a fresh iteration.
    """
    token_lists = (list(utils.tokenize(text)) for text in lexems)
    yield from map(dict.doc2bow, token_lists)

# Fit the TF-IDF model on the bag-of-words corpus; the generator returned by
# get_term() is consumed while the model is constructed.
tfidf = models.TfidfModel(corpus=get_term())

In [None]:
def tf(row: str) -> list:
    """Convert raw text into its TF-IDF weighted sparse vector.

    Args:
        row: Text to vectorise.

    Returns:
        Whatever ``tfidf[...]`` yields for the bag-of-words of ``row``. For a
        single document gensim returns a list of ``(token_id, weight)`` pairs,
        not a string as the previous annotation claimed.
    """
    bow = dict.doc2bow(list(utils.tokenize(row)))
    return tfidf[bow]

@F.udf(returnType="double")
def similarity(col1: str, col2: str) -> float:
    """Spark UDF: TF-IDF cosine similarity between two texts.

    ``col1`` is indexed as a one-document ``MatrixSimilarity`` and queried
    with ``col2``. The UDF is registered with an explicit ``double`` return
    type; a bare ``@F.udf`` defaults to a string return type, which
    contradicted the ``-> float`` contract.

    Args:
        col1: Text that becomes the indexed document (NULL arrives as None).
        col2: Text used as the query (NULL arrives as None).

    Returns:
        The similarity as a float. 0.0 is returned when either text is NULL
        or has an empty TF-IDF vector (no known tokens): an empty vector has
        no direction to compare, and this matches the 0.0 fallback used by
        ``w2v_similarity``.
    """
    if col1 is None or col2 is None:
        return 0.0
    doc_vec, query_vec = tf(col1), tf(col2)
    if not doc_vec or not query_vec:
        # Also avoids indexing [0] into the result of an empty query.
        return 0.0
    index = similarities.MatrixSimilarity([doc_vec], num_features=len(dict))
    return float(index[query_vec][0])

In [None]:
# TF-IDF similarity of the search term against the title and the description.
simil: DataFrame = (
    dist
    .withColumn(
        "sim_title",
        similarity(dist["search_term"], dist["product_title"]),
    )
    .withColumn(
        "sim_desc",
        similarity(dist["search_term"], dist["product_description"]),
    )
)

In [None]:
# Sentence splitter loaded from NLTK's English Punkt resource.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

# Flatten the corpus into one list of sentences, then split each sentence into
# word tokens. The result (a list of token lists) is the Word2Vec training set.
sentences = [sentence for row in lexems for sentence in tokenizer.tokenize(row)]
w2v = list(map(nltk.tokenize.word_tokenize, sentences))

In [None]:
# Train word embeddings on the tokenised sentences: 128-dimensional vectors,
# context window of 5, words occurring fewer than 5 times are ignored,
# 4 training threads.
model = models.Word2Vec(
    sentences=w2v,
    vector_size=128,
    window=5,
    min_count=5,
    workers=4,
)

In [None]:
def gv(row: str) -> np.ndarray:
    """Return the mean Word2Vec embedding of the known words in ``row``.

    Words the model has no vector for are skipped. Previously a single
    unknown word raised ``KeyError`` and discarded the whole row, and an
    empty row divided by zero and produced a vector of NaNs.

    Args:
        row: Text to embed; tokenised with ``nltk.tokenize.word_tokenize``.

    Returns:
        A float64 array of length ``model.wv.vector_size`` holding the average
        vector of the in-vocabulary words.

    Raises:
        KeyError: If ``row`` contains no in-vocabulary word at all. This is
            the same exception type a failed vocabulary lookup raised before,
            so callers that already treat it as "cannot embed" keep working.
    """
    vectors = model.wv
    # Size comes from the model rather than a repeated literal, so changing
    # the embedding dimension in one place cannot desynchronise the two.
    total = np.zeros(vectors.vector_size)
    count = 0
    for word in nltk.tokenize.word_tokenize(row):
        if word in vectors:
            total += vectors[word]
            count += 1
    if count == 0:
        raise KeyError(f"no in-vocabulary words in {row!r}")
    return total / count


@F.udf(returnType="double")
def w2v_similarity(col1: str, col2: str) -> float:
    """Spark UDF: cosine similarity between the mean embeddings of two texts.

    The UDF is registered with an explicit ``double`` return type; a bare
    ``@F.udf`` defaults to a string return type, which contradicted the
    ``-> float`` contract.

    Args:
        col1: First text (SQL NULL arrives as ``None``).
        col2: Second text (SQL NULL arrives as ``None``).

    Returns:
        ``1 - cosine_distance(gv(col1), gv(col2))`` as a float, or 0.0 when
        the score cannot be computed (embedding a text failed, or the result
        is NaN).
    """
    try:
        score = float(1 - spatial.distance.cosine(gv(col1), gv(col2)))
    except Exception:
        # Deliberate best-effort: a row that cannot be embedded (e.g. NULL
        # text or a vocabulary lookup failure) scores 0.0 instead of failing
        # the whole Spark job. `Exception` rather than a bare `except` so
        # KeyboardInterrupt / SystemExit still propagate.
        return 0.0
    # A NaN score is reported with the same 0.0 fallback so the feature
    # column never carries NaN.
    return 0.0 if np.isnan(score) else score

In [None]:
# Final feature frame: add the Word2Vec similarities and drop the raw text
# columns, keeping only ids, the label and the engineered features.
# `w2c_df` now really is a DataFrame. Previously it was bound to the result of
# DataFrameWriter.csv(), which returns None.
w2c_df: DataFrame = (simil
                     .withColumn("w2v_title", w2v_similarity(F.col("search_term"), F.col("product_title")))
                     .withColumn("w2v_desc", w2v_similarity(F.col("search_term"), F.col("product_description")))
                     .drop("search_term", "product_title", "product_description", "con"))

# Spark writes a *directory* named data/features containing a part-*.csv file
# (a single one because of coalesce(1)). This output is unchanged.
w2c_df.coalesce(1).write.mode("overwrite").csv("data/features", header=True)

# A later cell loads the features with pandas from the single file
# data/features.csv, which the Spark writer alone never creates. Materialise
# it from the directory just written; reading the saved output back avoids
# evaluating the expensive UDFs a second time.
(spark
 .read
 .csv("data/features", header=True)
 .toPandas()
 .to_csv("data/features.csv", index=False))

In [110]:
# Engineered features, expected as the single file data/features.csv
# (presumably the export of the Spark feature stage; verify the path).
features: pd.DataFrame = pd.read_csv("data/features.csv", encoding="ISO-8859-1")

# Raw test/train files with the text, product id and (for train) label columns
# removed. Whatever columns remain are what the merge on "id" below uses.
df_test: pd.DataFrame = (
    pd.read_csv("data/test.csv", encoding="ISO-8859-1")
    .drop(columns=["search_term", "product_title", "product_uid"])
)
df_train: pd.DataFrame = (
    pd.read_csv("data/train.csv", encoding="ISO-8859-1")
    .drop(columns=["search_term", "product_title", "product_uid", "relevance"])
)

In [113]:
# Split the feature table back into its train and test rows by inner-joining
# on the row id.
train = features.merge(df_train, on="id")
test = features.merge(df_test, on="id")

# Ids are kept aside for the submission file; the target comes from the
# `relevance` column of the training rows.
test_ids = test["id"]
y_train = train["relevance"].to_numpy()

# Model inputs: every remaining column except the row id and the target.
X_train = train.drop(columns=["id", "relevance"]).to_numpy()
X_test = test.drop(columns=["id", "relevance"]).to_numpy()

In [120]:
# NOTE(review): RandomForestRegressor is not imported in any cell visible in
# this notebook (presumably sklearn.ensemble.RandomForestRegressor). On a
# fresh kernel this cell raises NameError until that import is added to the
# import cell.
#
# random_state pins the forest's bootstrap/feature sampling so that
# Restart & Run All reproduces the same predictions; without it every run
# produced a different submission.
rf = RandomForestRegressor(n_estimators=30, max_depth=6, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# One prediction per test row, keyed by the row id kept aside earlier.
result = pd.DataFrame({"id": test_ids, "relevance": y_pred})

In [124]:
# Persist the predictions; index=False keeps the pandas row index out of the
# file so it contains only the `id` and `relevance` columns.
result.to_csv(path_or_buf="submission.csv", index=False)