In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input mount.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the zipped competition CSVs into the `train` and `test` frames, then preview `train`.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/crowdflower-search-relevance/train.csv.zip',
        '/kaggle/input/crowdflower-search-relevance/test.csv.zip',
    )
)
train.head()

In [None]:
from bs4 import BeautifulSoup
def remove_html_tags(text):
    """Return ``text`` with markup removed, keeping only its text content.

    The input is parsed with BeautifulSoup's ``'html.parser'`` backend and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, 'html.parser').get_text()


# Strip markup from the `query` column of both frames (the column is overwritten).
for frame in (train, test):
    frame['query'] = frame['query'].apply(remove_html_tags)

In [None]:
# Summarise the columns of the training frame.
train.info()

In [None]:
# Same column summary for the test frame, for comparison with the one for `train`.
test.info()

In [None]:
# Descriptive statistics of the training frame.
train.describe()

In [None]:
# How many queries have 1, 2, 3, ... whitespace-separated tokens.
train['query'].map(lambda x:len(x.split())).value_counts()

In [None]:
# Same token-count distribution for the product titles.
train['product_title'].map(lambda x:len(x.split())).value_counts()

Feature engineering before vectorization

In [None]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction import text
import string

# Shared resources for text cleaning; preprocess() reads stemmer, stop_words and punct_re.
stemmer = PorterStemmer()

# scikit-learn's English stop words plus extra tokens: markup/CSS leftovers,
# a few short fragments and the single digits.
stop_words = text.ENGLISH_STOP_WORDS.union([
    'http', 'www', 'img', 'border', 'color', 'style', 'padding', 'table',
    'font', 'thi', 'inch', 'ha', 'width', 'height',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
])

# Regex character class matching any single character of string.punctuation.
punct = string.punctuation
punct_re = re.compile('[{}]'.format(re.escape(punct)))

# Text columns that are normalised for both data frames.
columns_to_preprocess = ['query', 'product_title', 'product_description']

def preprocess_text_column(df, column):
    """Normalise one text column of ``df`` with ``preprocess`` and return ``df``.

    The column is overwritten on the frame that was passed in; the same object
    is returned so the call can be used in an assignment.

    Missing values (NaN/None) are treated as empty strings. Passing them
    through ``str()`` would turn them into the literal token ``"nan"``.

    Args:
        df: DataFrame holding the column.
        column: Name of the column to normalise.

    Returns:
        ``df`` with ``df[column]`` replaced by the normalised text.
    """
    df[column] = df[column].apply(
        lambda value: preprocess('' if pd.isna(value) else str(value))
    )
    return df

def preprocess(x):
    """Lower-case a string, strip punctuation, drop stop words and stem the rest.

    Args:
        x: Text to normalise (``str``).

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    x = punct_re.sub(' ', x.lower())
    kept = []
    # split() without an argument collapses runs of whitespace, so punctuation
    # that was replaced by spaces no longer produces empty tokens.
    for token in x.split():
        if token in stop_words:
            continue
        stem = stemmer.stem(token)
        # The stop list also holds stemmed forms ('thi', 'ha', 'inch'); those can
        # only match after stemming, so the stem is checked as well.
        if stem in stop_words:
            continue
        kept.append(stem)
    return ' '.join(kept)

# Normalise every column listed in columns_to_preprocess: first for `train`, then for `test`.
# preprocess_text_column overwrites the column and hands the same frame back.
for text_column in columns_to_preprocess:
    train = preprocess_text_column(train, text_column)

for text_column in columns_to_preprocess:
    test = preprocess_text_column(test, text_column)


In [None]:
# Keep the first 80% of rows (in file order, no shuffling) for training, the rest as dev set.
split = int(len(train) * 0.8)
train_0 = train.iloc[:split]
dev = train.iloc[split:]

In [None]:
# Two subsets of the training part, filtered on the relevance_variance column
# (threshold 1 and the stricter 0.50). Independent copies, so later edits do
# not touch train_0.
relevance_variance = train_0['relevance_variance']
clean_train_1 = train_0.loc[relevance_variance < 1].copy()
clean_train_2 = train_0.loc[relevance_variance < 0.50].copy()
dev.describe()

In [None]:
def _query_plus_title(frame):
    """Return one string per row of ``frame``: the query, a space, then the product title."""
    # Vectorised string concatenation; replaces a row-wise apply(axis=1) lambda.
    return frame['query'] + ' ' + frame['product_title']


# NOTE: from here on `train` refers to the subset with relevance_variance < 1.
train = clean_train_1
train_input = _query_plus_title(train)
dev_input = _query_plus_title(dev)
test_input = _query_plus_title(test)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer


class FeatureInserter(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends two similarity columns to a numeric matrix.

    Every row is cut into a left and a right half of equal width (with an odd
    number of columns the last column is left out of the comparison but kept in
    the output). Two values are computed per row and appended as new columns:

    * ``dot(left, right)``, stored under the historical name "distances";
    * ``dot(left, right) / dot(left + right, left + right)``, the "quasi-Jaccard" ratio.

    NOTE(review): the halves are column halves of whatever matrix is passed in.
    For a TF-IDF matrix they are two halves of the vocabulary, not the query
    and the title -- confirm this is the intended comparison.
    """

    def __init__(self):
        pass

    def compute_distances(self, X):
        """Return ``(distances, quasi_jaccard)`` as two 1-D float arrays, one value per row.

        Args:
            X: 2-D numeric matrix, dense or scipy-sparse. Raw text is not
                accepted; vectorise it first.
        """
        X = csr_matrix(X)
        half = X.shape[1] // 2
        left, right = X[:, :half], X[:, half:2 * half]

        distances = np.asarray(left.multiply(right).sum(axis=1), dtype=float).ravel()

        combined = left + right
        union = np.asarray(combined.multiply(combined).sum(axis=1), dtype=float).ravel()

        # Rows whose halves are both all-zero would give 0/0; report 0.0 for them.
        quasi_jaccard = np.divide(
            distances, union, out=np.zeros_like(distances), where=union != 0
        )
        return distances, quasi_jaccard

    def transform(self, X, y=None):
        """Return ``X`` as a CSR matrix with the two similarity columns appended on the right."""
        distances, quasi_jaccard = self.compute_distances(X)
        return hstack(
            [csr_matrix(X), distances.reshape(-1, 1), quasi_jaccard.reshape(-1, 1)],
            format='csr',
        )

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        self.fit(X, y)
        return self.transform(X)


# Vectorise the text first: FeatureInserter works on numeric rows, not on raw strings.
tfidf = TfidfVectorizer(ngram_range=(1, 5), stop_words='english', strip_accents='unicode')
train_tfidf = tfidf.fit_transform(train_input)
dev_tfidf = tfidf.transform(dev_input)
test_tfidf = tfidf.transform(test_input)

# Then append the two similarity columns to each TF-IDF matrix.
feature_inserter = FeatureInserter()
train_x = feature_inserter.fit_transform(train_tfidf)
dev_x = feature_inserter.transform(dev_tfidf)
test_x = feature_inserter.transform(test_tfidf)

# train_x, dev_x and test_x now hold the TF-IDF features followed by the two extra columns.


In [None]:
# Let the notebook display the object's repr instead of print()-ing its contents.
train_x

In [None]:
# Descriptive statistics of the stricter subset (relevance_variance < 0.50).
clean_train_2.describe()

In [None]:
'''from sklearn.feature_extraction.text import TfidfVectorizer 
tfidf = TfidfVectorizer(ngram_range=(1, 5),stop_words = 'english', strip_accents='unicode')
train_x = tfidf.fit_transform(train_input)
dev_x = tfidf.transform(dev_input)
test_x = tfidf.transform(test_input)'''

In [None]:
# Preview the first few model input strings rather than printing the whole Series.
train_input.head()

In [None]:
# Map median_relevance linearly via (x - 1) / 3 (label 1 -> 0.0, label 4 -> 1.0)
# and keep the results as plain Python lists.
train_y = ((train.median_relevance - 1) / 3).to_list()
dev_y = ((dev.median_relevance - 1) / 3).to_list()
np.mean(train_y), np.max(train_y), np.min(train_y)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from scipy import sparse
from sklearn.metrics.pairwise import linear_kernel
from bs4 import BeautifulSoup
import re
from nltk.stem.porter import PorterStemmer
import string
from sklearn.feature_extraction import text

RANDOM_STATE = 42  # fixed seed so TruncatedSVD yields the same components on every run


def add_half_similarity_features(reduced):
    """Append two half-vs-half similarity columns to a dense 2-D feature array.

    Each row is cut into a left and a right half of equal width (with an odd
    number of columns the last one is left out of the comparison). The two
    appended columns are, in this order:

    * ``dot(left, right)``, the value the original loop called "cos_distance";
    * ``dot(left, right) / (sum(left) + sum(right))``, the "quasi-Jaccard"
      ratio, set to 0.0 where the denominator is exactly zero instead of
      producing NaN/inf.

    Args:
        reduced: Array-like of shape (n_rows, n_columns).

    Returns:
        A new array of shape (n_rows, n_columns + 2).
    """
    reduced = np.asarray(reduced, dtype=float)
    half = reduced.shape[1] // 2
    left, right = reduced[:, :half], reduced[:, half:2 * half]

    dot = np.einsum('ij,ij->i', left, right)
    union = left.sum(axis=1) + right.sum(axis=1)
    quasi_jaccard = np.divide(dot, union, out=np.zeros_like(dot), where=union != 0)

    return np.column_stack([reduced, dot, quasi_jaccard])


# 1) Bag-of-n-grams TF-IDF, fitted on the training text only.
tfidf = TfidfVectorizer(ngram_range=(1, 5), stop_words='english', strip_accents='unicode')
train_x_tfidf = tfidf.fit_transform(train_input)
dev_x_tfidf = tfidf.transform(dev_input)
test_x_tfidf = tfidf.transform(test_input)

# 2) Reduce to 100 dense components. The *_tfidf names are rebound to the SVD
#    output, as before.
svd = TruncatedSVD(n_components=100, random_state=RANDOM_STATE)
train_x_tfidf = svd.fit_transform(train_x_tfidf)
dev_x_tfidf = svd.transform(dev_x_tfidf)
test_x_tfidf = svd.transform(test_x_tfidf)

# 3) Add the two extra columns to every split with the same helper.
train_x = add_half_similarity_features(train_x_tfidf)
dev_x = add_half_similarity_features(dev_x_tfidf)
test_x = add_half_similarity_features(test_x_tfidf)


In [None]:
'''import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from scipy import sparse
from sklearn.metrics.pairwise import linear_kernel
from bs4 import BeautifulSoup
import re
from nltk.stem.porter import PorterStemmer
import string
from sklearn.feature_extraction import text

# ... (previous imports and code)

class FeatureInserter():
    def __init__(self):
        pass

    def transform(self, X, y=None):
        distances = []
        quasi_jaccard = []

        for row in X.tocsr():
            row = row.toarray().ravel()

            if len(row.shape) == 1:
                row = row.reshape(1, -1)

            # Split the row into two parts
            part1, part2 = row[:, :row.shape[1]//2], row[:, row.shape[1]//2:]

            # Ensure both matrices have the same number of features
            min_features = min(part1.shape[1], part2.shape[1])

            # Check for zero denominator to avoid division by zero
            denominator = np.linalg.norm(part1[:, :min_features]) * np.linalg.norm(part2[:, :min_features])
            if denominator == 0:
                cos_distance = 0  # or any other suitable value
            else:
                cos_distance = 1.0 - np.dot(part1[:, :min_features], part2[:, :min_features].T) / denominator

            # Compute quasi-Jaccard similarity
            intersect = np.sum(np.minimum(part1[:, :min_features], part2[:, :min_features]))
            union = np.sum(np.maximum(part1[:, :min_features], part2[:, :min_features]))
            quasi_jaccard.append(1.0 * intersect / union)

            distances.append(cos_distance[0])

        return np.column_stack([X.toarray(), np.array([distances, quasi_jaccard]).T])

    def fit(self, X, y):
        return self

    def fit_transform(self, X, y, **fit_params):
        self.fit(X, y)
        return self.transform(X)

# Assuming train_input and dev_input are your raw text data
feature_inserter = FeatureInserter()
train_x_with_features = feature_inserter.fit_transform(train_input, train_y)
dev_x_with_features = feature_inserter.transform(dev_input)

# Assuming train_x and dev_x are your vectorized data
tfidf = TfidfVectorizer(ngram_range=(1, 5), stop_words='english', strip_accents='unicode')
train_x_tfidf = tfidf.fit_transform(train_input)
dev_x_tfidf = tfidf.transform(dev_input)

# Concatenate the new features with the TF-IDF vectorized data
train = np.column_stack([train_x_tfidf.toarray(), train_x_with_features])
dev = np.column_stack([dev_x_tfidf.toarray(), dev_x_with_features])'''

In [None]:
'''import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import KFold
from scipy import sparse
from sklearn.metrics.pairwise import linear_kernel
from bs4 import BeautifulSoup
import re
from nltk.stem.porter import PorterStemmer
import string
from sklearn.feature_extraction import text

# ... (previous imports and code)

class FeatureInserter():
    def __init__(self):
        pass

    def transform(self, X, y=None):
        distances = []
        quasi_jaccard = []

        for row in X.tocsr():
            row = row.toarray().ravel()

            if len(row.shape) == 1:
                row = row.reshape(1, -1)

            # Split the row into two parts
            part1, part2 = row[:, :row.shape[1]//2], row[:, row.shape[1]//2:]

            # Ensure both matrices have the same number of features
            min_features = min(part1.shape[1], part2.shape[1])

            # Check for zero denominator to avoid division by zero
            denominator = np.linalg.norm(part1[:, :min_features]) * np.linalg.norm(part2[:, :min_features])
            if denominator == 0:
                cos_distance = 0  # or any other suitable value
            else:
                cos_distance = 1.0 - np.dot(part1[:, :min_features], part2[:, :min_features].T) / denominator

            # Compute quasi-Jaccard similarity
            intersect = np.sum(np.minimum(part1[:, :min_features], part2[:, :min_features]))
            union = np.sum(np.maximum(part1[:, :min_features], part2[:, :min_features]))
            quasi_jaccard.append(1.0 * intersect / union)

            distances.append(cos_distance[0])

        return np.column_stack([X.toarray(), np.array([distances, quasi_jaccard]).T])

    def fit(self, X, y):
        return self

    def fit_transform(self, X, y, **fit_params):
        self.fit(X, y)
        return self.transform(X)


# Example usage:
feature_inserter = FeatureInserter()

# Apply TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1, 5), stop_words='english', strip_accents='unicode')
train_x_tfidf = tfidf.fit_transform(train_input)
dev_x_tfidf = tfidf.transform(dev_input)

# Apply FeatureInserter
train_x = feature_inserter.fit_transform(train_x_tfidf, train_y)
dev_x = feature_inserter.transform(dev_x_tfidf)

'''

In [None]:
'''train_y, dev_y = train.median_relevance.to_list(), dev.median_relevance.to_list()
train_y = [(x-1)/3 for x in train_y]
dev_y = [(x-1)/3 for x in dev_y]
np.mean(train_y), np.max(train_y), np.min(train_y)'''

In [None]:
from sklearn.metrics import mean_squared_error, cohen_kappa_score, make_scorer
def reg_scorer(true, pred, weights='quadratic'):
    """Cohen's kappa between true and predicted relevance labels.

    Both inputs hold regression targets on the [0, 1] scale. They are mapped
    back to integer labels with ``round(x * 3 + 1)`` before scoring;
    predictions are clamped to [0, 1] first so they cannot leave the label range.

    Args:
        true: Iterable of scaled ground-truth targets.
        pred: Iterable of raw regressor outputs.
        weights: Forwarded to ``cohen_kappa_score``. Defaults to ``'quadratic'``
            because the competition is scored with quadratic weighted kappa;
            pass ``None`` to get the previous unweighted kappa.

    Returns:
        The kappa statistic as a float.
    """
    clipped = np.clip(np.asarray(pred, dtype=float), 0.0, 1.0)
    pred_labels = np.rint(clipped * 3 + 1).astype(int)
    true_labels = np.rint(np.asarray(true, dtype=float) * 3 + 1).astype(int)
    return cohen_kappa_score(true_labels, pred_labels, weights=weights)

In [None]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
#clf = LinearRegression().fit(train_x, train_y)
#clf = SGDRegressor(verbose=1,n_iter_no_change=20).fit(train_x, train_y)
# Exhaustive search over C, epsilon and kernel for an SVR, scored with reg_scorer.
param_grid = {
    'C': [1, 2, 5, 10],
    'epsilon': [0.1, 0.2, 0.5],
    'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
}
svr = SVR()
scorer = make_scorer(reg_scorer, greater_is_better=True)
clf = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring=scorer,
    n_jobs=-1,
    verbose=True,
)
clf.fit(train_x, train_y)
clf.best_estimator_, clf.best_params_, clf.best_score_

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Random Forest Regression (seeded so that repeated runs build the same forest)
random_forest_regression = RandomForestRegressor(random_state=42)

# Parameter Grid
'''random_forest_regression_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}'''
# Reduced search space: only `bootstrap` is varied; the other entries are commented out.
random_forest_regression_param_grid = {
    #'n_estimators': [50, 200],
    #'max_depth': [None, 10, 30],
    #'min_samples_split': [2, 5],
    #'min_samples_leaf': [2, 4],
    'bootstrap': [True, False]
}
#RF HERE
#rf = GridSearchCV(random_forest_regression, random_forest_regression_param_grid, verbose=True,scoring=scorer, n_jobs=-1, cv=2)
#rf.fit(train_x, train_y)
#rf.best_estimator_, rf.best_params_, rf.best_score_

In [None]:
## 0.26 is the best score till now

# Evaluate the best SVR from the grid search on the dev rows: (MSE, reg_scorer value).
best_svr = clf.best_estimator_
preds = best_svr.predict(dev_x)
dev_mse = mean_squared_error(dev_y, preds)
dev_kappa = reg_scorer(dev_y, preds)
dev_mse, dev_kappa

#preds_rf = rf.best_estimator_.predict(dev_x)
#mean_squared_error(dev_y, preds_rf),  reg_scorer(dev_y, preds_rf)

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

'''svr_best_estimator = clf.best_estimator_
rf_best_estimator = rf.best_estimator_

base_models = [('svr', svr_best_estimator), ('rf', rf_best_estimator)]
meta_model = LinearRegression()  # You can choose a different meta-model if needed

stacking_regressor = StackingRegressor(estimators=base_models, final_estimator=meta_model)

stacking_regressor.fit(train_x, train_y)
preds_stacking_regressor = stacking_regressor.predict(dev_x)
mean_squared_error(dev_y, preds_stacking_regressor),  reg_scorer(dev_y, preds_stacking_regressor)'''

In [None]:
#test_input =  test.apply(lambda x: x['query']+' '+x['product_title'], axis=1)
#test_x = tfidf.transform(test_input)
'''pred = stacking_regressor.predict(test_x)
pred = [min(1, max(0,x)) for x in pred]
pred = [int(round((x*3)+1)) for x in pred]
out = pd.DataFrame({"id": test.id.to_list(), "prediction": pred})
out.to_csv('submission.csv', index=False)'''

In [None]:
#test_input =  test.apply(lambda x: x['query']+' '+x['product_title'], axis=1)
#test_x = tfidf.transform(test_input)
# Predict on the test features, clamp each value to [0, 1], map it back to an
# integer label with round(x * 3 + 1) and write the id/prediction pairs to disk.
raw_pred = clf.best_estimator_.predict(test_x)
pred = [int(round(min(1, max(0, value)) * 3 + 1)) for value in raw_pred]
out = pd.DataFrame({"id": test.id.to_list(), "prediction": pred})
out.to_csv('submission.csv', index=False)

In [None]:
'''test_input =  test.apply(lambda x: x['query']+' '+x['product_title'], axis=1)
test_x = tfidf.transform(test_input)
pred = rf.best_estimator_.predict(test_x)
pred = [min(1, max(0,x)) for x in pred]
pred = [int(round((x*3)+1)) for x in pred]
out = pd.DataFrame({"id": test.id.to_list(), "prediction": pred})
out.to_csv('submission.csv', index=False)'''

In [None]:
# Load the sample submission shipped with the competition data and display it,
# so its layout can be compared with `out`.
sub = pd.read_csv('/kaggle/input/crowdflower-search-relevance/sampleSubmission.csv.zip')
sub

In [None]:
# Display the submission frame built above (columns: id, prediction).
out