In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test CSVs straight from the zipped files.
train = pd.read_csv('/kaggle/input/crowdflower-search-relevance/train.csv.zip')
test = pd.read_csv('/kaggle/input/crowdflower-search-relevance/test.csv.zip')
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Summary statistics of the numeric training columns.
train.describe()

In [None]:
# Distribution of whitespace-separated token counts per search query.
train['query'].apply(lambda text: len(text.split())).value_counts()

In [None]:
# Distribution of whitespace-separated token counts per product title.
train['product_title'].apply(lambda text: len(text.split())).value_counts()

In [None]:
# Positional 80/20 split in file order (no shuffling): the first 80% of rows
# become the working training set, the remainder the dev set.
split = int(0.8 * len(train))
train_0 = train.iloc[:split]
dev = train.iloc[split:]

In [None]:
# Two progressively stricter training subsets, keeping only rows whose
# relevance_variance is below the given threshold.
clean_train_1 = train_0.loc[train_0['relevance_variance'] < 1].copy()
clean_train_2 = train_0.loc[train_0['relevance_variance'] < 0.50].copy()
# Summary statistics of the (unfiltered) dev split for comparison.
dev.describe()

In [None]:
# Summary statistics after keeping rows with relevance_variance < 1.
clean_train_1.describe()

In [None]:
# Summary statistics after keeping rows with relevance_variance < 0.50.
clean_train_2.describe()

In [None]:
## Skipping the product description: it is too lengthy and has missing values.
# NOTE: this rebinds `train` to the variance-filtered subset (relevance_variance < 1);
# every later cell that reads `train` sees the filtered frame, not the raw CSV.
train = clean_train_1
# Model input text is "<query> <product_title>". Column-wise string concatenation
# replaces the row-wise apply: it is vectorized and still yields a Series when
# the frame is empty.
train_input = train['query'] + ' ' + train['product_title']
dev_input = dev['query'] + ' ' + dev['product_title']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# TF-IDF over 1- to 5-grams with English stop words removed and accents stripped.
# The vocabulary is learned from the training text only and reused for dev.
tfidf = TfidfVectorizer(
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 5),
)
train_x = tfidf.fit_transform(train_input)
dev_x = tfidf.transform(dev_input)

In [None]:
# Regression targets: median relevance shifted and scaled as (rating - 1) / 3.
train_y = [(rating - 1) / 3 for rating in train.median_relevance.to_list()]
dev_y = [(rating - 1) / 3 for rating in dev.median_relevance.to_list()]
# Sanity check of the scaled training targets: mean, max, min.
np.mean(train_y), np.max(train_y), np.min(train_y)

In [None]:
from sklearn.metrics import mean_squared_error, cohen_kappa_score, make_scorer
def reg_scorer(true, pred):
    """Score scaled regression outputs with quadratic weighted kappa.

    Both inputs are on the scaled target axis produced by ``(rating - 1) / 3``.
    Each value is clamped to [0, 1], mapped back with ``value * 3 + 1`` and
    rounded to the nearest integer rating before agreement is measured.

    The ratings are ordinal, so disagreement is weighted quadratically by
    distance (a prediction one step off is penalised less than one three steps
    off). Plain unweighted kappa treats every miss as equally bad.

    Parameters
    ----------
    true : iterable of float
        Scaled ground-truth targets.
    pred : iterable of float
        Scaled model predictions; values outside [0, 1] are clamped.

    Returns
    -------
    float
        Quadratic weighted Cohen's kappa between the integer ratings.
    """
    def to_rating(value):
        # Clamp first so out-of-range regression outputs land on the end ratings.
        return int(round(min(1, max(0, value)) * 3 + 1))

    true_ratings = [to_rating(value) for value in true]
    pred_ratings = [to_rating(value) for value in pred]
    return cohen_kappa_score(true_ratings, pred_ratings, weights='quadratic')

In [None]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
#clf = LinearRegression().fit(train_x, train_y)
#clf = SGDRegressor(verbose=1,n_iter_no_change=20).fit(train_x, train_y)
# Small SVR grid: fixed C, two epsilon widths, linear vs. RBF kernel.
param_grid = {
    'C': [1],
    'epsilon': [0.1, 0.05],
    'kernel': ('linear', 'rbf'),
}
svr = SVR()
# Model selection uses the kappa-based scorer (higher is better).
scorer = make_scorer(reg_scorer, greater_is_better=True)
clf = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring=scorer,
    n_jobs=8,
    verbose=True,
).fit(train_x, train_y)
clf.best_estimator_, clf.best_params_, clf.best_score_

In [None]:
## 0.26 is the best score till now

# Evaluate the grid search's best SVR on the held-out dev split: MSE on the
# scaled targets, and the kappa-based score from reg_scorer.
preds = clf.best_estimator_.predict(dev_x)
mean_squared_error(dev_y, preds),  reg_scorer(dev_y, preds)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor

# Assuming you have train_x, train_y as your training data
X_train, X_dev, y_train, y_dev = train_test_split(train_x, train_y, test_size=0.2, random_state=1)

# SVR
svr_best_estimator = clf.best_estimator_
svr_best_estimator.fit(X_train, y_train)
y_pred_svr = svr_best_estimator.predict(X_dev)
mse_svr = mean_squared_error(y_dev, y_pred_svr)
print(f'Mean Squared Error (SVR): {mse_svr}')

# Random Forest
rf_best_estimator = rf.best_estimator_
rf_best_estimator.fit(X_train, y_train)
y_pred_rf = rf_best_estimator.predict(X_dev)
mse_rf = mean_squared_error(y_dev, y_pred_rf)
print(f'Mean Squared Error (Random Forest): {mse_rf}')

# TfidfVectorizer + SVM
tfidf_svm_model = make_pipeline(TfidfVectorizer(), StandardScaler(with_mean=False), SVC())
tfidf_svm_model.fit(X_train, y_train)
y_pred_svm = tfidf_svm_model.predict(X_dev)
mse_svm = mean_squared_error(y_dev, y_pred_svm)
print(f'Mean Squared Error (TfidfVectorizer + SVM): {mse_svm}')

# TfidfVectorizer + Naive Bayes
tfidf_nb_model = make_pipeline(TfidfVectorizer(), MultinomialNB())
tfidf_nb_model.fit(X_train, y_train)
y_pred_nb = tfidf_nb_model.predict(X_dev)
mse_nb = mean_squared_error(y_dev, y_pred_nb)
print(f'Mean Squared Error (TfidfVectorizer + Naive Bayes): {mse_nb}')

# Create the stacking regressor
base_models = [('svr', svr_best_estimator), ('rf', rf_best_estimator), ('svm', tfidf_svm_model), ('nb', tfidf_nb_model)]
meta_model = LinearRegression()  # You can choose a different meta-model if needed

stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,  # Number of cross-validation folds
    scoring=make_scorer(mean_squared_error, greater_is_better=False)
)

# Train the stacking regressor
stacking_regressor.fit(X_train, y_train)

# Make predictions on dev set
y_pred_ensemble = stacking_regressor.predict(X_dev)

# Evaluate the performance of the ensemble
mse_ensemble = mean_squared_error(y_dev, y_pred_ensemble)
print(f'Mean Squared Error (Ensemble): {mse_ensemble}')


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor


X_train, X_dev, y_train, y_dev = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

# Random Forest
rf_best_estimator = rf.best_estimator_
rf_best_estimator.fit(X_train, y_train)
y_pred_rf = rf_best_estimator.predict(X_dev)
mse_rf = mean_squared_error(y_dev, y_pred_rf)
print(f'Mean Squared Error (Random Forest): {mse_rf}')

# TfidfVectorizer + SVM
tfidf_svm_model = make_pipeline(TfidfVectorizer(), StandardScaler(with_mean=False), SVC())
tfidf_svm_model.fit(X_train, y_train)
y_pred_svm = tfidf_svm_model.predict(X_dev)
mse_svm = mean_squared_error(y_dev, y_pred_svm)
print(f'Mean Squared Error (TfidfVectorizer + SVM): {mse_svm}')

# TfidfVectorizer + Naive Bayes
tfidf_nb_model = make_pipeline(TfidfVectorizer(), MultinomialNB())
tfidf_nb_model.fit(X_train, y_train)
y_pred_nb = tfidf_nb_model.predict(X_dev)
mse_nb = mean_squared_error(y_dev, y_pred_nb)
print(f'Mean Squared Error (TfidfVectorizer + Naive Bayes): {mse_nb}')

# Create the stacking regressor
base_models = [('svr', svr_best_estimator), ('rf', rf_best_estimator), ('svm', tfidf_svm_model), ('nb', tfidf_nb_model)]
meta_model = LinearRegression()  # You can choose a different meta-model if needed

stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,  # Number of cross-validation folds
    scoring=make_scorer(mean_squared_error, greater_is_better=False)
)

# Train the stacking regressor
stacking_regressor.fit(X_train, y_train)

# Make predictions on dev set
y_pred_ensemble = stacking_regressor.predict(X_dev)

# Evaluate the performance of the ensemble
mse_ensemble = mean_squared_error(y_dev, y_pred_ensemble)
print(f'Mean Squared Error (Ensemble): {mse_ensemble}')


In [None]:
# Build the same "<query> <product_title>" text for the test rows and vectorize
# it with the TF-IDF vocabulary fitted on the training text.
test_input = test.apply(lambda row: row['query'] + ' ' + row['product_title'], axis=1)
test_x = tfidf.transform(test_input)
# Clamp each prediction to [0, 1], then map back with value * 3 + 1 and round.
pred = [
    int(round(min(1, max(0, score)) * 3 + 1))
    for score in stacking_regressor.predict(test_x)
]
out = pd.DataFrame({"id": test.id.to_list(), "prediction": pred})
out.to_csv('submission.csv', index=False)

In [None]:
# Disabled alternative, kept as a bare string literal so it never executes:
# the same submission steps as the stacking submission cell, but predicting
# with the grid search's best SVR (clf.best_estimator_).
'''test_input =  test.apply(lambda x: x['query']+' '+x['product_title'], axis=1)
test_x = tfidf.transform(test_input)
pred = clf.best_estimator_.predict(test_x)
pred = [min(1, max(0,x)) for x in pred]
pred = [int(round((x*3)+1)) for x in pred]
out = pd.DataFrame({"id": test.id.to_list(), "prediction": pred})
out.to_csv('submission.csv', index=False)'''

In [None]:
# Load and display the competition's sample submission
# (presumably to check the expected layout against `out` -- confirm).
sub = pd.read_csv('/kaggle/input/crowdflower-search-relevance/sampleSubmission.csv.zip')
sub


In [None]:
# Display the submission frame that was written to submission.csv.
out