In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/crowdflower-search-relevance/train.csv.zip
/kaggle/input/crowdflower-search-relevance/sampleSubmission.csv.zip
/kaggle/input/crowdflower-search-relevance/test.csv.zip


In [None]:
import os
import wandb
from wandb.keras import WandbCallback
# Authenticate with Weights & Biases without embedding the API key in the notebook.
# SECURITY: an API key used to be hardcoded on this line. Treat that key as leaked and
# revoke it, then supply the replacement through the WANDB_API_KEY environment variable.
wandb_api_key = os.environ.get('WANDB_API_KEY')
if wandb_api_key:
    wandb.login(key=wandb_api_key)
else:
    print('WANDB_API_KEY is not set; skipping wandb login.')

training = pd.read_csv('/kaggle/input/crowdflower-search-relevance/train.csv.zip')
# Independent snapshot of the raw training data for the exploratory plots; a plain alias
# would also receive the helper column that is added to `training` below.
stay_training = training.copy()
test = pd.read_csv('/kaggle/input/crowdflower-search-relevance/test.csv.zip')
# Placeholder column on the test frame so it carries a relevance_variance column too.
# (np.nan is the portable spelling; the np.NaN alias was removed in NumPy 2.0.)
test['relevance_variance'] = np.nan
# train_test marks where each row came from so the combined frame can be split again.
training['train_test'] = 1
training = training.drop(['product_description'], axis=1)
test['train_test'] = 0
all_data = pd.concat([training,test])
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
attribute_names = all_data.columns.tolist()
test_atts = test.columns.tolist()
# One-hot encode on the combined frame so train and test end up with the same dummy columns.
all_dummies = pd.get_dummies(all_data[attribute_names])
test_dummies = pd.get_dummies(test[test_atts])

X_train = all_dummies[all_data.train_test == 1].drop(['train_test'], axis=1)
# Impute missing values with each column's mean. This is done on the whole frame because
# the chained form `X_train[column].fillna(..., inplace=True)` acts on a temporary Series
# and leaves the frame unchanged when pandas copy-on-write is active.
X_train = X_train.fillna(X_train.mean())
X_check = X_train #for heatmap
X_train = X_train.drop(['median_relevance'], axis=1)
X_backup = X_train
X_test = all_dummies[all_data.train_test == 0].drop(['train_test'], axis=1)
y_train = all_data[all_data.train_test==1].median_relevance
y_backup = y_train
X_test = X_test.drop(['median_relevance'], axis=1)
# relevance_variance is entirely missing for the test rows, so its mean is NaN and that
# column stays NaN after this step.
X_test = X_test.fillna(X_test.mean())

wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Feature Engineering

In [None]:
# Distribution of the target: one bar per median_relevance value, labelled with the
# percentage of training rows that carry that value.
relevance_counts = stay_training.median_relevance.value_counts()

ax = relevance_counts.plot(kind='bar', color=['C0', 'C1', 'C2', 'C3'])
# A pandas bar plot draws its bars at positions 0..n-1 (in value_counts order), not at the
# relevance values themselves, so each label has to be anchored at the bar's position.
for position, count in enumerate(relevance_counts):
    share = round(count / stay_training.shape[0] * 100, 2)
    ax.annotate(str(share) + '%', xy=(position, count), ha='center', va='bottom')
ax.set_title('Bar plot of the target values')
ax.set_xlabel('Target values')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Number of rows per distinct query, largest first, drawn as one anonymous bar per query.
query_frequencies = stay_training['query'].value_counts().tolist()
query_frequencies.sort(reverse=True)
plt.bar(range(len(query_frequencies)), query_frequencies)
plt.title('Bar plot of the query text categories and the count')
plt.xlabel('Category')
plt.ylabel('Count')
plt.show()

We can see that the data is very skewed: most queries appear only once, and a lot of the search results seem accurate.

I noticed that some queries in the data are essentially the same and differ only by a blank space or a dash, so I will group those together using edit distance.

In [None]:
!pip install editdistance
import editdistance

def _is_one_edit_apart(first, second):
    """Return True when exactly one insertion, deletion or substitution turns `first` into `second`.

    This is the "Levenshtein distance == 1" test, answered in linear time without
    computing the full distance.
    """
    if len(first) > len(second):
        first, second = second, first
    short_len, long_len = len(first), len(second)
    if long_len - short_len > 1:
        return False
    # Walk past the common prefix; `i` ends on the first differing position.
    i = 0
    while i < short_len and first[i] == second[i]:
        i += 1
    if short_len == long_len:
        # Equal strings have distance 0; otherwise everything after the single
        # substituted character must match.
        return i < short_len and first[i + 1:] == second[i + 1:]
    # Lengths differ by one: skipping one character of the longer string must
    # leave the two remainders identical.
    return first[i:] == second[i + 1:]


def merge_similar_attributes(X_train):
    """Add a combined column for every pair of column names that are one edit apart.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame whose column names are compared pairwise. It is not modified.

    Returns
    -------
    merged_df : pandas.DataFrame
        Copy of `X_train` with one extra column per matching pair, named
        ``"<attr1>_<attr2>"`` and holding ``X_train[attr1] + X_train[attr2]``.
        The original columns are kept.
    merged_attributes : dict
        Maps each new column name to the ``(attr1, attr2)`` pair it was built from.
    """
    from itertools import combinations

    merged_df = X_train.copy()
    attribute_names = X_train.columns.tolist()
    merged_attributes = {}
    # combinations() yields each unordered pair once, in the same order as the
    # nested index loops it replaces, without copying a list slice per outer step.
    for attr1, attr2 in combinations(attribute_names, 2):
        if not _is_one_edit_apart(attr1, attr2):
            continue
        merged_name = f"{attr1}_{attr2}"
        merged_df[merged_name] = X_train[attr1] + X_train[attr2]
        merged_attributes[merged_name] = (attr1, attr2)

    return merged_df, merged_attributes

# Build from the untouched X_backup snapshot instead of X_train itself, so re-running this
# cell does not merge columns that a previous run already merged.
# NOTE(review): X_test does not receive the same merged columns here — confirm that is
# intended before fitting a model on X_train.
X_train, merged_attributes_dict = merge_similar_attributes(X_backup)

In [None]:
# Show only the first rows, rendered as a table, instead of printing the whole frame.
training.head()

In [None]:
# Show only the first rows, rendered as a table, instead of printing the whole frame.
test.head()

In [None]:
import matplotlib.pyplot as plt

target_variable = 'median_relevance'

# Pairwise correlations between the numeric columns of the training frame;
# only the column belonging to the target is drawn.
numeric_data = training.select_dtypes(include='number')
correlation_matrix = numeric_data.corr()
target_correlations = correlation_matrix[[target_variable]]

plt.figure(figsize=(12, 8))
sns.heatmap(target_correlations, annot=True, cmap='coolwarm', linewidths=.5)
plt.title(f'Correlation Heatmap with {target_variable}')
plt.show()

# With so few numeric columns, this kind of visualization provides little information.

In [None]:
# Disabled experiment: bar plot of the features whose correlation with the target is at
# least 0.1 in absolute value. It is kept as comments rather than a string literal so the
# cell no longer echoes the code as its output. The undefined name `target_column` has
# been replaced by `target_variable`.
#
# target_variable = 'relevance_variance'
# correlations = X_train.corr()[target_variable]
# correlations = correlations.drop(target_variable)
# # Filter features with a correlation of at least 0.1 or -0.1
# significant_features = correlations[(correlations >= 0.1) | (correlations <= -0.1)]
# significant_features_index = significant_features.index
# print(significant_features)
# print(significant_features_index)
#
# plt.figure(figsize=(10, 6))
# sns.barplot(x=significant_features.values, y=significant_features.index)
# plt.title(f'Features Correlating at least 0.1 or -0.1 with {target_variable}')
# plt.xlabel('Correlation')
# plt.ylabel('Features')
# plt.show()

In [None]:
# Training features and the target placed side by side in one frame (column-wise concat).
all_train = pd.concat([X_train, y_train], axis=1)

def create_splits(X_train, y_train, num_splits):
    """Cut the feature columns into consecutive groups and attach the target to each group.

    Every group holds ``X_train.shape[1] // num_splits`` columns, except the last one,
    which also takes whatever columns remain.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame whose columns are partitioned by position.
    y_train : pandas.Series
        Target appended (column-wise) to every group.
    num_splits : int
        Number of groups to produce.

    Returns
    -------
    list of pandas.DataFrame
        One frame per group: that group's feature columns followed by the target.
    """
    n_features = X_train.shape[1]
    width = n_features // num_splits
    # Column boundaries: evenly spaced starts, with the final boundary stretched to the
    # last column so the remainder lands in the last group.
    bounds = [k * width for k in range(num_splits)] + [n_features]
    return [
        pd.concat([X_train.iloc[:, lo:hi], y_train], axis=1)
        for lo, hi in zip(bounds[:-1], bounds[1:])
    ]

# Number of column groups the feature matrix is cut into for the per-group correlation plots.
num_splits = 3500
all_train_splits = create_splits(X_train, y_train, num_splits)

# The plt.figure(...) call that used to sit here opened an empty figure that nothing in
# this cell ever drew on; the plotting helpers create their own figures.
correlations = {}

from scipy.stats import pearsonr

def feature_stats(dataset, targ, correlation_threshold=0.05):
    """Tabulate the features whose Pearson correlation with the target clears a threshold.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the feature columns and the target column.
    targ : str
        Name of the target column inside `dataset`.
    correlation_threshold : float, default 0.05
        A feature is kept when its correlation is ``>= threshold`` or ``<= -threshold``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``Correlation`` and ``P-Value``; positively correlated
        features come first, followed by the negatively correlated ones.
    """
    target = dataset[targ]
    features = dataset.drop(columns=[targ])

    # One (name, r, p) record per feature, in column order.
    scored = []
    for name in features.columns:
        corr, p_value = pearsonr(features[name], target)
        scored.append((name, corr, p_value))

    positives = [record for record in scored if record[1] >= correlation_threshold]
    negatives = [record for record in scored if record[1] <= -correlation_threshold]
    kept = positives + negatives

    return pd.DataFrame({
        'Feature': [name for name, _, _ in kept],
        'Correlation': [corr for _, corr, _ in kept],
        'P-Value': [p_value for _, _, p_value in kept],
    })

# Correlation table for the first group of features, framed by separator lines.
stats_all_train_splits_0 = feature_stats(all_train_splits[0], 'median_relevance')
print('###############', stats_all_train_splits_0, '###############', sep='\n')


def plot_feature_heatmap(dataset, targ, correlation_threshold=0.01):
    """Print and plot the features whose correlation with the target clears a threshold.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the feature columns and the target column.
    targ : str
        Name of the target column inside `dataset`.
    correlation_threshold : float, default 0.01
        A feature is kept when its correlation is ``>= threshold`` or ``<= -threshold``.

    Returns
    -------
    list of tuple
        ``(feature_name, correlation)`` pairs for the selected features. Each pair is
        also printed, and a one-row heatmap is drawn when at least one feature is selected.
    """
    target = dataset[targ]
    correlations_with_target = dataset.drop(columns=[targ]).corrwith(target)

    # |r| >= threshold selects the same features as testing both tails separately;
    # NaN correlations fail the comparison and are left out.
    selected = correlations_with_target[correlations_with_target.abs() >= correlation_threshold]

    pairs = list(zip(selected.index, selected.values))
    for name, value in pairs:
        print(f"{name}: {value}")

    # Nothing selected: skip the figure entirely.
    if len(selected) == 0:
        return pairs

    heatmap_row = pd.DataFrame({'Correlation': selected}).transpose()
    plt.figure(figsize=(50, 20))
    sns.heatmap(heatmap_row, annot=True, cmap='coolwarm', linewidths=.5)
    plt.title('Feature Correlation Heatmap')
    plt.show()
    return pairs

#plot_feature_heatmap(all_train_splits[0], 'relevance_variance')
def plot_clustered_heatmap(dataset, targ):
    """Draw a hierarchically clustered heatmap of the Spearman correlations between features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the feature columns and the target column.
    targ : str
        Name of the target column; it is excluded from the heatmap.

    Returns
    -------
    None
        The figure is shown and then closed. Nothing is drawn when fewer than two
        non-constant features are available.
    """
    features = dataset.drop(columns=[targ])
    # A constant column has an undefined (NaN) correlation with everything, and the
    # hierarchical clustering behind clustermap rejects non-finite values, so drop them.
    features = features.loc[:, features.nunique() > 1]
    if features.shape[1] < 2:
        print(f"Skipping clustered heatmap: only {features.shape[1]} non-constant feature(s).")
        return

    # clustermap is figure-level: it creates its own figure, so the size is passed in here
    # instead of opening a separate (and otherwise empty) figure beforehand.
    grid = sns.clustermap(
        features.corr(method='spearman'),
        cmap='coolwarm',
        annot=True,
        linewidths=.5,
        figsize=(20, 10),
    )
    figure = grid.ax_heatmap.figure
    # Title the whole figure; plt.title would land on whichever axes is current.
    figure.suptitle('Clustered Feature Correlation Heatmap')
    plt.show()
    # Release the figure so calling this in a loop does not accumulate open figures.
    plt.close(figure)



In [None]:
# Plot at most the first 200 feature groups. Slicing, unlike indexing up to a fixed
# count, does not raise an IndexError when fewer groups exist.
for split in all_train_splits[:200]:
    plot_clustered_heatmap(split, 'median_relevance')

In [None]:
#plot_significance(all_train_1, all_train_1['relevance_variance'], 'Split 1')

In [None]:
#attribute_names = X_train.columns.tolist()
#print(attribute_names)