### Quora question pair similarity 

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:

df= pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')


  df_test = pd.read_csv('test.csv')


In [21]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [22]:
df['question1'].apply(lambda X: re.sub("[^A-Za-z1-9 ]", "", X))

0         What is the step by step guide to invest in sh...
1            What is the story of Kohinoor KohiNoor Diamond
2         How can I increase the speed of my internet co...
3          Why am I mentally very lonely How can I solve it
4         Which one dissolve in water quikly sugar salt ...
                                ...                        
404285    How many keywords are there in the Racket prog...
404286             Do you believe there is life after death
404287                                     What is one coin
404288    What is the approx annual cost of living while...
404289                 What is like to have sex with cousin
Name: question1, Length: 404287, dtype: object

In [4]:
df.head()
df.isnull().sum()
df.dropna(inplace=True)

In [None]:
import nltk
nltk.download('punkt')  # Download the word tokenizer
nltk.download('stopwords')  # Download the stopwords corpus
nltk.download('wordnet')  # Download the WordNet lemmatizer
nltk.download('maxent_ne_chunker')
nltk.download('words')

#### 1. Text Data Processing 

In [23]:
en_stopwords =nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Preprocess the text data
def process(text):
    text = re.sub("[^A-Za-z1-9 ]", "", text)
    text = text.lower()
    tokens = word_tokenize(text)
    clean_list = []
    for token in tokens:
        if token not in en_stopwords:
            clean_list.append(lemmatizer.lemmatize(token))
    return " ".join(clean_list)

df['question1'] = df['question1'].apply(process)
df['question2'] = df['question2'].apply(process)

# Split the dataset into training and testing sets
X = df[['question1', 'question2']]
y = df['is_duplicate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

combined_data = X_train['question1'] + ' ' + X_train['question2']
combined_data1 = X_test['question1'] + ' ' + X_test['question2']

#### 2. Using LogisticRegression

In [52]:
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the training set
vectorizer.fit(combined_data)
X_train_tfidf = vectorizer.transform(combined_data)
# Transform the testing set
X_test_tfidf = vectorizer.transform(combined_data1)

# Choose a supervised algorithm (e.g., Logistic Regression)
model = LogisticRegression(max_iter=500)

# Train the model on the TF-IDF features and labels
model.fit(X_train_tfidf, y_train)

# Predict the labels for the testing set
y_pred = model.predict(X_test_tfidf)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the similarity between the questions based on the predicted labels
similarity = pd.DataFrame({'Question1': X_test['question1'], 'Question2': X_test['question2'], 'Similarity': y_pred})
print(similarity)


Accuracy: 0.7522817779316827
                                                Question1   
8067                                 play pokmon go korea  \
224279                     breathing treatment help cough   
252452                  kellyanne conway annoying opinion   
174039                       rate 11 review maruti baleno   
384863                                good book marketing   
...                                                   ...   
37520       reason behind sudden end white collar tv show   
75814                                        ever sex car   
79271            recent research idea convex hull problem   
25953   choose reading either fiction nonfiction rest ...   
99888                                 iitpal app launched   

                                                Question2  Similarity  
8067                                 play pokmon go china           0  
224279           help someone unconscious still breathing           0  
252452  kellyanne conw


    Pair Cosine Similarity 

In [58]:
y_pred_reshaped = y_pred.reshape(1, -1)  # Reshape y_pred to have the same number of features as X_test_tfidf

from sklearn.metrics.pairwise import cosine_similarity

X_test_tfidf_transposed = X_test_tfidf.T  # Transpose X_test_tfidf to have shape (87465, 80858)
similarity_matrix = cosine_similarity(X_test_tfidf_transposed, y_pred_reshaped)


#### 3. Using ComplementNB
    - Pipeline 
    - RandomizedSearchCV

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
import numpy as np
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from time import time
import pandas as pd

pipeline = Pipeline(
    [
        ("vect", TfidfVectorizer()),
        ("clf", ComplementNB()),
    ]
)
pipeline

parameter_grid = {
    "vect__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    "vect__min_df": (1, 3, 5, 10),
    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    "vect__norm": ("l1", "l2"),
    "clf__alpha": np.logspace(-6, 6, 13),
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=parameter_grid,
    n_iter=40,
    random_state=0,
    n_jobs=-1,
    verbose=1,
)

print("Performing grid search...")
print("Hyperparameters to be evaluated:")
pprint(parameter_grid)

t0 = time()
random_search.fit(combined_data, y_train)
print(f"Done in {time() - t0:.3f}s")

print("Best parameters combination found:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameter_grid.keys()):
    print(f"{param_name}: {best_parameters[param_name]}")

test_accuracy = random_search.score(combined_data, y_train)
print(
    "Accuracy of the best parameters using the inner CV of "
    f"the random search: {random_search.best_score_:.3f}"
)
print(f"Accuracy on test set: {test_accuracy:.3f}")

Performing grid search...
Hyperparameters to be evaluated:
{'clf__alpha': array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
       1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]),
 'vect__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
 'vect__min_df': (1, 3, 5, 10),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__norm': ('l1', 'l2')}
Fitting 5 folds for each of 40 candidates, totalling 200 fits
Done in 1094.452s
Best parameters combination found:
clf__alpha: 1e-06
vect__max_df: 0.4
vect__min_df: 1
vect__ngram_range: (1, 2)
vect__norm: l1
Accuracy of the best parameters using the inner CV of the random search: 0.793
Accuracy on test set: 0.969


    - predicting the test data 

In [None]:
df_test_copy = df_test.copy()

df_test.dropna(inplace=True)
df_test.drop_duplicates(inplace=True)
df_test['question1'] = df_test['question1'].apply(process)
df_test['question2'] = df_test['question2'].apply(process)
test_combine = df_test['question1'] + ' ' + df_test['question2']
y_pred = random_search.predict(test_combine)
y_pred_df = pd.Series(y_pred, "is_duplicate")

df_test.reset_index(drop=True, inplace=True)  # Reset the index of df_test

result = pd.concat([df_test, y_pred_df], axis=1)

print(df_test.shape)
print(result.shape)
print(result['is_duplicate'].count())
print(result['test_id'].count())

In [132]:
result_submission = result[['test_id','is_duplicate']].set_index('test_id')
result_submission
result_submission.to_csv("result_submission.csv")

- The prefixes vect and clf are required to avoid possible ambiguities in the pipeline, but are not necessary for visualizing the results. Because of this, we define a function that will rename the tuned hyperparameters and improve the readability.

In [70]:
def shorten_param(param_name):
    """Remove components' prefixes in param_name."""
    if "__" in param_name:
        return param_name.rsplit("__", 1)[1]
    return param_name


cv_results = pd.DataFrame(random_search.cv_results_)
cv_results = cv_results.rename(shorten_param, axis=1)

- We can use a plotly.express.scatter to visualize the trade-off between scoring time and mean test score (i.e. “CV score”). Passing the cursor over a given point displays the corresponding parameters. Error bars correspond to one standard deviation as computed in the different folds of the cross-validation.

In [None]:
import plotly.express as px

param_names = [shorten_param(name) for name in parameter_grid.keys()]
labels = {
    "mean_score_time": "CV Score time (s)",
    "mean_test_score": "CV score (accuracy)",
}
fig = px.scatter(
    cv_results,
    x="mean_score_time",
    y="mean_test_score",
    error_x="std_score_time",
    error_y="std_test_score",
    hover_data=param_names,
    labels=labels,
)
fig.update_layout(
    title={
        "text": "trade-off between scoring time and mean test score",
        "y": 0.95,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }
)
fig


- We can also use a plotly.express.parallel_coordinates to further visualize the mean test score as a function of the tuned hyperparameters. This helps finding interactions between more than two hyperparameters and provide intuition on their relevance for improving the performance of a pipeline.

- We apply a math.log10 transformation on the alpha axis to spread the active range and improve the readability of the plot. A value 
 on said X axis is to be understood as 10^X


In [None]:
import math

column_results = param_names + ["mean_test_score", "mean_score_time"]

transform_funcs = dict.fromkeys(column_results, lambda x: x)
# Using a logarithmic scale for alpha
transform_funcs["alpha"] = math.log10
# L1 norms are mapped to index 1, and L2 norms to index 2
transform_funcs["norm"] = lambda x: 2 if x == "l2" else 1
# Unigrams are mapped to index 1 and bigrams to index 2
transform_funcs["ngram_range"] = lambda x: x[1]

fig = px.parallel_coordinates(
    cv_results[column_results].apply(transform_funcs),
    color="mean_test_score",
    color_continuous_scale=px.colors.sequential.Viridis_r,
    labels=labels,
)
fig.update_layout(
    title={
        "text": "Parallel coordinates plot of text classifier pipeline",
        "y": 0.99,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }
)
fig


Total running time of the script: Done in 1094.452s

Performing grid search...

      * Hyperparameters to be evaluated:

      {'clf__alpha': array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
            1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]),
      'vect__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
      'vect__min_df': (1, 3, 5, 10),
      'vect__ngram_range': ((1, 1), (1, 2)),
      'vect__norm': ('l1', 'l2')}

      * Fitting 5 folds for each of 40 candidates, totalling 200 fits

      * Best parameters combination found:

      clf__alpha: 1e-06
      vect__max_df: 0.4
      vect__min_df: 1
      vect__ngram_range: (1, 2)
      vect__norm: l1
      
      Accuracy of the best parameters using the inner CV of the random search: 0.793
      Accuracy on test set: 0.969

Manual Hyperparameter tunning

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score
import numpy as np

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

best_accuracy = 0
best_params = {}

parameter_grid = {
    "vect__max_df": [0.2, 0.4, 0.6, 0.8, 1.0],
    "vect__min_df": [1, 3, 5, 10],
    "vect__ngram_range": [(1, 1), (1, 2)],
    "vect__norm": ["l1", "l2"],
    "clf__alpha": np.logspace(-6, 6, 13),
}

for params in parameter_grid:
    vectorizer.set_params(params)
    X_train_tfidf = vectorizer.transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    clf = ComplementNB()
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

print("Best parameters combination found:")
print(best_params)
print("Best accuracy:", best_accuracy)
