In [18]:
import pandas as pd
import re
from nltk.corpus import stopwords
import fuzzywuzzy
import distance

from fuzzywuzzy import fuzz
from distance import lcsubstrings
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup

# Load the cleaned dataset
df = pd.read_csv("quora_cleaned_train.csv")


In [2]:
df.head()  # Display the first few rows of the dataset

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,1,1,66,57,14,12,10,13,0.769231,2,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,4,1,51,88,8,13,4,16,0.25,5,3
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,1,1,73,59,14,10,4,20,0.2,2,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,1,1,50,65,11,9,0,19,0.0,2,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,3,1,76,39,13,7,2,18,0.111111,4,2


In [16]:
# Data preprocessing
SAFE_DIV = 0.0001

STOP_WORDS = stopwords.words("english")

# Extended contractions dictionary
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

def preprocess(x):
    x = str(x).lower()

    # Existing replacements (unchanged)
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
         .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
         .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
         .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
         .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
         .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
         .replace("€", " euro ").replace("'ll", " will")

    # Additional replacements
    x = x.replace('@', ' at ')
    x = x.replace('[math]', '')

    x = x.replace(',000,000,000 ', 'b ')
    x = x.replace(',000,000 ', 'm ')
    x = x.replace(',000 ', 'k ')
    x = re.sub(r'([0-9]+)000000000', r'\1b', x)
    x = re.sub(r'([0-9]+)000000', r'\1m', x)
    x = re.sub(r'([0-9]+)000', r'\1k', x)

    # Expand contractions from dictionary
    x_tokens = []
    for word in x.split():
        if word in contractions:
            word = contractions[word]
        x_tokens.append(word)
    x = ' '.join(x_tokens)

    # Continue with original code flow
    porter = PorterStemmer()
    pattern = re.compile('\W')

    if isinstance(x, str):
        x = re.sub(pattern, ' ', x)

    if isinstance(x, str):
        x = porter.stem(x)
        example1 = BeautifulSoup(x, features="html.parser")
        x = example1.get_text()

    return x


  pattern = re.compile('\W')


In [23]:
# raw questions → preprocessing → all features → final dataset,



def get_basic_features(q1, q2):
    """
    Calculates basic features on preprocessed questions
    """
    q1len = len(q1)
    q2len = len(q2)
    
    q1_n_words = len(q1.split())
    q2_n_words = len(q2.split())
    
    q1_words_set = set(q1.split())
    q2_words_set = set(q2.split())
    word_Common = len(q1_words_set & q2_words_set)
    word_Total = len(q1_words_set) + len(q2_words_set)
    word_share = word_Common / (word_Total + SAFE_DIV)
    
    freq_q1_plus_q2 = q1len + q2len
    freq_q1_minus_q2 = abs(q1len - q2len)
    
    return [q1len, q2len, q1_n_words, q2_n_words, word_Common, word_Total, word_share, freq_q1_plus_q2, freq_q1_minus_q2]

# 2) Token features function (already exists)
def get_token_features(q1, q2):
    token_features = [0.0]*10
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

    common_word_count = len(q1_words.intersection(q2_words))
    common_stop_count = len(q1_stops.intersection(q2_stops))
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))

    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    token_features[8] = abs(len(q1_tokens) - len(q2_tokens))
    token_features[9] = (len(q1_tokens) + len(q2_tokens))/2
    
    return token_features

# 3) Longest common substring ratio
def get_longest_substr_ratio(a, b):
    strs = list(distance.lcsubstrings(a, b))
    if len(strs) == 0:
        return 0
    else:
        return len(strs[0]) / (min(len(a), len(b)) + 1)

# 4) Main feature extraction function
def extract_all_features_quora(df):
    """
    Preprocesses the questions and calculates all basic + advanced features
    Updates the dataframe df in-place
    """
    # Step 1: Preprocess questions
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("Calculating basic features...")
    basic_features = df.apply(lambda x: get_basic_features(x["question1"], x["question2"]), axis=1)
    df["q1len"] = list(map(lambda x: x[0], basic_features))
    df["q2len"] = list(map(lambda x: x[1], basic_features))
    df["q1_n_words"] = list(map(lambda x: x[2], basic_features))
    df["q2_n_words"] = list(map(lambda x: x[3], basic_features))
    df["word_Common"] = list(map(lambda x: x[4], basic_features))
    df["word_Total"] = list(map(lambda x: x[5], basic_features))
    df["word_share"] = list(map(lambda x: x[6], basic_features))
    df["freq_q1+q2"] = list(map(lambda x: x[7], basic_features))
    df["freq_q1-q2"] = list(map(lambda x: x[8], basic_features))
    
    # Step 2: Token features
    print("Calculating token features...")
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    df["cwc_min"]       = list(map(lambda x: x[0], token_features))
    df["cwc_max"]       = list(map(lambda x: x[1], token_features))
    df["csc_min"]       = list(map(lambda x: x[2], token_features))
    df["csc_max"]       = list(map(lambda x: x[3], token_features))
    df["ctc_min"]       = list(map(lambda x: x[4], token_features))
    df["ctc_max"]       = list(map(lambda x: x[5], token_features))
    df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
    df["first_word_eq"] = list(map(lambda x: x[7], token_features))
    df["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
    df["mean_len"]      = list(map(lambda x: x[9], token_features))
    
    # Step 3: Fuzzy features
    print("Calculating fuzzy features...")
    df["token_set_ratio"]       = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    df["token_sort_ratio"]      = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    df["fuzz_ratio"]            = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"]    = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    
    # Step 4: Longest substring ratio
    print("Calculating longest substring ratio...")
    df["longest_substr_ratio"]  = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    
    print("All features successfully added to the dataframe!")
    return df

quora_cleaned_train = extract_all_features_quora(df)
# Apply to your dataset



Calculating basic features...
Calculating token features...
Calculating fuzzy features...
Calculating longest substring ratio...
All features successfully added to the dataframe!


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,1,1,66,57,...,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,4,1,51,88,...,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,1,1,73,59,...,0.285712,0.0,1.0,4.0,12.0,66,66,54,54,0.166667
3,3,7,8,why am i mentally very lonely how can i solve...,find the remainder when 23 24 math is divi...,0,1,1,50,59,...,0.0,0.0,0.0,1.0,11.5,36,37,36,40,0.039216
4,4,9,10,which one dissolve in water quikly sugar salt...,which fish would survive in salt water,0,3,1,76,39,...,0.30769,0.0,1.0,6.0,10.0,67,47,46,56,0.175


In [21]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,1,1,66,57,14,12,10,13,0.769231,2,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,4,1,51,88,8,13,4,16,0.25,5,3
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,1,1,73,59,14,10,4,20,0.2,2,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,1,1,50,65,11,9,0,19,0.0,2,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,3,1,76,39,13,7,2,18,0.111111,4,2


In [24]:
# Save the updated DataFrame to CSV
quora_cleaned_train.to_csv("quora_cleaned_train_final.csv", index=False)

print("Final dataset saved as 'quora_cleaned_train_final.csv'")


Final dataset saved as 'quora_cleaned_train_final.csv'
