## Imports

In [7]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import catboost as cb
from scipy.spatial.distance import cosine, euclidean
from scipy.stats import pearsonr, spearmanr
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

## Data reading

In [4]:
!unzip /kaggle/input/quora-question-pairs/train.csv.zip

Archive:  /kaggle/input/quora-question-pairs/train.csv.zip
  inflating: train.csv               


In [5]:
df = pd.read_csv("train.csv")
df.sample(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
12494,12494,24069,24070,Is Islamic culture friendly with Science Ficti...,"Can you recommend any good discussion, intervi...",0
207219,207219,310849,310850,How difficult is AIATs for medical?,How difficult is aiats medical?,1
362372,362372,492277,261075,"As today is 6th of October, 39 years ago at Yo...",Who won the Yom Kippur War?,1
87668,87668,147608,147609,Is there any calendar app which integrates wit...,Mp ananth Kumar hegde has assaulted a doctor a...,0
115716,115716,4047,188650,Has there been scientific evidence that ghosts...,Would there ever be scientific evidence on gho...,1
17680,17680,33557,33558,How competitive is the hiring process at Chemi...,How competitive is the hiring process at First...,0
218414,218414,11233,63215,If universe expansion create more gravitationa...,"If energy can't be created or destroyed, how c...",1
156144,156144,244439,244440,Is there any historical movie like The Imitati...,Why did Alan Turing abandon Joan in The Imitat...,0
255990,255990,370998,334277,I am using the HTC Desire 816 in dual SIM mode...,How do I download the Lollipop version for the...,0
165644,165644,66488,78416,What are the safety precautions on handling sh...,What are the safety precautions on handling sh...,1


In [6]:
df.dropna(inplace=True)

## Baseline

In [17]:
class QuoraBaselineAdvanced:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = WordNetLemmatizer()
        self.tfidf_vectorizer = None
        self.models = {}
        
    def preprocess_text(self, text):
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text

    def advanced_preprocess(self, text):
        if pd.isna(text):
            return ""
        text = self.preprocess_text(text)
        
        words = text.split()
        words = [word for word in words if word not in self.stop_words]
        words = [self.stemmer.lemmatize(word) for word in words]
        
        return ' '.join(words)

    def extract_basic_features(self, q1, q2):
        features = {}

        features['q1_len'] = len(str(q1))
        features['q2_len'] = len(str(q2))
        features['len_diff'] = abs(features['q1_len'] - features['q2_len'])
        features['len_ratio'] = min(features['q1_len'], features['q2_len']) / max(features['q1_len'], features['q2_len'])

        q1_words = str(q1).split()
        q2_words = str(q2).split()
        features['q1_words'] = len(q1_words)
        features['q2_words'] = len(q2_words)
        features['words_diff'] = abs(features['q1_words'] - features['q2_words'])
        
        common_words = set(q1_words) & set(q2_words)
        features['common_words'] = len(common_words)
        features['common_words_ratio'] = len(common_words) / (len(set(q1_words) | set(q2_words)) + 1)
        
        union_words = set(q1_words) | set(q2_words)
        features['jaccard'] = len(common_words) / len(union_words) if union_words else 0
        
        return features

    def extract_advanced_features(self, q1, q2):
        features = self.extract_basic_features(q1, q2)
        
        q1_proc = self.advanced_preprocess(q1)
        q2_proc = self.advanced_preprocess(q2)
        
        q1_words_proc = q1_proc.split()
        q2_words_proc = q2_proc.split()
        
        common_words_proc = set(q1_words_proc) & set(q2_words_proc)
        features['common_words_proc'] = len(common_words_proc)
        features['jaccard_proc'] = len(common_words_proc) / len(set(q1_words_proc) | set(q2_words_proc)) if (q1_words_proc or q2_words_proc) else 0
        
        q1_chars = set(str(q1).lower())
        q2_chars = set(str(q2).lower())
        common_chars = q1_chars & q2_chars
        features['char_jaccard'] = len(common_chars) / len(q1_chars | q2_chars) if (q1_chars or q2_chars) else 0

        features['edit_distance'] = self.levenshtein_distance(str(q1).lower(), str(q2).lower())
        features['edit_distance_norm'] = features['edit_distance'] / max(len(str(q1)), len(str(q2)))
        
        return features

    def levenshtein_distance(self, s1, s2):
        if len(s1) < len(s2):
            return self.levenshtein_distance(s2, s1)
        
        if len(s2) == 0:
            return len(s1)
        
        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        
        return previous_row[-1]

    def prepare_features(self, df):
        feature_list = []
        for idx, row in df.iterrows():
            features = self.extract_advanced_features(row['question1'], row['question2'])
            feature_list.append(features)
        
        feature_df = pd.DataFrame(feature_list)
        questions = list(df['question1'].fillna('')) + list(df['question2'].fillna(''))
        
        if self.tfidf_vectorizer is None:
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=5000,
                ngram_range=(1, 2),
                stop_words='english',
                lowercase=True
            )
            self.tfidf_vectorizer.fit(questions)
        
        q1_tfidf = self.tfidf_vectorizer.transform(df['question1'].fillna(''))
        q2_tfidf = self.tfidf_vectorizer.transform(df['question2'].fillna(''))
        
        tfidf_cosine = []
        for i in range(q1_tfidf.shape[0]):
            cos_sim = 1 - cosine(q1_tfidf[i].toarray().flatten(), q2_tfidf[i].toarray().flatten())
            tfidf_cosine.append(cos_sim if not np.isnan(cos_sim) else 0)
        
        feature_df['tfidf_cosine'] = tfidf_cosine
        
        tfidf_stats = []
        for i in range(q1_tfidf.shape[0]):
            v1 = q1_tfidf[i].toarray().flatten()
            v2 = q2_tfidf[i].toarray().flatten()
            
            eucl_dist = euclidean(v1, v2)
            
            try:
                pearson_corr = pearsonr(v1, v2)[0]
                if np.isnan(pearson_corr):
                    pearson_corr = 0
            except:
                pearson_corr = 0
            
            tfidf_stats.append({
                'tfidf_euclidean': eucl_dist,
                'tfidf_pearson': pearson_corr
            })
        
        tfidf_stats_df = pd.DataFrame(tfidf_stats)
        feature_df = pd.concat([feature_df, tfidf_stats_df], axis=1)
        
        return feature_df

In [10]:
def train_baseline_models(X_train, y_train, X_val, y_val):
    models_config = {
        'logistic': LogisticRegression(random_state=42, max_iter=1000),
        'random_forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        'gradient_boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'xgboost': xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'),
        'catboost': cb.CatBoostClassifier(iterations=100, random_seed=42, verbose=False)
    }
    
    results = {}
    
    for name, model in models_config.items():
        print(f"Training {name}...")
        
        model.fit(X_train, y_train)
        
        train_pred = model.predict_proba(X_train)[:, 1]
        val_pred = model.predict_proba(X_val)[:, 1]
        val_pred_binary = model.predict(X_val)
        
        train_logloss = log_loss(y_train, train_pred)
        val_logloss = log_loss(y_val, val_pred)
        val_accuracy = accuracy_score(y_val, val_pred_binary)
        val_auc = roc_auc_score(y_val, val_pred)
        val_f1 = f1_score(y_val, val_pred_binary)
        
        results[name] = {
            'model': model,
            'train_logloss': train_logloss,
            'val_logloss': val_logloss,
            'val_accuracy': val_accuracy,
            'val_auc': val_auc,
            'val_f1': val_f1,
            'val_predictions': val_pred
        }
        
        print(f"{name} - Val LogLoss: {val_logloss:.4f}, Val AUC: {val_auc:.4f}, Val Accuracy: {val_accuracy:.4f}")
    
    return results

In [11]:
X = df[['question1', 'question2']]
y = df['is_duplicate']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

In [19]:
baseline = QuoraBaselineAdvanced()

In [12]:
print("features preparation")
train_features = baseline.prepare_features(X_train)
val_features = baseline.prepare_features(X_val)

print("Training models")
results = train_baseline_models(train_features, y_train, val_features, y_val)
predictions = {name: result['val_predictions'] for name, result in results.items()}

best_model_name = min(results.keys(), key=lambda x: results[x]['val_logloss'])
print(f"\n=== RESULTS ===")
print(f"Best model: {best_model_name}")

data = {
    'models': results,
    'best_model': best_model_name,
    'train_features': train_features,
    'val_features': val_features
}

features preparation
Training models
Training logistic...
logistic - Val LogLoss: 0.5371, Val AUC: 0.7693, Val Accuracy: 0.6861
Training random_forest...
random_forest - Val LogLoss: 0.4582, Val AUC: 0.8408, Val Accuracy: 0.7494
Training gradient_boosting...
gradient_boosting - Val LogLoss: 0.4701, Val AUC: 0.8257, Val Accuracy: 0.7379
Training xgboost...
xgboost - Val LogLoss: 0.4505, Val AUC: 0.8405, Val Accuracy: 0.7519
Training catboost...
catboost - Val LogLoss: 0.4538, Val AUC: 0.8380, Val Accuracy: 0.7493

=== RESULTS ===
Best model: xgboost


In [14]:
data

{'models': {'logistic': {'model': LogisticRegression(max_iter=1000, random_state=42),
   'train_logloss': 0.5353225952876173,
   'val_logloss': 0.5371187001266927,
   'val_accuracy': 0.6860695204801794,
   'val_auc': 0.7693261442585191,
   'val_f1': 0.5464335064563778,
   'val_predictions': array([0.27471076, 0.66958216, 0.6613475 , ..., 0.65240735, 0.68572857,
          0.40649905])},
  'random_forest': {'model': RandomForestClassifier(n_jobs=-1, random_state=42),
   'train_logloss': 0.1256312873012709,
   'val_logloss': 0.45816879636415403,
   'val_accuracy': 0.7493569025789856,
   'val_auc': 0.8407768267549366,
   'val_f1': 0.6701961465023433,
   'val_predictions': array([0.52, 0.64, 0.78, ..., 0.71, 0.67, 0.57])},
  'gradient_boosting': {'model': GradientBoostingClassifier(random_state=42),
   'train_logloss': 0.46877256298499986,
   'val_logloss': 0.47006089460433753,
   'val_accuracy': 0.7379130664204209,
   'val_auc': 0.8257180115946738,
   'val_f1': 0.674169741697417,
   'val_p

## Submission

In [15]:
test = pd.read_csv("/kaggle/input/quora-question-pairs/test.csv")
test.shape

(2345796, 3)

In [20]:
test_preprocessed = baseline.prepare_features(test)

In [21]:
best = data["models"][best_model_name]["model"]
best

In [22]:
test_preprocessed.head()

Unnamed: 0,q1_len,q2_len,len_diff,len_ratio,q1_words,q2_words,words_diff,common_words,common_words_ratio,jaccard,common_words_proc,jaccard_proc,char_jaccard,edit_distance,edit_distance_norm,tfidf_cosine,tfidf_euclidean,tfidf_pearson
0,57,68,11,0.838235,11,14,3,2,0.086957,0.090909,3,0.272727,0.818182,49,0.720588,0.358764,1.132463,0.358077
1,66,43,23,0.651515,14,7,7,4,0.222222,0.235294,4,0.5,0.782609,45,0.681818,0.635087,0.854298,0.63474
2,60,29,31,0.483333,14,6,8,4,0.266667,0.285714,3,0.5,0.842105,34,0.566667,0.799067,0.633929,0.798939
3,27,17,10,0.62963,4,3,1,0,0.0,0.0,1,0.333333,0.631579,15,0.555556,0.0,1.414214,-0.0002
4,32,30,2,0.9375,4,6,2,3,0.375,0.428571,2,0.666667,0.777778,12,0.375,1.0,0.0,1.0


In [23]:
# for indx, x in test_preprocessed.iterrows():
#     print(x.to_frame().T)
#     result = best.predict(x.to_frame().T)
#     print(result)
#     break


test_results = best.predict(test_preprocessed)

In [24]:
len(test_results)

2345796

In [25]:
test_results

array([0, 1, 1, ..., 0, 0, 0])

In [26]:
pd.DataFrame({
    "test_id": list(range(len(test_preprocessed))),
    "is_duplicate": test_results.tolist()
}).to_csv("submit_baseline_advanced.csv", index=False)