## Imports

In [85]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import catboost as cb
from scipy.spatial.distance import cosine, euclidean
from scipy.stats import pearsonr, spearmanr
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

In [86]:
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

## Data reading

In [71]:
!unzip /kaggle/input/quora-question-pairs/train.csv.zip

Archive:  /kaggle/input/quora-question-pairs/train.csv.zip
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [72]:
!unzip /kaggle/input/quora-question-pairs/test.csv.zip

Archive:  /kaggle/input/quora-question-pairs/test.csv.zip
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [87]:
df = pd.read_csv("train.csv")
df.sample(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
226652,226652,140493,115923,How do I stay motivated daily?,How do I stay motivated in life?,1
3313,3313,6567,6568,Who are these Rothschilds? Do they really own ...,Who controls the central banks of the world?,0
181336,181336,277811,277812,What happens to the money collected everyday i...,Is it safe to deposit money in Axis Bank?,0
176242,176242,271206,216882,Which investment banks still have prop trading...,What happened to Goldman's prop trading desk a...,0
19490,19490,36825,36826,Calculus: How do we solve these definite integ...,Can anybody solve this calculus problem?,0
207169,207169,310792,310793,What is the best drama (genre) movie?,Which are the best drama movies of all time?,1
59185,59185,103713,103714,"How does ""Andy English Bot"" Facebook page send...",What does it mean on a persons profile picture...,0
366356,366356,191245,496545,Why are many economists against demonetization?,Why are many leading economists sceptical abou...,1
249476,249476,46041,363124,"On a train in India, what does SL, 1A, 2A, 3A,...",Why do indian central railway express train n...,0
96845,96845,161244,161245,What does it mean when a guy calls you perfect...,What does it mean when a guy calls you this?,0


In [88]:
df.dropna(inplace=True)

## Baseline

In [89]:
class QuoraBaseline:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.tfidf_vectorizer = None
        self.models = {}
        
    def preprocess_text(self, text):
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text

    def extract_basic_features(self, q1, q2):
        features = {}

        features['q1_len'] = len(str(q1))
        features['q2_len'] = len(str(q2))
        features['len_diff'] = abs(features['q1_len'] - features['q2_len'])
        features['len_ratio'] = min(features['q1_len'], features['q2_len']) / max(features['q1_len'], features['q2_len'])

        q1_words = str(q1).split()
        q2_words = str(q2).split()
        features['q1_words'] = len(q1_words)
        features['q2_words'] = len(q2_words)
        features['words_diff'] = abs(features['q1_words'] - features['q2_words'])
        
        common_words = set(q1_words) & set(q2_words)
        features['common_words'] = len(common_words)
        features['common_words_ratio'] = len(common_words) / (len(set(q1_words) | set(q2_words)) + 1)
        
        union_words = set(q1_words) | set(q2_words)
        features['jaccard'] = len(common_words) / len(union_words) if union_words else 0
        
        return features

    def levenshtein_distance(self, s1, s2):
        if len(s1) < len(s2):
            return self.levenshtein_distance(s2, s1)
        
        if len(s2) == 0:
            return len(s1)
        
        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        
        return previous_row[-1]

    def prepare_features(self, df):
        feature_list = []
        for idx, row in df.iterrows():
            features = self.extract_basic_features(row['question1'], row['question2'])
            feature_list.append(features)
        
        feature_df = pd.DataFrame(feature_list)
        questions = list(df['question1'].fillna('')) + list(df['question2'].fillna(''))
        
        if self.tfidf_vectorizer is None:
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=5000,
                ngram_range=(1, 2),
                stop_words='english',
                lowercase=True
            )
            self.tfidf_vectorizer.fit(questions)
        
        q1_tfidf = self.tfidf_vectorizer.transform(df['question1'].fillna(''))
        q2_tfidf = self.tfidf_vectorizer.transform(df['question2'].fillna(''))
        
        tfidf_cosine = []
        for i in range(q1_tfidf.shape[0]):
            cos_sim = 1 - cosine(q1_tfidf[i].toarray().flatten(), q2_tfidf[i].toarray().flatten())
            tfidf_cosine.append(cos_sim if not np.isnan(cos_sim) else 0)
        
        feature_df['tfidf_cosine'] = tfidf_cosine
        
        tfidf_stats = []
        for i in range(q1_tfidf.shape[0]):
            v1 = q1_tfidf[i].toarray().flatten()
            v2 = q2_tfidf[i].toarray().flatten()
            
            eucl_dist = euclidean(v1, v2)
            
            try:
                pearson_corr = pearsonr(v1, v2)[0]
                if np.isnan(pearson_corr):
                    pearson_corr = 0
            except:
                pearson_corr = 0
            
            tfidf_stats.append({
                'tfidf_euclidean': eucl_dist,
                'tfidf_pearson': pearson_corr
            })
        
        tfidf_stats_df = pd.DataFrame(tfidf_stats)
        feature_df = pd.concat([feature_df, tfidf_stats_df], axis=1)
        
        return feature_df

In [90]:
def train_baseline_models(X_train, y_train, X_val, y_val):
    models_config = {
        'logistic': LogisticRegression(random_state=42, max_iter=1000),
        'random_forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        'gradient_boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'xgboost': xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'),
        'catboost': cb.CatBoostClassifier(iterations=100, random_seed=42, verbose=False)
    }
    
    results = {}
    
    for name, model in models_config.items():
        print(f"Training {name}...")
        
        model.fit(X_train, y_train)
        
        train_pred = model.predict_proba(X_train)[:, 1]
        val_pred = model.predict_proba(X_val)[:, 1]
        val_pred_binary = model.predict(X_val)
        
        train_logloss = log_loss(y_train, train_pred)
        val_logloss = log_loss(y_val, val_pred)
        val_accuracy = accuracy_score(y_val, val_pred_binary)
        val_auc = roc_auc_score(y_val, val_pred)
        val_f1 = f1_score(y_val, val_pred_binary)
        
        results[name] = {
            'model': model,
            'train_logloss': train_logloss,
            'val_logloss': val_logloss,
            'val_accuracy': val_accuracy,
            'val_auc': val_auc,
            'val_f1': val_f1,
            'val_predictions': val_pred
        }
        
        print(f"{name} - Val LogLoss: {val_logloss:.4f}, Val AUC: {val_auc:.4f}, Val Accuracy: {val_accuracy:.4f}")
    
    return results

In [93]:
X = df[['question1', 'question2']]
y = df['is_duplicate']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

In [94]:
baseline = QuoraBaseline()

print("features preparation")
train_features = baseline.prepare_features(X_train)
val_features = baseline.prepare_features(X_val)

print("Training models")
results = train_baseline_models(train_features, y_train, val_features, y_val)
predictions = {name: result['val_predictions'] for name, result in results.items()}

best_model_name = min(results.keys(), key=lambda x: results[x]['val_logloss'])
print(f"\n=== RESULTS ===")
print(f"Best model: {best_model_name}")

data = {
    'models': results,
    'best_model': best_model_name,
    'train_features': train_features,
    'val_features': val_features
}

features preparation
Training models
Training logistic...
logistic - Val LogLoss: 0.5658, Val AUC: 0.7287, Val Accuracy: 0.6584
Training random_forest...
random_forest - Val LogLoss: 0.5539, Val AUC: 0.7935, Val Accuracy: 0.7111
Training gradient_boosting...
gradient_boosting - Val LogLoss: 0.5047, Val AUC: 0.7903, Val Accuracy: 0.7120
Training xgboost...
xgboost - Val LogLoss: 0.4890, Val AUC: 0.8057, Val Accuracy: 0.7224
Training catboost...
catboost - Val LogLoss: 0.4921, Val AUC: 0.8029, Val Accuracy: 0.7203

=== RESULTS ===
Best model: xgboost


In [95]:
data

{'models': {'logistic': {'model': LogisticRegression(max_iter=1000, random_state=42),
   'train_logloss': 0.5647437851947061,
   'val_logloss': 0.565820485109431,
   'val_accuracy': 0.6583503726667106,
   'val_auc': 0.7287010191136077,
   'val_f1': 0.5139465596922138,
   'val_predictions': array([0.26831601, 0.59542217, 0.46552821, ..., 0.52229541, 0.59128695,
          0.49462337])},
  'random_forest': {'model': RandomForestClassifier(n_jobs=-1, random_state=42),
   'train_logloss': 0.14731619797803006,
   'val_logloss': 0.5539269381581893,
   'val_accuracy': 0.7110678715124332,
   'val_auc': 0.7935113144309633,
   'val_f1': 0.6120017714791851,
   'val_predictions': array([0.62, 0.73, 0.4 , ..., 0.69, 0.68, 0.43])},
  'gradient_boosting': {'model': GradientBoostingClassifier(random_state=42),
   'train_logloss': 0.5039169209221429,
   'val_logloss': 0.5046901637037331,
   'val_accuracy': 0.7119912934503001,
   'val_auc': 0.790325619149012,
   'val_f1': 0.6503583296632902,
   'val_pred

## Submission

In [100]:
test = pd.read_csv("/kaggle/input/quora-question-pairs/test.csv")
test.shape

(2345796, 3)

In [101]:
test_preprocessed = baseline.prepare_features(test)

In [109]:
best = data["models"][best_model_name]["model"]
best

In [117]:
test_preprocessed.head()

Unnamed: 0,q1_len,q2_len,len_diff,len_ratio,q1_words,q2_words,words_diff,common_words,common_words_ratio,jaccard,tfidf_cosine,tfidf_euclidean,tfidf_pearson
0,57,68,11,0.838235,11,14,3,2,0.086957,0.090909,0.354671,1.136071,0.353985
1,66,43,23,0.651515,14,7,7,4,0.222222,0.235294,0.640689,0.847716,0.640347
2,60,29,31,0.483333,14,6,8,4,0.266667,0.285714,0.805724,0.62334,0.805601
3,27,17,10,0.62963,4,3,1,0,0.0,0.0,0.0,1.414214,-0.0002
4,32,30,2,0.9375,4,6,2,3,0.375,0.428571,1.0,0.0,1.0


In [118]:
# for indx, x in test_preprocessed.iterrows():
#     print(x.to_frame().T)
#     result = best.predict(x.to_frame().T)
#     print(result)
#     break


test_results = best.predict(test_preprocessed)

In [119]:
len(test_results)

2345796

In [120]:
test_results

array([0, 1, 1, ..., 0, 0, 0])

In [122]:
pd.DataFrame({
    "test_id": list(range(len(test_preprocessed))),
    "is_duplicate": test_results.tolist()
}).to_csv("submit_baseline.csv", index=False)