## Imports

In [28]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import catboost as cb
from scipy.spatial.distance import cosine, euclidean
from scipy.stats import pearsonr, spearmanr
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [29]:
# Make sure every NLTK resource used by this notebook is installed.
# 'wordnet' is required by WordNetLemmatizer (used in QuoraBaselineAdvanced);
# without it the first lemmatize() call fails with a LookupError.
for resource_path, package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

## Data reading

In [4]:
!unzip /kaggle/input/quora-question-pairs/train.csv.zip

Archive:  /kaggle/input/quora-question-pairs/train.csv.zip
  inflating: train.csv               


In [30]:
# Load train.csv from the working directory and display 10 randomly sampled rows.
# NOTE: sample() is called without random_state, so the displayed rows change on every run.
df = pd.read_csv("train.csv")
df.sample(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
288536,288536,409503,409504,Which one is a better car to buy Fiat Punto pe...,How does a petrol car engine work using LPG in...,0
92228,92228,154436,154437,Are women attracted to female nudity?,Are women attracted to testosterone?,0
160814,160814,250726,250727,What are some good and cheap hotels or hostels...,What are some good and cheap hotels or hostels...,0
394296,394296,527180,527181,How long could a human survive on just peanut ...,Has peanut butter been banned from schools?,0
323251,323251,449197,449198,What are the pros and cons of buying a propert...,I have Rs. 9 lacs of black money (was collecti...,0
29993,29993,55444,55445,Why are fungi considered to be bacteria?,Are fungi considered a bacteria?,1
250882,250882,574,110220,If there will be a war between India and Pakis...,If war happens between India and Pakistan who ...,1
387452,387452,376061,519776,If we can see distant galaxy using Hubble or s...,Could the Hubble see Apollo debris if we aimed...,1
119370,119370,193827,193828,How do I break my knee?,How do I break my elbow?,0
185109,185109,56388,272537,Which is the largest organ in the human body?,What is the largest organ of the body?,1


In [31]:
# Drop every row that has a missing value in any column (rebinding instead of
# mutating in place; the original index labels are kept).
df = df.dropna()

## Baseline

In [17]:
class QuoraBaselineAdvanced:
    """Hand-crafted pairwise features for question-pair duplicate detection.

    For every (question1, question2) pair the class produces length, word
    overlap, character overlap and edit-distance features, plus three
    similarity statistics between the TF-IDF vectors of the two questions.

    Missing questions (NaN/None) are treated as empty strings everywhere.
    """

    # Translation table that deletes every ASCII punctuation character.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        # Historical attribute name: the object is a lemmatizer, not a stemmer.
        self.stemmer = WordNetLemmatizer()
        # Created and fitted lazily by the first prepare_features() call.
        self.tfidf_vectorizer = None
        self.models = {}

    @staticmethod
    def _as_text(value):
        """Return ``value`` as ``str``; missing values (NaN/None) become ''."""
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else str(value)

    def preprocess_text(self, text):
        """Lower-case ``text``, strip ASCII punctuation and collapse whitespace."""
        text = str(text).lower().translate(self._PUNCT_TABLE)
        return re.sub(r'\s+', ' ', text).strip()

    def advanced_preprocess(self, text):
        """Normalise ``text``, drop stop words and lemmatize the remaining words.

        Returns an empty string for missing input.
        """
        if pd.isna(text):
            return ""
        tokens = self.preprocess_text(text).split()
        return ' '.join(
            self.stemmer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words
        )

    def extract_basic_features(self, q1, q2):
        """Length and raw word-overlap features for one question pair.

        Returns a dict with the keys ``q1_len``, ``q2_len``, ``len_diff``,
        ``len_ratio``, ``q1_words``, ``q2_words``, ``words_diff``,
        ``common_words``, ``common_words_ratio`` and ``jaccard`` (in that order).
        Ratios whose denominator would be zero are reported as 0.
        """
        q1_text = self._as_text(q1)
        q2_text = self._as_text(q2)
        features = {}

        features['q1_len'] = len(q1_text)
        features['q2_len'] = len(q2_text)
        features['len_diff'] = abs(features['q1_len'] - features['q2_len'])
        longest = max(features['q1_len'], features['q2_len'])
        # Two empty questions would otherwise divide by zero.
        features['len_ratio'] = (
            min(features['q1_len'], features['q2_len']) / longest if longest else 0.0
        )

        q1_words = q1_text.split()
        q2_words = q2_text.split()
        features['q1_words'] = len(q1_words)
        features['q2_words'] = len(q2_words)
        features['words_diff'] = abs(features['q1_words'] - features['q2_words'])

        common_words = set(q1_words) & set(q2_words)
        union_words = set(q1_words) | set(q2_words)
        features['common_words'] = len(common_words)
        # The +1 in the denominator makes this a smoothed variant of 'jaccard'.
        features['common_words_ratio'] = len(common_words) / (len(union_words) + 1)
        features['jaccard'] = len(common_words) / len(union_words) if union_words else 0

        return features

    def extract_advanced_features(self, q1, q2):
        """Basic features plus preprocessed-word, character and edit-distance ones.

        Adds ``common_words_proc``, ``jaccard_proc``, ``char_jaccard``,
        ``edit_distance`` and ``edit_distance_norm`` to the dict returned by
        ``extract_basic_features``.
        """
        features = self.extract_basic_features(q1, q2)
        q1_text = self._as_text(q1)
        q2_text = self._as_text(q2)

        q1_words_proc = set(self.advanced_preprocess(q1).split())
        q2_words_proc = set(self.advanced_preprocess(q2).split())
        common_words_proc = q1_words_proc & q2_words_proc
        union_words_proc = q1_words_proc | q2_words_proc
        features['common_words_proc'] = len(common_words_proc)
        features['jaccard_proc'] = (
            len(common_words_proc) / len(union_words_proc) if union_words_proc else 0
        )

        q1_chars = set(q1_text.lower())
        q2_chars = set(q2_text.lower())
        union_chars = q1_chars | q2_chars
        features['char_jaccard'] = (
            len(q1_chars & q2_chars) / len(union_chars) if union_chars else 0
        )

        features['edit_distance'] = self.levenshtein_distance(q1_text.lower(), q2_text.lower())
        longest = max(len(q1_text), len(q2_text))
        # Guard against two empty questions (distance 0, length 0).
        features['edit_distance_norm'] = features['edit_distance'] / longest if longest else 0.0

        return features

    def levenshtein_distance(self, s1, s2):
        """Levenshtein edit distance between ``s1`` and ``s2``.

        Row-by-row dynamic programming; only the previous row is kept.
        """
        # Keep the shorter string as s2 so the rows are as short as possible.
        if len(s1) < len(s2):
            s1, s2 = s2, s1

        if len(s2) == 0:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row

        return previous_row[-1]

    @staticmethod
    def _tfidf_pair_stats(q1_tfidf, q2_tfidf):
        """Row-wise cosine similarity, Euclidean distance and Pearson correlation.

        Both arguments are sparse matrices of identical shape (one row per
        pair). Everything is computed with sparse row sums, so no row is ever
        densified. Undefined values (a zero or constant vector on either side)
        are reported as 0.
        """
        def row_sums(matrix):
            return np.asarray(matrix.sum(axis=1), dtype=float).ravel()

        n_terms = q1_tfidf.shape[1]
        dot = row_sums(q1_tfidf.multiply(q2_tfidf))
        sq1 = row_sums(q1_tfidf.multiply(q1_tfidf))
        sq2 = row_sums(q2_tfidf.multiply(q2_tfidf))
        sum1 = row_sums(q1_tfidf)
        sum2 = row_sums(q2_tfidf)

        # Explicit difference keeps identical rows at exactly 0.0.
        diff = q1_tfidf - q2_tfidf
        euclidean_dist = np.sqrt(row_sums(diff.multiply(diff)))

        with np.errstate(divide='ignore', invalid='ignore'):
            cosine_sim = dot / np.sqrt(sq1 * sq2)
            # Pearson r from raw moments; the means are taken over all n_terms
            # dimensions, including the implicit zeros.
            covariance = dot - sum1 * sum2 / n_terms
            variance1 = sq1 - sum1 * sum1 / n_terms
            variance2 = sq2 - sum2 * sum2 / n_terms
            pearson_corr = covariance / np.sqrt(variance1 * variance2)

        cosine_sim = np.clip(np.where(np.isfinite(cosine_sim), cosine_sim, 0.0), -1.0, 1.0)
        pearson_corr = np.clip(np.where(np.isfinite(pearson_corr), pearson_corr, 0.0), -1.0, 1.0)

        return {
            'tfidf_cosine': cosine_sim,
            'tfidf_euclidean': euclidean_dist,
            'tfidf_pearson': pearson_corr,
        }

    def prepare_features(self, df):
        """Build the full feature frame for ``df``.

        ``df`` must have ``question1`` and ``question2`` columns. The result
        has a fresh 0..n-1 index and one row per input row, in input order.

        On the first call (while ``self.tfidf_vectorizer`` is None) the TF-IDF
        vectorizer is created and fitted on the questions of the frame passed
        in; later calls reuse the fitted vectorizer. Call it with the training
        frame first.
        """
        feature_df = pd.DataFrame([
            self.extract_advanced_features(q1, q2)
            for q1, q2 in zip(df['question1'], df['question2'])
        ])

        q1_texts = df['question1'].fillna('')
        q2_texts = df['question2'].fillna('')

        if self.tfidf_vectorizer is None:
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=5000,
                ngram_range=(1, 2),
                stop_words='english',
                lowercase=True
            )
            self.tfidf_vectorizer.fit(list(q1_texts) + list(q2_texts))

        q1_tfidf = self.tfidf_vectorizer.transform(q1_texts)
        q2_tfidf = self.tfidf_vectorizer.transform(q2_texts)

        for column, values in self._tfidf_pair_stats(q1_tfidf, q2_tfidf).items():
            feature_df[column] = values

        return feature_df

In [10]:
def train_baseline_models(X_train, y_train, X_val, y_val):
    """Fit five baseline classifiers and score each one on the validation set.

    Returns a dict keyed by model name. Every entry holds the fitted estimator
    ('model'), the train log loss, the validation log loss / accuracy / ROC AUC /
    F1, and the validation positive-class probabilities ('val_predictions').
    """
    candidates = {
        'logistic': LogisticRegression(random_state=42, max_iter=1000),
        'random_forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        'gradient_boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'xgboost': xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'),
        'catboost': cb.CatBoostClassifier(iterations=100, random_seed=42, verbose=False)
    }

    def fit_and_score(estimator):
        """Fit one estimator and collect its metrics."""
        estimator.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of the positive class.
        proba_train = estimator.predict_proba(X_train)[:, 1]
        proba_val = estimator.predict_proba(X_val)[:, 1]
        labels_val = estimator.predict(X_val)

        return {
            'model': estimator,
            'train_logloss': log_loss(y_train, proba_train),
            'val_logloss': log_loss(y_val, proba_val),
            'val_accuracy': accuracy_score(y_val, labels_val),
            'val_auc': roc_auc_score(y_val, proba_val),
            'val_f1': f1_score(y_val, labels_val),
            'val_predictions': proba_val
        }

    results = {}
    for label, estimator in candidates.items():
        print(f"Training {label}...")
        scores = fit_and_score(estimator)
        results[label] = scores
        print(
            f"{label} - Val LogLoss: {scores['val_logloss']:.4f}, "
            f"Val AUC: {scores['val_auc']:.4f}, "
            f"Val Accuracy: {scores['val_accuracy']:.4f}"
        )

    return results

In [33]:
def create_ensemble(predictions_dict, y_val, method='average'):
    """Blend per-model validation probabilities and score the blend.

    Parameters
    ----------
    predictions_dict : dict
        Maps model name -> 1-D array of positive-class probabilities, all
        aligned with ``y_val``.
    y_val : array-like
        True binary labels (pandas Series, list or NumPy array).
    method : {'average', 'weighted', 'stacking'}
        'average'  - plain mean of the model probabilities.
        'weighted' - mean weighted by each model's ROC AUC on ``y_val``; the
                     weights are tuned on the same labels the blend is scored
                     on, so its metrics are optimistic.
        'stacking' - logistic regression on the model probabilities, reported
                     as 3-fold out-of-fold predictions.

    Returns
    -------
    (ensemble_pred, metrics)
        ``ensemble_pred`` is the blended probability array; ``metrics`` is a
        dict with 'logloss', 'auc', 'accuracy' and 'f1' (labels thresholded
        at 0.5).

    Raises
    ------
    ValueError
        If ``method`` is not supported or ``predictions_dict`` is empty.
    """
    supported_methods = ('average', 'weighted', 'stacking')
    if method not in supported_methods:
        raise ValueError(
            f"Unknown ensemble method {method!r}; expected one of {supported_methods}"
        )
    if not predictions_dict:
        raise ValueError("predictions_dict must contain at least one model's predictions")

    print(f"Создание ансамбля методом {method}...")

    # Positional indexing below must work for Series and plain arrays alike.
    y_true = np.asarray(y_val)
    preds = list(predictions_dict.values())

    if method == 'average':
        ensemble_pred = np.mean(preds, axis=0)

    elif method == 'weighted':
        weights = np.array([roc_auc_score(y_true, pred) for pred in preds])
        weights = weights / weights.sum()
        ensemble_pred = np.average(preds, axis=0, weights=weights)

    else:  # 'stacking'
        stacking_features = np.column_stack(preds)
        stacking_model = LogisticRegression(random_state=42)

        kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        ensemble_pred = np.zeros(len(y_true))

        # Every row is predicted by a meta-model that never saw its label.
        for train_idx, val_idx in kf.split(stacking_features, y_true):
            stacking_model.fit(stacking_features[train_idx], y_true[train_idx])
            ensemble_pred[val_idx] = stacking_model.predict_proba(stacking_features[val_idx])[:, 1]

    ensemble_logloss = log_loss(y_true, ensemble_pred)
    ensemble_auc = roc_auc_score(y_true, ensemble_pred)
    ensemble_accuracy = accuracy_score(y_true, ensemble_pred > 0.5)
    ensemble_f1 = f1_score(y_true, ensemble_pred > 0.5)

    print(f"Ensemble - LogLoss: {ensemble_logloss:.4f}, AUC: {ensemble_auc:.4f}, Accuracy: {ensemble_accuracy:.4f}")

    return ensemble_pred, {
        'logloss': ensemble_logloss,
        'auc': ensemble_auc,
        'accuracy': ensemble_accuracy,
        'f1': ensemble_f1
    }

In [34]:
# Hold out 15% of the pairs for validation, stratified on the label so both
# parts keep the same duplicate rate.
y = df['is_duplicate']
X = df[['question1', 'question2']]
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

In [35]:
# Feature extractor; its TF-IDF vectorizer is fitted by the first prepare_features() call.
baseline = QuoraBaselineAdvanced()

In [36]:
print("features preparation")
# The training frame goes first: that call fits the TF-IDF vectorizer,
# the validation call only reuses it.
train_features = baseline.prepare_features(X_train)
val_features = baseline.prepare_features(X_val)

print("Training models")
results = train_baseline_models(train_features, y_train, val_features, y_val)

# Validation probabilities per model, used as inputs for the ensembles.
predictions = {
    model_name: scores['val_predictions'] for model_name, scores in results.items()
}

# The best single model is the one with the lowest validation log loss.
best_model_name = min(results, key=lambda model_name: results[model_name]['val_logloss'])
print("\n=== RESULTS ===")
print(f"Best model: {best_model_name}")

data = dict(
    models=results,
    best_model=best_model_name,
    train_features=train_features,
    val_features=val_features,
)

features preparation
Training models
Training logistic...
logistic - Val LogLoss: 0.5371, Val AUC: 0.7693, Val Accuracy: 0.6861
Training random_forest...
random_forest - Val LogLoss: 0.4582, Val AUC: 0.8408, Val Accuracy: 0.7494
Training gradient_boosting...
gradient_boosting - Val LogLoss: 0.4701, Val AUC: 0.8257, Val Accuracy: 0.7379
Training xgboost...
xgboost - Val LogLoss: 0.4505, Val AUC: 0.8405, Val Accuracy: 0.7519
Training catboost...
catboost - Val LogLoss: 0.4538, Val AUC: 0.8380, Val Accuracy: 0.7493

=== RESULTS ===
Best model: xgboost


In [37]:
# Evaluate every blending strategy on the validation predictions.
ensemble_results = {}
for method in ('average', 'weighted', 'stacking'):
    blended, scores = create_ensemble(predictions, y_val, method)
    ensemble_results[method] = {'predictions': blended, 'metrics': scores}

Создание ансамбля методом average...
Ensemble - LogLoss: 0.4582, AUC: 0.8407, Accuracy: 0.7518
Создание ансамбля методом weighted...
Ensemble - LogLoss: 0.4575, AUC: 0.8410, Accuracy: 0.7519
Создание ансамбля методом stacking...
Ensemble - LogLoss: 0.4560, AUC: 0.8458, Accuracy: 0.7559


In [38]:
# Pick the blending strategy with the lowest validation log loss.
best_ensemble_name = min(
    ensemble_results,
    key=lambda method: ensemble_results[method]['metrics']['logloss'],
)
best_ensemble_name

'stacking'

## Submission

In [15]:
# Load the Kaggle test set; the bare `test.shape` displays (rows, columns).
test = pd.read_csv("/kaggle/input/quora-question-pairs/test.csv")
test.shape

(2345796, 3)

In [20]:
# test_preprocessed = baseline.prepare_features(test)

In [39]:
# Test-set features are expensive to compute, so they are cached on disk.
# Reuse the cache when it exists; otherwise compute the features once and
# store them, so the notebook also runs from a fresh working directory.
try:
    test_preprocessed = pd.read_csv("preprocessed.csv")
except FileNotFoundError:
    test_preprocessed = baseline.prepare_features(test)
    test_preprocessed.to_csv("preprocessed.csv", index=False)

In [27]:
# test_preprocessed.to_csv("preprocessed.csv", index=False)

In [42]:
# Inspect the per-model validation probabilities (dict: model name -> array).
predictions

{'logistic': array([0.27471076, 0.66958216, 0.6613475 , ..., 0.65240735, 0.68572857,
        0.40649905]),
 'random_forest': array([0.52, 0.64, 0.78, ..., 0.71, 0.67, 0.57]),
 'gradient_boosting': array([0.30958459, 0.45555244, 0.71000844, ..., 0.66781109, 0.63860876,
        0.48394027]),
 'xgboost': array([0.3557791, 0.6136866, 0.6896051, ..., 0.8363831, 0.6733017,
        0.5013521], dtype=float32),
 'catboost': array([0.21536092, 0.57247852, 0.74645389, ..., 0.84655037, 0.60469532,
        0.52922467])}

In [43]:
# Meta-features: one column per base model, in the insertion order of
# data["models"] (the same order models_ensemble is built in). Reading them
# from `data` keeps this cell re-runnable even after later cells rebind
# `predictions`.
stacking_features = np.column_stack(
    [details["val_predictions"] for details in data["models"].values()]
)

# Out-of-fold predictions: an honest estimate of the meta-model's quality.
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
stacking_pred = np.zeros(len(y_val))

for train_idx, val_idx in kf.split(stacking_features, y_val):
    fold_model = LogisticRegression(random_state=42)
    fold_model.fit(stacking_features[train_idx], y_val.iloc[train_idx])
    stacking_pred[val_idx] = fold_model.predict_proba(stacking_features[val_idx])[:, 1]

# The model used for test-time inference is fitted on ALL validation rows;
# previously it was whatever the last CV fold left behind (2/3 of the data).
stacking_model = LogisticRegression(random_state=42)
stacking_model.fit(stacking_features, y_val)

In [44]:
# Display the meta-model's repr.
stacking_model

In [45]:
# Preview the first rows of the test feature frame.
test_preprocessed.head()

Unnamed: 0,q1_len,q2_len,len_diff,len_ratio,q1_words,q2_words,words_diff,common_words,common_words_ratio,jaccard,common_words_proc,jaccard_proc,char_jaccard,edit_distance,edit_distance_norm,tfidf_cosine,tfidf_euclidean,tfidf_pearson
0,57,68,11,0.838235,11,14,3,2,0.086957,0.090909,3,0.272727,0.818182,49,0.720588,0.358764,1.132463,0.358077
1,66,43,23,0.651515,14,7,7,4,0.222222,0.235294,4,0.5,0.782609,45,0.681818,0.635087,0.854298,0.63474
2,60,29,31,0.483333,14,6,8,4,0.266667,0.285714,3,0.5,0.842105,34,0.566667,0.799067,0.633929,0.798939
3,27,17,10,0.62963,4,3,1,0,0.0,0.0,1,0.333333,0.631579,15,0.555556,0.0,1.414214,-0.0002
4,32,30,2,0.9375,4,6,2,3,0.375,0.428571,2,0.666667,0.777778,12,0.375,1.0,0.0,1.0


In [63]:
# Fitted base estimators, in the insertion order of data["models"].
models_ensemble = [details["model"] for details in data["models"].values()]

In [61]:
# Smoke test of the inference path on the first test row only.
# Uses its own variable names so the `predictions` dict of validation
# probabilities is not overwritten, and derives the number of columns from
# models_ensemble instead of hard-coding 5.
sample_row = test_preprocessed.head(1)
sample_meta_features = np.column_stack(
    [estimator.predict_proba(sample_row)[:, 1] for estimator in models_ensemble]
)
print(sample_meta_features.shape)
print(sample_meta_features)

result = stacking_model.predict(sample_meta_features)
print(result)

(1, 5)
[[0.19741737 0.02       0.08656675 0.04407485 0.06790519]]
[0]


In [65]:
# Positive-class probability of every base model for every test row.
estimator_predictions = [
    estimator.predict_proba(test_preprocessed)[:, 1] for estimator in models_ensemble
]

# Stack as columns -> shape (n_rows, n_models), one column per base model.
# np.array(...).reshape(-1, 5) must NOT be used here: the array has shape
# (n_models, n_rows), and reshaping it row-major puts five consecutive
# predictions of the SAME model into each row instead of one per model.
test_meta_features = np.column_stack(estimator_predictions)

# The competition is scored by log loss, so submit probabilities rather than
# hard 0/1 labels.
results = stacking_model.predict_proba(test_meta_features)[:, 1]

In [66]:
# Sanity check: there should be one prediction per test row.
len(results)

2345796

In [67]:
# Write the submission file: sequential ids 0..n-1 next to the predictions.
submission = pd.DataFrame({
    "test_id": np.arange(len(test_preprocessed)),
    "is_duplicate": results.tolist(),
})
submission.to_csv("submit_baseline_advanced_ensembles.csv", index=False)