In [1]:
import os

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the labelled training set: train.csv lists the article ids and which of
# the two texts is real; each article folder holds file_1.txt and file_2.txt.
train_dir = 'data/train'
train_csv = 'data/train.csv'

labels = pd.read_csv(train_csv)
examples = []
for article_num, real_id in zip(labels['id'], labels['real_text_id']):
    # Folder names are the id zero-padded to four digits, e.g. article_0007.
    article_path = os.path.join(train_dir, f"article_{article_num:04d}")
    pair = {}
    for key, filename in (('text_1', 'file_1.txt'), ('text_2', 'file_2.txt')):
        with open(os.path.join(article_path, filename), 'r', encoding='utf-8') as fh:
            pair[key] = fh.read()
    examples.append({
        'id': article_num,
        'text_1': pair['text_1'],
        'text_2': pair['text_2'],
        'real_text_id': real_id,
    })

train_df = pd.DataFrame(examples)
print("Train loaded", train_df.shape)

Train loaded (95, 4)


In [None]:
# Load the unlabelled test set: every "article_*" entry under data/test is
# read as a folder containing file_1.txt and file_2.txt.
test_dir = 'data/test'
test_ids = sorted(name for name in os.listdir(test_dir) if name.startswith("article_"))

test_examples = []
for article_id in test_ids:
    article_path = os.path.join(test_dir, article_id)
    texts = []
    for filename in ('file_1.txt', 'file_2.txt'):
        with open(os.path.join(article_path, filename), 'r', encoding='utf-8') as fh:
            texts.append(fh.read())
    test_examples.append({
        # The numeric id is the part after the first underscore in the folder name.
        'id': int(article_id.split('_')[1]),
        'text_1': texts[0],
        'text_2': texts[1],
    })

test_df = pd.DataFrame(test_examples)
print("Test loaded", test_df.shape)

Test loaded (1068, 3)


In [None]:
# Fit one shared TF-IDF vocabulary on every available text (both candidates of
# every train and test article), in the order: train text_1, train text_2,
# test text_1, test text_2.
all_texts = [
    text
    for frame in (train_df, test_df)
    for column in ('text_1', 'text_2')
    for text in frame[column]
]
tfidf = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 3),
    stop_words='english',
).fit(all_texts)

def text_pair_feats(df, vectorizer=None):
    """Build dense pairwise features for each (text_1, text_2) row of ``df``.

    Each row's features are the vector of ``text_1``, the vector of
    ``text_2`` and their element-wise difference, concatenated in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'text_1'`` and ``'text_2'``.
    vectorizer : object with a ``transform`` method, optional
        Fitted vectorizer returning a scipy sparse matrix. Defaults to the
        notebook-level ``tfidf`` fitted in this cell.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per row of ``df`` and three times as many
        columns as the vectorizer produces.
    """
    if vectorizer is None:
        vectorizer = tfidf
    v1 = vectorizer.transform(df['text_1'])
    v2 = vectorizer.transform(df['text_2'])
    # Stack while still sparse and densify once: the values are the same as
    # hstacking three dense arrays, without holding the dense pieces and the
    # dense result in memory at the same time.
    return sparse.hstack([v1, v2, v1 - v2]).toarray()

# Pairwise feature matrices for the train and test articles.
X = text_pair_feats(train_df)
X_test = text_pair_feats(test_df)

# Binary target: 1 if text_1 is the real one, else 0.
y = train_df['real_text_id'].eq(1).astype(int).to_numpy()

In [None]:
# Cross-validated logistic regression: collect out-of-fold probabilities on the
# training set and average the per-fold probabilities on the test set.
oof = np.zeros(len(X))
preds = np.zeros(len(X_test))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in skf.split(X, y):
    model = LogisticRegression(max_iter=500, C=2.0)
    model.fit(X[train_idx], y[train_idx])
    # Column 1 of predict_proba is the probability of y == 1 (text_1 is real).
    oof[val_idx] = model.predict_proba(X[val_idx])[:, 1]
    # Each fold contributes an equal share to the averaged test prediction.
    preds += model.predict_proba(X_test)[:, 1] / skf.n_splits

# Thresholding at 0.5 and comparing with y yields the fraction of correct
# out-of-fold predictions, i.e. accuracy (not AUC), so label it as such.
oof_accuracy = np.mean((oof > 0.5) == y)
print("Validation accuracy OOF:", oof_accuracy)



Validation AUC OOF: 0.8842105263157894


In [7]:
# Turn the averaged probabilities into the submission labels: 1 when text_1 is
# predicted real (probability above 0.5), otherwise 2.
real_text_id_pred = np.where(preds > 0.5, 1, 2)

submission = (
    pd.DataFrame({'id': test_df['id'], 'real_text_id': real_text_id_pred})
    .sort_values('id')
    .reset_index(drop=True)
)
submission.to_csv('s3.csv', index=False)
submission.head()

Unnamed: 0,id,real_text_id
0,0,2
1,1,2
2,2,1
3,3,1
4,4,2
