In [2]:
# importing requirements
import pandas as pd
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [3]:
# Download the NLTK tokenizer data that word_tokenize depends on.
# Older NLTK releases load the 'punkt' package, while newer ones (3.9 and later)
# look up 'punkt_tab' instead; fetching both keeps the notebook runnable on either.
# 'punkt' stays last so the cell still displays its return value.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
# Read the tab-separated sentence-pair file into a DataFrame.
# NOTE(review): read_csv takes the first line of the file as the header by default,
# and the next cell overwrites the column names. If the file has no header row,
# its first record is consumed as column labels and lost - confirm against the file.
DATASET_PATH = r"C:\Users\user\Desktop\project\Plagiarism_Checker_AI\Dataset\train_snli.txt"
df = pd.read_csv(DATASET_PATH, sep='\t')

## DATA PREPROCESSING

In [25]:
# Label the three columns, then keep only the rows where both sentences are present.
column_names = ['sentence1', 'sentence2', 'label']
df.columns = column_names
df = df.dropna(subset=column_names[:2])

In [26]:
# Build the token lists that the statistical language models (SLM) are trained on.
def _lowercase_tokens(text):
    """Lower-case one sentence and split it into word tokens."""
    return word_tokenize(text.lower())


tokenized_s1 = list(map(_lowercase_tokens, df['sentence1']))
tokenized_s2 = list(map(_lowercase_tokens, df['sentence2']))


In [27]:
# Build the (training n-grams, vocabulary stream) pair for each sentence column.
# n is the model order (3 = trigram) and is reused when the models are created.
n = 3
(train_data_s1, vocab_s1), (train_data_s2, vocab_s2) = (
    padded_everygram_pipeline(n, corpus) for corpus in (tokenized_s1, tokenized_s2)
)


In [28]:
# One maximum-likelihood n-gram model per sentence column, both of order n.
model_s1, model_s2 = MLE(n), MLE(n)

# Each model is fitted on the n-gram stream and vocabulary prepared for its own column.
model_s1.fit(train_data_s1, vocab_s1)
model_s2.fit(train_data_s2, vocab_s2)

In [29]:
# Function to compute log probability of a sentence for a model
import math

# Log of the tiny stand-in probability used for n-grams the model scores as 0,
# so that log(0) is never evaluated.
_ZERO_PROB_LOG = math.log(1e-12)


def sentence_log_prob(model, sentence, n=None):
    """Return the summed natural-log probability of `sentence` under `model`.

    The sentence is lower-cased, tokenised, padded on both sides with
    "<s>" / "</s>" and cut into n-grams; each n-gram contributes
    log(model.score(last_word, preceding_words)).

    Parameters
    ----------
    model : fitted language model exposing `score(word, context)` and `order`.
    sentence : str
        Raw sentence text.
    n : int, optional
        N-gram size. Defaults to `model.order`, so the n-grams always match the
        order the model was built with instead of relying on a notebook-level
        global.

    Returns
    -------
    float
        Sum of the per-n-gram log probabilities. An n-gram scored as 0
        contributes log(1e-12) instead.
    """
    order = model.order if n is None else n
    tokens = word_tokenize(sentence.lower())
    padded_ngrams = nltk.ngrams(tokens, order, pad_left=True, pad_right=True,
                                left_pad_symbol="<s>", right_pad_symbol="</s>")
    log_prob = 0.0
    for ng in padded_ngrams:
        context, word = ng[:-1], ng[-1]
        prob = model.score(word, context)
        # Floor zero probabilities rather than taking log(0).
        log_prob += math.log(prob) if prob > 0 else _ZERO_PROB_LOG
    return log_prob


In [30]:
# Score every sentence with the language model trained on its own column and
# store the result as a new feature column.
for feature_col, text_col, lm in (
    ('log_prob_s1_model', 'sentence1', model_s1),
    ('log_prob_s2_model', 'sentence2', model_s2),
):
    df[feature_col] = df[text_col].apply(lambda text, lm=lm: sentence_log_prob(lm, text))


In [31]:
# Extra feature: magnitude of the gap between the two sentence scores.
score_gap = df['log_prob_s1_model'] - df['log_prob_s2_model']
df['abs_log_prob_diff'] = score_gap.abs()

In [36]:
# Standardise the three SLM feature columns.
from sklearn.preprocessing import StandardScaler

# NOTE(review): the feature matrix assembled in the visible later cells is built from
# the raw column values, not from `scaled_slm` - confirm whether the scaled values
# were meant to be used there.
slm_frame = df[['log_prob_s1_model', 'log_prob_s2_model', 'abs_log_prob_diff']]
scaler = StandardScaler()
scaled_slm = scaler.fit_transform(slm_frame)

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

# 1. TF-IDF vectorization: one vocabulary of unigrams and bigrams, learnt from
#    both sentence columns together
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
all_sentences = pd.concat([df['sentence1'], df['sentence2']])
vectorizer.fit(all_sentences)

X1_vec, X2_vec = (vectorizer.transform(df[col]) for col in ('sentence1', 'sentence2'))

# 2. Pairwise comparison features: element-wise product and absolute difference
X_mult = X1_vec.multiply(X2_vec)
X_diff = abs(X1_vec - X2_vec)

# 3. Stack the four sparse blocks side by side (up to 4 x 5000 columns)
tfidf_blocks = [X1_vec, X2_vec, X_diff, X_mult]
X_tfidf = hstack(tfidf_blocks)

# 4. SLM log-prob features, taken from the raw (unscaled) columns and made sparse
slm_columns = ['log_prob_s1_model', 'log_prob_s2_model', 'abs_log_prob_diff']
slm_features = csr_matrix(df[slm_columns].values)

# 5. Final input: TF-IDF blocks followed by the three SLM columns
X_final = hstack([X_tfidf, slm_features])


In [55]:
# Target labels, plus an SLM-only feature frame.
y = df['label']
X = df.loc[:, ['log_prob_s1_model', 'log_prob_s2_model', 'abs_log_prob_diff']]

## MODEL FITTING AND EVALUATION

In [56]:
# 7. Split and train
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# One seed drives both the split and the classifier, so a fresh
# Restart & Run All reproduces the reported scores.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=SEED
)

# LinearSVC can shuffle samples at random while solving the dual problem,
# so it is seeded as well instead of being left to the global random state.
model = LinearSVC(random_state=SEED)
model.fit(X_train, y_train)

# 8. Evaluate on both partitions to expose any train/test gap
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_pred)
print(f"✅ Training Accuracy: {train_acc:.4f}")
print(f"✅ Testing Accuracy: {test_acc:.4f}")

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


✅ Training Accuracy: 0.8610
✅ Testing Accuracy: 0.8332
Accuracy: 0.8332471350409669
              precision    recall  f1-score   support

           0       0.85      0.81      0.83     36837
           1       0.82      0.86      0.84     36637

    accuracy                           0.83     73474
   macro avg       0.83      0.83      0.83     73474
weighted avg       0.83      0.83      0.83     73474



In [60]:
import joblib

# Persist the classifier, the TF-IDF vectorizer, the SLM feature scaler and the
# first n-gram model under their fixed file names.
for artifact, filename in (
    (model, 'plagiarism_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
    (scaler, 'slm_scaler.pkl'),
    (model_s1, 'ngram_model_s1.pkl'),
):
    joblib.dump(artifact, filename)

# The second n-gram model is dumped as the cell's final expression so the list of
# written paths is still displayed as the cell output.
joblib.dump(model_s2, 'ngram_model_s2.pkl')


['ngram_model_s2.pkl']

## DETECTION DEMO

In [76]:
import nltk
import math
import joblib
from nltk.tokenize import word_tokenize
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from scipy.sparse import csr_matrix, hstack

# download it once before execution
# nltk.download('punkt')

# Load the trained classifier, the TF-IDF vectorizer and both n-gram models.
# NOTE: joblib.load unpickles arbitrary objects - only load files written by this notebook.
svm_model = joblib.load('plagiarism_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')
# The scaler is still loaded so the name stays defined, but it is NOT applied below:
# the classifier was trained on the raw SLM values, not on scaled ones.
scaler = joblib.load('slm_scaler.pkl')
model_s1 = joblib.load('ngram_model_s1.pkl')
model_s2 = joblib.load('ngram_model_s2.pkl')

# Preprocessing function to compute log-probability
def sentence_log_prob(model, sentence, n=3):
    """Return the summed natural-log probability of `sentence` under `model`.

    The sentence is lower-cased, tokenised, padded with "<s>" / "</s>" and cut
    into n-grams of size `n`; each n-gram adds log(model.score(word, context)).
    An n-gram scored as 0 adds log(1e-12) instead, so log(0) is never taken.
    """
    tokens = word_tokenize(sentence.lower())
    ngrams = list(nltk.ngrams(tokens, n, pad_left=True, pad_right=True,
                               left_pad_symbol="<s>", right_pad_symbol="</s>"))
    log_prob = 0.0
    for ng in ngrams:
        context = ng[:-1]
        word = ng[-1]
        prob = model.score(word, context)
        if prob > 0:
            log_prob += math.log(prob)
        else:
            log_prob += math.log(1e-12)  # Avoid log(0)
    return log_prob


def _pair_features(sentence1, sentence2):
    """Build the one-row feature matrix for a sentence pair, in the training layout.

    Column order matches the matrix the classifier was fitted on:
    TF-IDF(s1), TF-IDF(s2), |TF-IDF(s1) - TF-IDF(s2)|, TF-IDF(s1) * TF-IDF(s2),
    followed by the three raw SLM values (log-prob of s1, log-prob of s2 and
    their absolute difference).
    """
    log_prob1 = sentence_log_prob(model_s1, sentence1)
    log_prob2 = sentence_log_prob(model_s2, sentence2)
    slm = csr_matrix([[log_prob1, log_prob2, abs(log_prob1 - log_prob2)]])

    vec1 = vectorizer.transform([sentence1])
    vec2 = vectorizer.transform([sentence2])
    return hstack([vec1, vec2, abs(vec1 - vec2), vec1.multiply(vec2), slm], format='csr')


# Inference function
def predict_plagiarism(sentence1, sentence2):
    """Print both sentences and the classifier's verdict for the pair.

    The pair is converted to the same feature layout used for training before it
    is passed to the classifier; feeding only the three SLM values would not
    match the number of features the model was fitted on.

    Returns None; the result is reported via print.
    """
    features = _pair_features(sentence1, sentence2)
    prediction = svm_model.predict(features)[0]

    print(f"Sentence 1: {sentence1}")
    print(f"Sentence 2: {sentence2}")

    print(f"Prediction: {'Plagiarized' if prediction == 1 else 'Not Plagiarized'}")

    return


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [77]:
# Demo: run the detector on one hand-written sentence pair.
first_sentence = "The dog barked loudly at night."
second_sentence = "A dog made noise during the night."
predict_plagiarism(first_sentence, second_sentence)


Sentence 1: The dog barked loudly at night.
Sentence 2: A dog made noise during the night.
Prediction: Plagiarized


