In [1]:
# importing requirments
import pandas as pd
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [2]:
# Download nltk punkt tokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load your dataset 
df = pd.read_csv(r"C:\Users\user\Desktop\project\Plagiarism_Checker_AI\Dataset\train_snli.txt", sep='\t')

## DATA PREPROCESSING

In [4]:
# assigning column names
df.columns = ['sentence1', 'sentence2', 'label']
df = df.dropna(subset=['sentence1', 'sentence2'])

In [5]:
# Tokenize sentences for SLM training
tokenized_s1 = [word_tokenize(sent.lower()) for sent in df['sentence1']]
tokenized_s2 = [word_tokenize(sent.lower()) for sent in df['sentence2']]


In [6]:
# Prepare training data for 3-gram SLM
n = 3
train_data_s1, vocab_s1 = padded_everygram_pipeline(n, tokenized_s1)
train_data_s2, vocab_s2 = padded_everygram_pipeline(n, tokenized_s2)


In [7]:
# Train two separate 3-gram models
model_s1 = MLE(n)
model_s1.fit(train_data_s1, vocab_s1)

model_s2 = MLE(n)
model_s2.fit(train_data_s2, vocab_s2)

In [8]:
# Function to compute log probability of a sentence for a model
import math
def sentence_log_prob(model, sentence):
    tokens = word_tokenize(sentence.lower())
    ngrams = list(nltk.ngrams(tokens, n, pad_left=True, pad_right=True,
                               left_pad_symbol="<s>", right_pad_symbol="</s>"))
    log_prob = 0.0
    for ng in ngrams:
        context = ng[:-1]
        word = ng[-1]
        prob = model.score(word, context)
        # Avoid log(0) by smoothing zero probabilities
        if prob > 0:
            log_prob += math.log(prob)
        else:
            log_prob += math.log(1e-12)  # Very small smoothing value
    return log_prob


In [9]:
# Compute log probabilities as features
df['log_prob_s1_model'] = df['sentence1'].apply(lambda x: sentence_log_prob(model_s1, x))
df['log_prob_s2_model'] = df['sentence2'].apply(lambda x: sentence_log_prob(model_s2, x))


In [10]:
# Optionally compute absolute difference between scores as another feature
df['abs_log_prob_diff'] = abs(df['log_prob_s1_model'] - df['log_prob_s2_model'])

In [11]:
#normalizing the SLM features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_slm = scaler.fit_transform(df[['log_prob_s1_model', 'log_prob_s2_model', 'abs_log_prob_diff']])

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

# 1. TF-IDF vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
vectorizer.fit(pd.concat([df['sentence1'], df['sentence2']]))

X1_vec = vectorizer.transform(df['sentence1'])
X2_vec = vectorizer.transform(df['sentence2'])

# 2. Add vector comparison features (optional but helpful)
X_diff = abs(X1_vec - X2_vec)
X_mult = X1_vec.multiply(X2_vec)

# 3. Combine sparse vectors into a single matrix
X_tfidf = hstack([X1_vec, X2_vec, X_diff, X_mult])  # shape: (n_samples, 20K or less)

# 4. Combine with SLM log-prob features (dense → sparse)
slm_features = csr_matrix(df[['log_prob_s1_model', 'log_prob_s2_model', 'abs_log_prob_diff']].values)

# 5. Final input: sparse TF-IDF + dense SLM features
X_final = hstack([X_tfidf, slm_features])


In [13]:
# Prepare features and labels for classification
X = df[['log_prob_s1_model', 'log_prob_s2_model', 'abs_log_prob_diff']]
y = df['label']

## MODEL FITTING AND EVALUATION

In [14]:
# 7. Split and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

from sklearn.svm import LinearSVC
model = LinearSVC()
model.fit(X_train, y_train)
# 8. Evaluate
from sklearn.metrics import accuracy_score, classification_report
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)


train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_pred)
print(f"✅ Training Accuracy: {train_acc:.4f}")
print(f"✅ Testing Accuracy: {test_acc:.4f}")

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


✅ Training Accuracy: 0.8610
✅ Testing Accuracy: 0.8332
Accuracy: 0.8332471350409669
              precision    recall  f1-score   support

           0       0.85      0.81      0.83     36837
           1       0.82      0.86      0.84     36637

    accuracy                           0.83     73474
   macro avg       0.83      0.83      0.83     73474
weighted avg       0.83      0.83      0.83     73474



In [15]:
import joblib

# Save the trained model
joblib.dump(model, 'plagiarism_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Save the SLM feature scaler (if used)
joblib.dump(scaler, 'slm_scaler.pkl')

joblib.dump(model_s1, 'ngram_model_s1.pkl')
joblib.dump(model_s2, 'ngram_model_s2.pkl')


['ngram_model_s2.pkl']

## DETECTION DEMO

In [22]:
import joblib
import math
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from scipy.sparse import hstack, csr_matrix

nltk.download('punkt')

# Load all saved components
model = joblib.load('plagiarism_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')
scaler = joblib.load('slm_scaler.pkl')
model_s1 = joblib.load('ngram_model_s1.pkl')
model_s2 = joblib.load('ngram_model_s2.pkl')

def sentence_log_prob(model, sentence, n=3):
    """Calculate log probability of a sentence using n-gram model"""
    tokens = word_tokenize(sentence.lower())
    ngrams = list(nltk.ngrams(tokens, n, pad_left=True, pad_right=True,
                             left_pad_symbol="<s>", right_pad_symbol="</s>"))
    log_prob = 0.0
    for ng in ngrams:
        context = ng[:-1]
        word = ng[-1]
        prob = model.score(word, context)
        if prob > 0:
            log_prob += math.log(prob)
        else:
            log_prob += math.log(1e-12)  # smoothing
    return log_prob

def predict_similarity(sentence1, sentence2):
    """Predict if two sentences are plagiarized (1) or not (0)"""
    # 1. Calculate SLM features
    log_prob1 = sentence_log_prob(model_s1, sentence1)
    log_prob2 = sentence_log_prob(model_s2, sentence2)
    abs_diff = abs(log_prob1 - log_prob2)
    
    # 2. Create TF-IDF features (same as training)
    X1_vec = vectorizer.transform([sentence1])
    X2_vec = vectorizer.transform([sentence2])
    X_diff = abs(X1_vec - X2_vec)
    X_mult = X1_vec.multiply(X2_vec)
    X_tfidf = hstack([X1_vec, X2_vec, X_diff, X_mult])
    
    # 3. Scale SLM features and combine with TF-IDF
    slm_features = scaler.transform([[log_prob1, log_prob2, abs_diff]])
    slm_features_sparse = csr_matrix(slm_features)
    X_final = hstack([X_tfidf, slm_features_sparse])
    
    # 4. Make prediction (using decision function instead of predict_proba)
    prediction = model.predict(X_final)[0]
    decision_score = model.decision_function(X_final)[0]
    
    # Convert decision score to a confidence-like measure (not actual probability)
    confidence = 1 / (1 + math.exp(-abs(decision_score)))
    
    print(f"Sentence 1: {sentence1}")
    print(f"Sentence 2: {sentence2}")
    print(f"Prediction: {'Plagiarized' if prediction == 1 else 'Not Plagiarized'}")
    print("-" * 50)

# Example usages
predict_similarity(
    "The quick brown fox jumps over the lazy dog.",
    "A fast brown fox leaps over a lazy dog."
)

predict_similarity(
    "Global warming is causing climate change worldwide.",
    "Climate change is primarily driven by global warming effects."
)

predict_similarity(
    "The theory of relativity was developed by Albert Einstein.",
    "Quantum mechanics deals with subatomic particles."
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sentence 1: The quick brown fox jumps over the lazy dog.
Sentence 2: A fast brown fox leaps over a lazy dog.
Prediction: Plagiarized
--------------------------------------------------
Sentence 1: Global warming is causing climate change worldwide.
Sentence 2: Climate change is primarily driven by global warming effects.
Prediction: Plagiarized
--------------------------------------------------
Sentence 1: The theory of relativity was developed by Albert Einstein.
Sentence 2: Quantum mechanics deals with subatomic particles.
Prediction: Plagiarized
--------------------------------------------------


