In [1]:
# Importing essential requirments
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score



In [2]:
# Loading Dataset (plagiarism-mit-detection)
df = pd.read_csv(r"C:\Users\user\Desktop\project\Plagiarism_Checker_AI\Dataset\train_snli.txt", sep='\t')
df.head()

Unnamed: 0,A person on a horse jumps over a broken down airplane.,"A person is at a diner, ordering an omelette.",0
0,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
1,Children smiling and waving at camera,There are children present,1
2,Children smiling and waving at camera,The kids are frowning,0
3,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0
4,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,1


In [3]:
# checking is there any null value
df.isnull().sum()

A person on a horse jumps over a broken down airplane.    0
A person is at a diner, ordering an omelette.             4
0                                                         0
dtype: int64

In [4]:
# Cleanning the missing values 
df.dropna(inplace=True)

In [5]:
df.shape

(367368, 3)

In [6]:
#3. renaming columns 
df.columns = ['sentence1', 'sentence2', 'label']
# 4. Prepare text and labels
X1 = df['sentence1']
X2 = df['sentence2']
y = df['label']

In [7]:
df

Unnamed: 0,sentence1,sentence2,label
0,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
1,Children smiling and waving at camera,There are children present,1
2,Children smiling and waving at camera,The kids are frowning,0
3,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0
4,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,1
...,...,...,...
367367,A dog with a blue collar plays ball outside.,a dog is outside,1
367368,Four dirty and barefooted children.,four children have dirty feet.,1
367369,Four dirty and barefooted children.,four kids won awards for 'cleanest feet',0
367370,A man is surfing in a bodysuit in beautiful bl...,A man in a business suit is heading to a board...,0


In [8]:
# !pip install nltk


In [9]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Make sure necessary NLTK packages are downloaded
# nltk.download('punkt_tab')
# nltk.download('stopwords')
# nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()  # lowercase once
    tokens = word_tokenize(text)  # tokenize
    tokens = [t for t in tokens if t not in string.punctuation]  # remove punctuation
    tokens = [t for t in tokens if t not in stop_words]          # remove stopwords
    tokens = [lemmatizer.lemmatize(t) for t in tokens]           # lemmatize
    return " ".join(tokens)


In [10]:
# example how this above function works
clean_text("this is what to do when dont know +##@$$ but do something on your own")


'dont know something'

In [11]:
df['sentence1'] = df['sentence1'].apply(clean_text)
df['sentence2'] = df['sentence2'].apply(clean_text)

# Now rename the columns to match your desired structure
df.columns = ['clean1', 'clean2', 'label']


In [12]:
df.head(10)

Unnamed: 0,clean1,clean2,label
0,person horse jump broken airplane,person outdoors horse,1
1,child smiling waving camera,child present,1
2,child smiling waving camera,kid frowning,0
3,boy jumping skateboard middle red bridge,boy skate sidewalk,0
4,boy jumping skateboard middle red bridge,boy skateboarding trick,1
5,older man sits orange juice small table coffee...,boy flip burger,0
6,two blond woman hugging one another,woman sleeping,0
7,two blond woman hugging one another,woman showing affection,1
8,people restaurant setting one drinking orange ...,people sitting desk school,0
9,people restaurant setting one drinking orange ...,diner restaurant,1


In [13]:
# 4. Vectorize using same TF-IDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,3))
vectorizer.fit(pd.concat([X1, X2]))

X1_vec = vectorizer.transform(X1)
X2_vec = vectorizer.transform(X2)


In [14]:
from scipy.sparse import hstack
# 5. concatenate both the vectors
# X_diff = hstack([X1_vec, X2_vec])

In [15]:
X_diff = abs(X1_vec - X2_vec)
X_mult = X1_vec.multiply(X2_vec)
X_combined = hstack([X1_vec, X2_vec, X_diff, X_mult])

In [16]:
# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_diff, y, test_size=0.2, random_state=42)


In [17]:
# 7. Train the SVM
model = LinearSVC()
model.fit(X_train, y_train)

# 8. model evaluation
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_test_pred))
# Training & Testing Accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"✅ Training Accuracy: {train_acc:.4f}")
print(f"✅ Testing Accuracy: {test_acc:.4f}")
print("\n📊 Classification Report:\n")
print(classification_report(y_test, y_test_pred))

Accuracy: 0.7546615129161336
✅ Training Accuracy: 0.9792
✅ Testing Accuracy: 0.7547

📊 Classification Report:

              precision    recall  f1-score   support

           0       0.77      0.73      0.75     36837
           1       0.74      0.78      0.76     36637

    accuracy                           0.75     73474
   macro avg       0.76      0.75      0.75     73474
weighted avg       0.76      0.75      0.75     73474



In [18]:
# logistic regresssion
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 8. model evaluation
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_test_pred))

# Training & Testing Accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"✅ Training Accuracy: {train_acc:.4f}")
print(f"✅ Testing Accuracy: {test_acc:.4f}")
print("\n📊 Classification Report:\n")
print(classification_report(y_test, y_test_pred))

Accuracy: 0.7749135748700221
✅ Training Accuracy: 0.8775
✅ Testing Accuracy: 0.7749

📊 Classification Report:

              precision    recall  f1-score   support

           0       0.79      0.75      0.77     36837
           1       0.76      0.80      0.78     36637

    accuracy                           0.77     73474
   macro avg       0.78      0.77      0.77     73474
weighted avg       0.78      0.77      0.77     73474

