In [52]:
import nltk
nltk.download("popular")
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import string
from nltk.corpus import stopwords
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Load your dataset
data = pd.read_csv("dataset.csv")
data.head()

# Check label distribution
print(data['label'].value_counts())

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text

# Preprocess the text columns
data["source_text"] = data["source_text"].apply(preprocess_text)
data["plagiarized_text"] = data["plagiarized_text"].apply(preprocess_text)

# Create TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer()
X_source = tfidf_vectorizer.fit_transform(data["source_text"])
X_plagiarized = tfidf_vectorizer.transform(data["plagiarized_text"])

# Calculate cosine similarity
similarity_matrix = cosine_similarity(X_source, X_plagiarized)
data['max_similarity'] = similarity_matrix.diagonal()  # Get max similarity for each pair

# Calculate originality percentage
data['originality_percentage'] = (1 - data['max_similarity']) * 100

# Ensure originality percentage is between 0 and 100
data['originality_percentage'] = data['originality_percentage'].clip(lower=0, upper=100)

# Prepare data for model training
X = tfidf_vectorizer.fit_transform(data["source_text"] + " " + data["plagiarized_text"])
y = data["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate and fit the model
from sklearn.svm import SVC
model = SVC(kernel='linear', random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Print results
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(cm)

# Save the model and vectorizer
pickle.dump(model, open("model.pkl", 'wb'))
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl', 'wb'))

# Optionally, save the updated DataFrame with originality percentages
data.to_csv("updated_dataset_with_originality.csv", index=False)


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package movie_reviews is a

label
0    187
1    183
Name: count, dtype: int64
Accuracy: 0.8783783783783784
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.87        35
           1       0.89      0.87      0.88        39

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74

Confusion Matrix:
[[31  4]
 [ 5 34]]


[nltk_data]    | Downloading package snowball_data to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package snowball_data is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | 
[nltk_data]  Done downloading collection popular


In [8]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix")
print(cm)

Accuracy: 0.8243243243243243
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        35
           1       0.86      0.79      0.83        39

    accuracy                           0.82        74
   macro avg       0.83      0.83      0.82        74
weighted avg       0.83      0.82      0.82        74

Confusion Matrix
[[30  5]
 [ 8 31]]


In [19]:
from sklearn.ensemble import RandomForestClassifier
# Instantiate the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Generate classification report
classification_rep = classification_report(y_test, y_pred)
# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Print results
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(cm)


Accuracy: 0.7972972972972973
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.97      0.82        35
           1       0.96      0.64      0.77        39

    accuracy                           0.80        74
   macro avg       0.83      0.81      0.79        74
weighted avg       0.84      0.80      0.79        74

Confusion Matrix:
[[34  1]
 [14 25]]


In [20]:
from sklearn.naive_bayes import MultinomialNB
# Instantiate the model
model = MultinomialNB()
# Fit the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Generate classification report
classification_rep = classification_report(y_test, y_pred)
# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Print results
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(cm)


Accuracy: 0.8648648648648649
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        35
           1       0.87      0.87      0.87        39

    accuracy                           0.86        74
   macro avg       0.86      0.86      0.86        74
weighted avg       0.86      0.86      0.86        74

Confusion Matrix:
[[30  5]
 [ 5 34]]


In [21]:
from sklearn.svm import SVC

# Instantiate the model
model = SVC(kernel='linear', random_state=42)
# Fit the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
# Generate classification report
classification_rep = classification_report(y_test, y_pred)
# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Print results
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(cm)

Accuracy: 0.8783783783783784
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.87        35
           1       0.89      0.87      0.88        39

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74

Confusion Matrix:
[[31  4]
 [ 5 34]]


In [22]:
import pickle

pickle.dump(model,open("model.pkl",'wb'))
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl','wb'))

In [23]:
model = pickle.load(open('model.pkl','rb'))
tfidf_vectorizer = pickle.load(open('tfidf_vectorizer.pkl','rb'))

In [24]:
def detect(input_text):
    vectorized_text = tfidf_vectorizer.transform([input_text])
    result = model.predict(vectorized_text)
    return "Plagiarim Detected" if result[0] == 1 else "No Plagiarism"

In [25]:
# example ( it is a plagarized text)
input_text = 'Researchers have discovered a new species of butterfly in the Amazon rainforest.'
detect(input_text)

'Plagiarim Detected'

In [26]:
# example ( it has no plagiarism)
input_text = 'Playing musical instruments enhances creativity.'
detect(input_text)

'No Plagiarism'

In [27]:
# example ( it has no plagarism)
input_text = 'Practicing yoga enhances physical flexibility.'
detect(input_text)

'No Plagiarism'

In [28]:
# sklearn version
import sklearn
sklearn.__version__

'1.5.1'

In [32]:
import pandas as pd
import os

# Load your dataset
csv_file = "dataset.csv"  # Replace with your CSV file name
data = pd.read_csv(csv_file)

# Ensure the column 'source_text' exists
if 'source_text' not in data.columns:
    raise ValueError("The column 'source_text' does not exist in the dataset.")

# Directory to save the documents
output_dir = "source_text_documents/"
os.makedirs(output_dir, exist_ok=True)

# Iterate through each row of 'source_text' and save it as a text file
for index, source_text in enumerate(data['source_text'], start=1):
    # Skip rows with missing or NaN values
    if pd.isna(source_text):
        continue

    # Save the content to a .txt file
    file_name = f"{output_dir}document_{index}.txt"
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(source_text)

print(f"Text files saved in the directory: {output_dir}")

Text files saved in the directory: source_text_documents/


In [40]:
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

# Download NLTK resources
nltk.download("popular")

# Load dataset (make sure your dataset has 'source_text', 'plagiarized_text', and 'label' columns)
data = pd.read_csv("dataset.csv")

# Preprocessing function to clean the text
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text

# Apply preprocessing to both source_text and plagiarized_text
data["source_text"] = data["source_text"].apply(preprocess_text)
data["plagiarized_text"] = data["plagiarized_text"].apply(preprocess_text)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Feature extraction using TF-IDF vectorizer (with source_text and plagiarized_text combined)
X = tfidf_vectorizer.fit_transform(data["source_text"] + " " + data["plagiarized_text"])
y = data["label"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the model (SVC)
model = SVC(kernel='linear', random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Generate classification report
classification_rep = classification_report(y_test, y_pred)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print results
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(cm)

# Save the model and TF-IDF vectorizer
pickle.dump(model, open("model.pkl", 'wb'))
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl', 'wb'))

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package movie_reviews is a

Accuracy: 0.8783783783783784
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.87        35
           1       0.89      0.87      0.88        39

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74

Confusion Matrix:
[[31  4]
 [ 5 34]]


[nltk_data]    | Downloading package snowball_data to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package snowball_data is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | 
[nltk_data]  Done downloading collection popular


In [50]:
import nltk
import string
import pickle
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download NLTK resources
nltk.download("popular")

# Load the saved model and TF-IDF vectorizer
model = pickle.load(open("model.pkl", 'rb'))
tfidf_vectorizer = pickle.load(open('tfidf_vectorizer.pkl', 'rb'))

# Load the dataset to extract vectors for similarity calculation (source_text + plagiarized_text)
# You should load your original dataset that was used to train the model
data = pd.read_csv("dataset.csv")

# Preprocessing function to clean the text
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text

# Apply preprocessing to both source_text and plagiarized_text (if not done already)
data["source_text"] = data["source_text"].apply(preprocess_text)
data["plagiarized_text"] = data["plagiarized_text"].apply(preprocess_text)

# Vectorize the entire dataset using the saved TF-IDF vectorizer (this is important for comparison)
X_data = tfidf_vectorizer.transform(data["source_text"] + " " + data["plagiarized_text"])

# Function to predict plagiarism and originality percentage
def predict_plagiarism(test_sentence):
    # Preprocess the test sentence
    test_sentence = preprocess_text(test_sentence)
    
    # Vectorize the test sentence using the saved TF-IDF vectorizer
    test_vector = tfidf_vectorizer.transform([test_sentence])
    
    # Predict the plagiarism label (0: Not Plagiarized, 1: Plagiarized)
    prediction = model.predict(test_vector)
    
    # Calculate cosine similarity between the test vector and all training data vectors
    similarities = cosine_similarity(test_vector, X_data)
    
    # Get the maximum cosine similarity score from the training data
    max_similarity = similarities.max()
    
    # Introduce a threshold for similarity to decide if it's plagiarized
    plagiarism_threshold = 0.8  # You can adjust this value to be more or less sensitive
    
    if max_similarity > plagiarism_threshold:
        prediction = 1  # Plagiarized
    else:
        prediction = 0  # Original
    
    # Calculate originality percentage as (1 - max_similarity) * 100
    originality_percentage = (1 - max_similarity) * 100
    
    return prediction, originality_percentage

# Example of test input
test_sentence = input("Water is composed of one oxygen atom. ")

# Get prediction and originality percentage
prediction, originality = predict_plagiarism(test_sentence)

# Output the result
print(f"Prediction: {'Plagiarized' if prediction == 1 else 'Original'}")
print(f"Originality Percentage: {originality:.2f}%")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package movie_reviews is a

Prediction: Original
Originality Percentage: 100.00%


In [51]:
import nltk
nltk.download("popular")
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import string
from nltk.corpus import stopwords
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Load your dataset
data = pd.read_csv("dataset.csv")
data.head()

# Check label distribution
print(data['label'].value_counts())

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text

# Preprocess the text columns
data["source_text"] = data["source_text"].apply(preprocess_text)
data["plagiarized_text"] = data["plagiarized_text"].apply(preprocess_text)

# Create TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer()
X_source = tfidf_vectorizer.fit_transform(data["source_text"])
X_plagiarized = tfidf_vectorizer.transform(data["plagiarized_text"])

# Calculate cosine similarity
similarity_matrix = cosine_similarity(X_source, X_plagiarized)
data['similarity_percentage'] = similarity_matrix.diagonal() * 100  # Convert to percentage

# Prepare data for model training
X = tfidf_vectorizer.fit_transform(data["source_text"] + " " + data["plagiarized_text"])
y = data["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate and fit the model
from sklearn.svm import SVC
model = SVC(kernel='linear', random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Print results
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)
print("Confusion Matrix:")
print(cm)

# Save the model and vectorizer
pickle.dump(model, open("modely.pkl", 'wb'))
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pkl', 'wb'))

# Optionally, save the updated DataFrame with similarity percentages
data.to_csv("updated_dataset_with_similarity.csv", index=False)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package movie_reviews is a

label
0    187
1    183
Name: count, dtype: int64
Accuracy: 0.8783783783783784
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.89      0.87        35
           1       0.89      0.87      0.88        39

    accuracy                           0.88        74
   macro avg       0.88      0.88      0.88        74
weighted avg       0.88      0.88      0.88        74

Confusion Matrix:
[[31  4]
 [ 5 34]]


[nltk_data]    | Downloading package snowball_data to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package snowball_data is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/dheeraj/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | 
[nltk_data]  Done downloading collection popular
