In [None]:
# Import necessary modules
import pickle
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Fetch NLTK resources (the text-cleaning step below reads the stop-word corpus)
NLTK_COLLECTION = "popular"
nltk.download(NLTK_COLLECTION)

# Read the labelled dataset into a DataFrame
DATASET_PATH = "/content/dataset.csv"
data = pd.read_csv(DATASET_PATH)

# Clean Text
# Translation table that deletes every character in string.punctuation;
# built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Lazily-filled cache so the NLTK stop-word list is loaded once per session
# rather than once per row when the function is used with DataFrame.apply.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def preprocess_text(text, stop_words=None):
    """Normalize a piece of text for vectorization.

    Steps, in order: strip punctuation, lowercase, split on whitespace, drop
    stop words, and re-join the remaining words with single spaces.

    Args:
        text: The text to clean. ``None`` and float NaN (missing cells) are
            treated as empty text; any other non-string value is converted
            with ``str()`` first.
        stop_words: Optional collection of words to drop. Membership is tested
            against the lowercased, punctuation-free words. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN, which is how a missing cell
        # shows up as a float.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    if stop_words is None:
        stop_words = _english_stop_words()

    # Punctuation is removed before lowercasing, matching the original order.
    text = text.translate(_PUNCTUATION_TABLE).lower()
    return " ".join(word for word in text.split() if word not in stop_words)

# Clean both text columns with the shared preprocessing routine
data["source_text"] = data["source_text"].apply(preprocess_text)
data["plagiarized_text"] = data["plagiarized_text"].apply(preprocess_text)

# One document per row: the source text and the candidate text joined by a space
combined_text = data["source_text"] + " " + data["plagiarized_text"]
y = data["label"]

# Train Test Split
# The split is done on the documents, before vectorizing, so the held-out rows
# never influence the TF-IDF vocabulary or IDF weights.
text_train, text_test, y_train, y_test = train_test_split(
    combined_text, y, test_size=0.2, random_state=42
)

# Vectorization: fit on the training documents only, then reuse the fitted
# vectorizer for everything else.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Full-dataset feature matrix, kept for later cells that reference `X`
X = tfidf_vectorizer.transform(combined_text)

# Base learners for the ensemble
log_reg = LogisticRegression(solver='liblinear')
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    random_state=42,
)
nb = MultinomialNB()

# Grid-search the random forest over tree count and depth using 3-fold CV.
# The values searched here override the n_estimators / max_depth set on `rf`
# above; the winning configuration is what goes into the ensemble.
param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20, None]}
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3).fit(X_train, y_train)
rf_best = grid_search_rf.best_estimator_

# Members of the voting ensemble, as (name, estimator) pairs
base_estimators = [
    ('log_reg', log_reg),
    ('rf_best', rf_best),
    ('nb', nb),
]

# Combine the members with soft voting (a probability-based decision)
# and train the ensemble on the training split
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')
ensemble_model.fit(X_train, y_train)

# Predict labels for the held-out split
y_pred = ensemble_model.predict(X_test)

# Score the predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Report the metrics; each heading is followed by its value on the next line
print("Optimized Ensemble Model Accuracy:", accuracy)
print("Classification Report:", classification_rep, sep="\n")
print("Confusion Matrix:", cm, sep="\n")

# Cross-Validation to assess generalization performance.
# The TF-IDF step is placed inside a pipeline and the folds are drawn from the
# cleaned documents, so each fold fits its vocabulary and IDF weights on that
# fold's training rows only. Scoring a matrix vectorized on the whole dataset
# would let every validation fold leak into the features.
cv_documents = data["source_text"] + " " + data["plagiarized_text"]
cv_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=10000)),
    ("ensemble", ensemble_model),
])
cv_scores = cross_val_score(cv_pipeline, cv_documents, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())

# Save the optimized ensemble model and TF-IDF vectorizer.
# `with` closes (and therefore flushes) each file before it is read back below.
with open("optimized_ensemble_model.pkl", 'wb') as model_file:
    pickle.dump(ensemble_model, model_file)
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

# Load Model and Vectorizer
# NOTE: unpickling can execute arbitrary code; only load files this notebook
# wrote itself or that come from a trusted source.
with open('optimized_ensemble_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

In [None]:
def detect(input_text):
    """Classify a single text with the loaded ensemble model.

    The text is cleaned with ``preprocess_text`` (the same routine applied to
    the training columns), vectorized with the loaded TF-IDF vectorizer, and
    passed to ``model.predict``.

    Args:
        input_text: The text to check.

    Returns:
        ``"Plagiarism Detected"`` when the predicted label equals 1,
        otherwise ``"No Plagiarism"``.
    """
    # NOTE(review): the model is trained on source and candidate text joined
    # into one document, while this function scores a single text on its own.
    # Confirm that single-text inference is the intended use.

    # Without this step the features seen at inference differ from training,
    # where punctuation and stop words had already been removed.
    cleaned_text = preprocess_text(input_text)
    vectorized_text = tfidf_vectorizer.transform([cleaned_text])
    result = model.predict(vectorized_text)
    return "Plagiarism Detected" if result[0] == 1 else "No Plagiarism"

# Sample inputs, checked in order: three new texts first, then two texts
# that are expected to be flagged.
sample_texts = [
    # Test with new text
    'My name is sohel darwajkar',
    'Look deeper into the misclassifications (false positives and false negatives) to identify patterns and improve preprocessing or model design accordingly.',
    'I study in viit',
    # Test with plagiarized text
    "The solar system is made up of the Sun and every celestial object that is held in orbit by its gravitational pull.",  # Should indicate plagiarism detected
    "Green plants utilize sunlight to produce food through the process of photosynthesis, aided by chlorophyll.",  # Should indicate plagiarism detected
]

# Print one verdict per sample
for input_text in sample_texts:
    print(detect(input_text))
