In [None]:
import pandas as pd

# Location of the evaluation split, relative to the notebook's working directory.
TEST_CSV_PATH = 'test.csv'

df = pd.read_csv(TEST_CSV_PATH)
# Last expression of the cell: displays (n_rows, n_columns).
df.shape

In [None]:
# Drop the two ID columns; the rest of the notebook only reads the two text
# columns and the label.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
df = df.drop(columns=['#1 ID', '#2 ID'], errors='ignore')

In [None]:
# Map the raw CSV headers onto the column names used by the rest of the notebook.
column_aliases = {
    '#1 String': 'source_txt',
    '#2 String': 'plagarism_txt',
    'Quality': 'label',
}
df.rename(columns=column_aliases, inplace=True)

In [None]:
# Cast both sentence columns to str so every row handed to the translation
# and preprocessing functions below is a string.
for text_column in ('source_txt', 'plagarism_txt'):
    df[text_column] = df[text_column].astype(str)

In [None]:
# Preview the first five rows after renaming and casting.
df.head(n=5)

In [None]:
# Keep only the two text columns and the label, in this fixed order.
feature_and_label_columns = ['source_txt', 'plagarism_txt', 'label']
df = df[feature_and_label_columns]

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from tqdm import tqdm
import torch

# Hugging Face checkpoint loaded through the mBART classes imported above.
model_checkpoint = "aryaumesh/english-to-marathi"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

tokenizer = MBart50TokenizerFast.from_pretrained(model_checkpoint)
# .to(device) moves the weights and returns the same module object.
model = MBartForConditionalGeneration.from_pretrained(model_checkpoint).to(device)

def translate_en_to_mr(text: str) -> str:
    """Translate a single string with the loaded English-to-Marathi model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The text to translate.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    # truncation=True caps the encoded input at the tokenizer's configured
    # maximum length, so one unusually long row cannot exceed what the model
    # accepts and abort the whole progress_apply run.
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Register tqdm's ``progress_apply`` on pandas objects.
tqdm.pandas()

# Translate both sentence columns (source first), overwriting the originals.
# Each row is a separate model call, so this is the slow step of the notebook.
for text_column in ('source_txt', 'plagarism_txt'):
    df[text_column] = df[text_column].progress_apply(translate_en_to_mr)

In [None]:
def getStopWords():
    """Load the stop-word list from ``./stopwords-mr.txt``.

    The file is read as one stop word per line.

    Returns:
        list[str]: The stop words in file order, with surrounding whitespace
        stripped and blank lines dropped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding: the file holds non-ASCII text, and the platform
    # default encoding is not guaranteed to be UTF-8. 'utf-8-sig' also
    # tolerates a leading byte-order mark.
    with open('./stopwords-mr.txt', 'r', encoding='utf-8-sig') as f:
        # splitlines() handles any line ending; stripping and skipping empty
        # entries avoids '' (from a trailing newline) and entries with stray
        # whitespace that could never equal a whitespace-split token.
        return [line.strip() for line in f.read().splitlines() if line.strip()]

# Read the file once and bind the same list to both names; ``stop_words`` is
# the one preprocess_text looks up.
stop_words = stopWords = getStopWords()
# Suffixes removed by the rule-based stemmer, tried in list order.
suffixes = ['ता', 'ते', 'तो', 'ल', 'ना', 'णे', 'त', 'य']
def stem_marathi_word(word):
    """Strip the first matching entry of ``suffixes`` from the end of ``word``.

    Args:
        word: A single token.

    Returns:
        ``word`` without the matched suffix, or ``word`` unchanged when no
        suffix matches or when stripping would leave nothing.
    """
    for suffix in suffixes:
        # Require a non-empty remainder: a token that *is* a suffix (e.g. 'तो')
        # would otherwise be reduced to '' and leave a hole in the joined text.
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return word

noun_suffixes = ['आणि', 'े', 'ा', 'नी', 'ची', 'मधील', 'हवे', 'ची', 'चा']
verb_suffixes = ['त', 'तो', 'ते', 'ली', 'ला', 'ले', 'णार', 'त आहे', 'त असतील']
def lemmatize_marathi(word):
    """Strip one suffix from ``word``: verb suffixes first, then noun suffixes.

    Within each list the longest suffix is tried first, so a short entry such
    as 'ा' can no longer shadow a longer one that ends with it ('चा').
    Entries containing a space can only match input that itself contains a
    space.

    Args:
        word: A single token.

    Returns:
        ``word`` without the matched suffix, or ``word`` unchanged when no
        suffix matches or when stripping would leave nothing.
    """
    for suffix_group in (verb_suffixes, noun_suffixes):
        # sorted() is stable, so equally long suffixes keep their list order.
        for suffix in sorted(suffix_group, key=len, reverse=True):
            # Require a non-empty remainder so a token equal to a suffix is
            # not collapsed to ''.
            if len(word) > len(suffix) and word.endswith(suffix):
                return word[:-len(suffix)]
    return word

def preprocess_text(text, use_stemming=False, use_lemmatization=False):
    """Reduce ``text`` to Devanagari tokens with stop words removed.

    Steps, in order: lowercase the text; drop every character that is neither
    whitespace nor in U+0900..U+097F; split on whitespace and drop tokens
    found in the module-level ``stop_words``; optionally shorten each
    remaining token.

    Args:
        text: Input string.
        use_stemming: Apply ``stem_marathi_word`` to every token.
        use_lemmatization: Apply ``lemmatize_marathi`` to every token. Only
            consulted when ``use_stemming`` is false; if both flags are set,
            stemming alone is applied.

    Returns:
        The surviving tokens joined by single spaces.
    """
    def _is_kept(ch):
        # Whitespace is kept so token boundaries survive the filtering.
        return ch.isspace() or '\u0900' <= ch <= '\u097F'

    devanagari_only = ''.join(filter(_is_kept, text.lower()))
    tokens = [tok for tok in devanagari_only.split() if tok not in stop_words]

    if use_stemming:
        tokens = [stem_marathi_word(tok) for tok in tokens]
    elif use_lemmatization:
        tokens = [lemmatize_marathi(tok) for tok in tokens]

    return ' '.join(tokens)

def _clean_and_stem(text):
    """Run preprocess_text with both reduction flags switched on."""
    # With both flags set, preprocess_text's if/elif applies stemming only.
    return preprocess_text(text, use_stemming=True, use_lemmatization=True)

# Cleaned text (no stemming): input to the sentence-embedding model.
df['translated_source'] = df['source_txt'].apply(preprocess_text)
df['translated_plagiarism'] = df['plagarism_txt'].apply(preprocess_text)

# Cleaned + stemmed text: input to the TF-IDF vectorizer.
df['stemmed_srcText'] = df['translated_source'].apply(_clean_and_stem)
df['stemmed_plagText'] = df['translated_plagiarism'].apply(_clean_and_stem)

In [None]:
from sentence_transformers import SentenceTransformer

model_name = 'l3cube-pune/marathi-sentence-similarity-sbert'
# Bound to its own name on purpose: the global ``model`` is the translation
# model that translate_en_to_mr looks up at call time, and rebinding it here
# would break that function if the translation cell were run again.
sbert_model = SentenceTransformer(model_name)

def get_bert_embeddings():
    """Return source-minus-plagiarism sentence embeddings for every row of ``df``.

    Encodes ``df['translated_source']`` and ``df['translated_plagiarism']``
    with ``sbert_model`` and subtracts the second result from the first.

    Returns:
        The element-wise difference of the two ``encode`` outputs.
    """
    source_vectors = sbert_model.encode(df['translated_source'].tolist())
    plagiarism_vectors = sbert_model.encode(df['translated_plagiarism'].tolist())
    return source_vectors - plagiarism_vectors

bertEmbeddings = get_bert_embeddings()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf_embeddings():
    """Return source-minus-plagiarism TF-IDF vectors for every row of ``df``.

    A single vectorizer (at most 400 features) is fitted once on both stemmed
    text columns and then used to transform each column, so column j refers
    to the same term in both matrices and the subtraction is meaningful.
    Calling ``fit_transform`` separately on each column would learn two
    unrelated vocabularies (possibly of different sizes).

    Returns:
        A dense array with one row per row of ``df``.
    """
    source_docs = df['stemmed_srcText'].tolist()
    plagiarism_docs = df['stemmed_plagText'].tolist()

    # NOTE(review): the classifiers applied to these features were fitted
    # elsewhere; ideally the vectorizer fitted at training time would be
    # loaded here instead of fitting a new one on this data - confirm.
    tfidf_vectorizer400 = TfidfVectorizer(max_features=400)
    tfidf_vectorizer400.fit(source_docs + plagiarism_docs)

    source_matrix = tfidf_vectorizer400.transform(source_docs).toarray()
    plagiarism_matrix = tfidf_vectorizer400.transform(plagiarism_docs).toarray()
    return source_matrix - plagiarism_matrix

tfidf_embeddings400 = get_tfidf_embeddings()

# **Proposed Ensemble Model**

In [None]:
import pickle

# NOTE: unpickling can execute arbitrary code - only load files from a
# trusted source.
# Both files are opened together and closed when the block exits.
with open('/content/best_tfidf400_classifiers.pkl', 'rb') as tfidf_file, \
        open('/content/best_bert768_classifiers.pkl', 'rb') as bert_file:
    tfidf_classifiers = pickle.load(tfidf_file)
    bert_classifiers = pickle.load(bert_file)

In [None]:
import numpy as np

# Mixing weights, paired positionally with the classifiers of each family.
tfidf_weights = [0.1, 0.9]
bert_weights = [0.7, 0.3]


def _blend_second_proba_column(classifiers, weights, features):
    """Weighted sum of each classifier's second predict_proba column.

    Classifiers and weights are paired with zip(), so any surplus entries on
    either side are ignored. The result is reshaped to a single column.
    """
    weighted = [
        weight * clf.predict_proba(features)[:, 1]
        for clf, weight in zip(classifiers, weights)
    ]
    return np.array(weighted).sum(axis=0).reshape(-1, 1)


tfidf_predictions = _blend_second_proba_column(
    tfidf_classifiers, tfidf_weights, tfidf_embeddings400
)
bert_predictions = _blend_second_proba_column(
    bert_classifiers, bert_weights, bertEmbeddings
)

In [None]:
def weighted_ensemble(tfidf_preds, bert_preds, tfidf_weight=0.4, bert_weight=0.6):
    """Blend the TF-IDF and BERT scores with a weighted sum.

    Args:
        tfidf_preds: Scores from the TF-IDF classifiers (scalar or array).
        bert_preds: Scores from the BERT classifiers; must be compatible with
            ``tfidf_preds`` under ``+``.
        tfidf_weight: Multiplier for ``tfidf_preds``. Defaults to 0.4.
        bert_weight: Multiplier for ``bert_preds``. Defaults to 0.6.

    Returns:
        ``tfidf_weight * tfidf_preds + bert_weight * bert_preds``.
    """
    return tfidf_weight * tfidf_preds + bert_weight * bert_preds

In [None]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_auc_score)

# Blend the two families, then threshold the blended score at 0.5.
combined_predictions = weighted_ensemble(tfidf_predictions, bert_predictions)
binary_predictions = (combined_predictions >= 0.5).astype(int)

y_true = df["label"]
accuracy, precision, recall, f1 = (
    metric(y_true, binary_predictions)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Metrics are reported as raw fractions in this cell (no percent scaling).
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_label}: {metric_value}")

# **XGBoost with BERT-768**

In [None]:
!pip install flaml

In [None]:
import pickle

# NOTE: unpickling can execute arbitrary code - only load files from a
# trusted source.
with open('/content/xgboost_bert768.pkl', 'rb') as f:
    bert768_xgboost_model = pickle.load(f)

# The file handle is only needed for loading, so scoring runs after it closes.
y_pred = bert768_xgboost_model.predict(bertEmbeddings)

# Metrics are scaled to percentages.
y_true = df["label"]
accuracy = 100 * accuracy_score(y_true, y_pred)
precision = 100 * precision_score(y_true, y_pred, average='binary')
recall = 100 * recall_score(y_true, y_pred, average='binary')
f1 = 100 * f1_score(y_true, y_pred, average='binary')

for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_label}: {metric_value:.2f}%")


# **Random Forest with BERT-768**

In [None]:
from joblib import load


# NOTE: joblib files are pickle-based - only load files from a trusted source.
rf_bert768_model = load('/content/rf_bert768.pkl')
y_pred = rf_bert768_model.predict(bertEmbeddings)

# Metrics are scaled to percentages.
y_true = df["label"]
accuracy = 100 * accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    100 * scorer(y_true, y_pred, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
)

for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_label}: {metric_value:.2f}%")


# **LightGBM with BERT-768**

In [None]:
from joblib import load


# NOTE: joblib files are pickle-based - only load files from a trusted source.
lgbm_bert768_model = load('/content/lgbm_bert768.pkl')
y_pred = lgbm_bert768_model.predict(bertEmbeddings)

y_true = df["label"]

# Compute every metric first (scaled to percentages), then print the report.
accuracy = 100 * accuracy_score(y_true, y_pred)
precision = 100 * precision_score(y_true, y_pred, average='binary')
recall = 100 * recall_score(y_true, y_pred, average='binary')
f1 = 100 * f1_score(y_true, y_pred, average='binary')

report_lines = [
    f"Accuracy: {accuracy:.2f}%",
    f"Precision: {precision:.2f}%",
    f"Recall: {recall:.2f}%",
    f"F1 Score: {f1:.2f}%",
]
for report_line in report_lines:
    print(report_line)