In [None]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the paired-text dataset. Later cells read the columns
# 'translated_source', 'translated_plagiarism' and 'label' from this frame.
df=pd.read_csv('marathiData.csv')
# Preview the first rows as a quick sanity check of the load.
df.head()

In [None]:
def getStopWords(path='./stopwords-mr.txt'):
  """Read a stop-word list from a text file with one word per line.

  Args:
    path: Location of the stop-word file. Defaults to the file this
      notebook has always used, so existing no-argument calls still work.

  Returns:
    A list of stop words in file order. Surrounding whitespace is stripped
    and blank lines are skipped, so the list never contains '' (which the
    previous ``split('\\n')`` produced for a trailing newline).

  Raises:
    OSError: If the file cannot be opened.
  """
  # Decode explicitly: the platform default encoding is not always UTF-8,
  # and 'utf-8-sig' also discards a leading byte-order mark that would
  # otherwise stay glued to the first stop word.
  with open(path, 'r', encoding='utf-8-sig') as f:
    lines = f.read().splitlines()
  return [line.strip() for line in lines if line.strip()]

# Load the stop-word list once and expose it under both names used in this
# notebook: `stopWords` and `stop_words` refer to the same list object.
stop_words = stopWords = getStopWords()
# Suffixes removed by the rule-based stemmer; they are tried in list order.
suffixes = ['ता', 'ते', 'तो', 'ल', 'ना', 'णे', 'त', 'य']
def stem_marathi_word(word):
    """Strip the first matching suffix from ``word``.

    Args:
        word: A single token.

    Returns:
        ``word`` without the first suffix in ``suffixes`` that it ends with,
        or ``word`` unchanged when no suffix matches. A word that consists
        only of the suffix is returned unchanged, so the stemmer never
        produces an empty token.
    """
    for suffix in suffixes:
        # Require something to be left after stripping; otherwise a token
        # such as 'तो' would be reduced to ''.
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return word

noun_suffixes = ['आणि', 'े', 'ा', 'नी', 'ची', 'मधील', 'हवे', 'ची', 'चा']
# NOTE(review): 'त आहे' and 'त असतील' contain a space; preprocess_text only
# passes whitespace-split tokens, so from that caller they can never match.
verb_suffixes = ['त', 'तो', 'ते', 'ली', 'ला', 'ले', 'णार', 'त आहे', 'त असतील']

# Longest suffix first. In the declared order the one-character noun suffixes
# 'े' and 'ा' come before 'हवे' and 'चा', which made the longer rules
# unreachable. Duplicates ('ची') are dropped.
_VERB_SUFFIXES_LONGEST_FIRST = sorted(set(verb_suffixes), key=len, reverse=True)
_NOUN_SUFFIXES_LONGEST_FIRST = sorted(set(noun_suffixes), key=len, reverse=True)


def _strip_first_suffix(word, ordered_suffixes):
    """Return ``word`` minus the first matching suffix, or None if none match."""
    for suffix in ordered_suffixes:
        # Never strip the whole word: an empty lemma is useless downstream.
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return None


def lemmatize_marathi(word):
    """Rule-based lemmatizer: strip one verb suffix, else one noun suffix.

    Verb suffixes take priority over noun suffixes. Within each group the
    longest matching suffix is removed.

    Args:
        word: A single token.

    Returns:
        The token with one suffix removed, or the token unchanged when no
        suffix applies or when stripping would leave an empty string.
    """
    for ordered_suffixes in (_VERB_SUFFIXES_LONGEST_FIRST, _NOUN_SUFFIXES_LONGEST_FIRST):
        stripped = _strip_first_suffix(word, ordered_suffixes)
        if stripped is not None:
            return stripped
    return word

def preprocess_text(text, use_stemming=False, use_lemmatization=False):
    """Clean a Marathi string and optionally stem or lemmatize its tokens.

    Steps: lowercase, keep only Devanagari letters/signs and whitespace,
    drop stop words, then apply at most one normalizer.

    Args:
        text: Raw text. Any non-string value (e.g. NaN from an empty CSV
            cell, or None) is treated as missing.
        use_stemming: Apply ``stem_marathi_word`` to every token.
        use_lemmatization: Apply ``lemmatize_marathi`` to every token. It is
            ignored when ``use_stemming`` is also true, because the two
            options are mutually exclusive and stemming takes precedence.

    Returns:
        The cleaned tokens joined by single spaces; '' for missing input.
    """
    # Missing values are not strings and have no .lower(); map them to an
    # empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()

    # Remove numbers and special characters. The Devanagari block also
    # holds danda punctuation, the digits and the abbreviation sign
    # (U+0964..U+0970); those are excluded so that "numbers and special
    # characters" really are removed.
    cleaned_text = ''.join(
        char for char in text
        if char.isspace()
        or ('\u0900' <= char <= '\u097F' and not ('\u0964' <= char <= '\u0970'))
    )

    # Remove stop words (a set makes each membership test constant time)
    blocked = set(stop_words)
    tokens = [word for word in cleaned_text.split() if word not in blocked]

    # Apply stemming or lemmatization if specified
    if use_stemming:
        tokens = [stem_marathi_word(word) for word in tokens]
    elif use_lemmatization:
        tokens = [lemmatize_marathi(word) for word in tokens]

    # Skip tokens a normalizer reduced to '' so no double spaces appear.
    return ' '.join(token for token in tokens if token)

# Clean both raw text columns in place (lowercase, Devanagari filter and
# stop-word removal; no stemming or lemmatization at this stage).
df['translated_source'] = df['translated_source'].apply(preprocess_text)
df['translated_plagiarism'] = df['translated_plagiarism'].apply(preprocess_text)

# Derive the normalized variants from the already-cleaned columns.
# NOTE(review): both flags are passed, but preprocess_text chooses with
# if/elif, so only stemming is applied here and use_lemmatization is ignored.
df['stemmed_srcText'] = df['translated_source'].apply(
    lambda text: preprocess_text(text, use_stemming=True, use_lemmatization=True)
)
df['stemmed_plagText'] = df['translated_plagiarism'].apply(
    lambda text: preprocess_text(text, use_stemming=True, use_lemmatization=True)
)


In [None]:
# Display the frame, which now also holds the cleaned text columns and the
# derived 'stemmed_srcText' / 'stemmed_plagText' columns.
df

In [None]:
# Load the precomputed BERT embeddings from disk.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open("bertEmbeddings.pkl","rb") as f:
    bertEmbeddings=pickle.load(f)

# Show the shape as a sanity check.
# Presumably one row per row of `df`, since both are later split with the
# same 80/20 cut — TODO confirm.
bertEmbeddings.shape

In [None]:
# TF-IDF vectorizer limited to the 400 most frequent terms.
# The default token_pattern is built on \w, which does not match Devanagari
# combining vowel signs (matras), so it would split Marathi words at every
# matra. The input text is already reduced to Devanagari tokens separated by
# whitespace, so each whitespace-delimited run is taken as one token.
tfidf_vectorizer400 = TfidfVectorizer(max_features=400, token_pattern=r'(?u)\S+')

In [None]:
# Fit the vectorizer ONCE on both text columns so that the source and the
# plagiarism matrices share one vocabulary and one IDF weighting. Calling
# fit_transform separately on each column refits the vectorizer, so column i
# would stand for a different term in each matrix and the subtraction below
# would compare unrelated features.
# NOTE(review): the vocabulary/IDF is learned from all rows, including the
# rows later used as the test split; fit on the training rows only if a
# strictly leakage-free evaluation is required.
source_texts = df['stemmed_srcText'].tolist()
plag_texts = df['stemmed_plagText'].tolist()
tfidf_vectorizer400.fit(source_texts + plag_texts)

tfidf_embeddings_source400 = tfidf_vectorizer400.transform(source_texts).toarray()
tfidf_embeddings_plag400 = tfidf_vectorizer400.transform(plag_texts).toarray()

# Per-pair feature vector: term-wise TF-IDF difference, source minus plagiarism.
tfidf_embeddings400 = tfidf_embeddings_source400 - tfidf_embeddings_plag400

In [None]:
def getTrainTestSplit(bertEmbeddings, tfidf_embeddings, labels, train_fraction=0.8):
    """Split features and labels sequentially into train and test parts.

    The first ``train_fraction`` of the rows becomes the training part and
    the rest the test part. Rows are NOT shuffled.

    Args:
        bertEmbeddings: 2-D array-like with one row per sample.
        tfidf_embeddings: 2-D array-like with one row per sample, or an
            empty sequence / None when no TF-IDF features are available.
        labels: Sequence of targets, one per sample.
        train_fraction: Share of rows used for training (0 < value < 1).
            Defaults to 0.8, the split this notebook has always used.

    Returns:
        Without TF-IDF features, a 4-tuple
        ``(bertTrain, bertTest, y_train, y_test)``.
        With TF-IDF features, an 8-tuple
        ``(X_train, X_test, y_train, y_test, bertTrain, bertTest,
        tfidfTrain, tfidfTest)`` where ``X_*`` are the BERT and TF-IDF
        features concatenated column-wise.

    Raises:
        ValueError: If ``train_fraction`` is outside (0, 1) or the inputs do
            not all have the same number of rows.
    """
    if not 0 < train_fraction < 1:
        raise ValueError(f"train_fraction must be between 0 and 1, got {train_fraction!r}")

    n_samples = len(bertEmbeddings)
    # Mismatched lengths would be cut at different rows and silently pair
    # features with the wrong labels, so reject them up front.
    if len(labels) != n_samples:
        raise ValueError(
            f"labels has {len(labels)} rows but bertEmbeddings has {n_samples}"
        )

    has_tfidf = tfidf_embeddings is not None and len(tfidf_embeddings) > 0
    if has_tfidf and len(tfidf_embeddings) != n_samples:
        raise ValueError(
            f"tfidf_embeddings has {len(tfidf_embeddings)} rows but bertEmbeddings has {n_samples}"
        )

    cut = int(n_samples * train_fraction)

    # Splitting the bertEmbeddings and labels into train/test
    bertEmbeddingsTrain, bertEmbeddingsTest = bertEmbeddings[:cut], bertEmbeddings[cut:]
    y_train, y_test = labels[:cut], labels[cut:]

    # If there are no tfidf_embeddings, return only bert embeddings
    if not has_tfidf:
        return bertEmbeddingsTrain, bertEmbeddingsTest, y_train, y_test

    # Splitting tfidf_embeddings into train/test
    tfidfEmbeddingsTrain, tfidfEmbeddingsTest = tfidf_embeddings[:cut], tfidf_embeddings[cut:]

    # Concatenate the bert and tfidf embeddings for train and test
    X_train = np.concatenate([bertEmbeddingsTrain, tfidfEmbeddingsTrain], axis=1)
    X_test = np.concatenate([bertEmbeddingsTest, tfidfEmbeddingsTest], axis=1)

    return X_train, X_test, y_train, y_test, bertEmbeddingsTrain, bertEmbeddingsTest, tfidfEmbeddingsTrain, tfidfEmbeddingsTest

In [None]:
# Sequential 80/20 split. The concatenated BERT+TF-IDF matrices (first two
# return values) are discarded; each model family trains on its own features.
(
    _,
    _,
    y_train,
    y_test,
    bertEmbeddingsTrain,
    bertEmbeddingsTest,
    tfidfEmbeddingsTrain,
    tfidfEmbeddingsTest,
) = getTrainTestSplit(bertEmbeddings, tfidf_embeddings400, df['label'].to_list())

In [None]:
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Base models trained on the TF-IDF difference features. List order matters:
# the ensemble weights found later are matched to these models by position.
# NOTE(review): the literal hyperparameters look like the output of a tuning
# run (n_estimators=1 means a single boosting round) — confirm they are still
# intended. No random_state is set, so results may vary between runs.
tfidf_classifiers = [
    LogisticRegression(C=0.13626934782420866, n_jobs=-1),
    LGBMClassifier(colsample_bytree=0.928627577871539,
               learning_rate=0.18466649352827333, max_bin=15,
               min_child_samples=12, n_estimators=1, n_jobs=-1, num_leaves=8,
               reg_alpha=0.0019393893820794101, reg_lambda=0.15928832087494818,
               verbose=-1),
]

# Base models trained on the BERT embeddings. List order matters: the
# ensemble weights found later are matched to these models by position.
# NOTE(review): the XGBClassifier arguments look like a pasted estimator repr;
# random_state=None means the model is not seeded, so results may vary
# between runs — confirm whether reproducibility is required.
bert_classifiers = [
    XGBClassifier(base_score=None, booster=None, callbacks=[],
              colsample_bylevel=0.1978085011113737, colsample_bynode=None,
              colsample_bytree=0.4437517454987611, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=None,
              grow_policy='lossguide', importance_type=None,
              interaction_constraints=None, learning_rate=0.16506372545399872,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=0, max_leaves=20,
              min_child_weight=0.26976026917736995,
              monotone_constraints=None, multi_strategy=None, n_estimators=371,
              n_jobs=-1, num_parallel_tree=None, random_state=None),

    # probability=True is required because the ensemble calls predict_proba.
    # Per the scikit-learn docs, `degree` is only used by the 'poly' kernel,
    # so degree=2 has no effect with kernel='rbf'.
    SVC(
        probability=True,
        kernel='rbf',
        C=100,
        degree=2,
        gamma='scale'
        ),
]

In [None]:
# Train every base model on the feature set of its own family: TF-IDF models
# first, then BERT models, all against the same training labels.
for model_group, train_features in (
    (tfidf_classifiers, tfidfEmbeddingsTrain),
    (bert_classifiers, bertEmbeddingsTrain),
):
    for clf in model_group:
        clf.fit(train_features, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _simplex_weight_grid(n_weights, steps=10):
    """List every tuple of ``n_weights`` multiples of 1/steps that sums to 1.

    The grid is enumerated with integers and divided only at the end.
    Deriving loop bounds from float arithmetic (``int((1 - x) * 10)``)
    truncates values such as 1.9999999999999996 and silently drops
    combinations like (0.8, 0.2, 0.0).
    """
    def _compositions(parts, total):
        # All ordered ways to write `total` as a sum of `parts` non-negative ints.
        if parts == 1:
            yield (total,)
            return
        for first in range(total + 1):
            for rest in _compositions(parts - 1, total - first):
                yield (first,) + rest

    return [
        tuple(count / steps for count in combo)
        for combo in _compositions(n_weights, steps)
    ]


def _search_ensemble_weights(classifiers, embeddings, y_test, n_weights):
    """Grid-search blend weights for the first ``n_weights`` classifiers.

    Each candidate blends the positive-class probabilities, thresholds the
    blend at 0.5 and is scored by accuracy. The first candidate with the
    highest accuracy wins. Its metrics are printed and its weights returned.
    """
    if len(classifiers) < n_weights:
        raise ValueError(
            f"expected at least {n_weights} classifiers, got {len(classifiers)}"
        )

    # Probabilities do not depend on the weights, so compute them once.
    proba = [clf.predict_proba(embeddings)[:, 1] for clf in classifiers[:n_weights]]

    best_accuracy = -1
    best_precision = 0
    best_recall = 0
    best_f1 = 0
    best_weights = None

    for weights in _simplex_weight_grid(n_weights):
        blended = sum(weight * p for weight, p in zip(weights, proba))
        y_pred = (blended >= 0.5).astype(int)

        accuracy = accuracy_score(y_test, y_pred)
        # Strict '>' keeps the earliest candidate on ties. The remaining
        # metrics are only needed for a new best candidate.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_precision = precision_score(y_test, y_pred, average='binary')
            best_recall = recall_score(y_test, y_pred, average='binary')
            best_f1 = f1_score(y_test, y_pred, average='binary')
            best_weights = weights

    print(f"\nBest Accuracy: {best_accuracy:.4f}, Precision: {best_precision:.4f}, Recall: {best_recall:.4f}, F1 Score: {best_f1:.4f}, Best Weights: {best_weights}")
    return best_weights


def evaluate_weight_doublets(classifiers, embeddings, y_test):
    """Find the best pair of blend weights for two classifiers.

    Args:
        classifiers: Sequence of fitted models exposing ``predict_proba``;
            the first two are blended.
        embeddings: Feature matrix passed to each ``predict_proba``.
        y_test: True binary labels for ``embeddings``.

    Returns:
        The ``(w0, w1)`` tuple (multiples of 0.1 summing to 1) with the
        highest accuracy; the earliest such tuple on ties.

    Raises:
        ValueError: If fewer than two classifiers are given.

    NOTE(review): weights chosen on the same labels that are later used for
    reporting give optimistic scores; prefer a separate validation split.
    """
    return _search_ensemble_weights(classifiers, embeddings, y_test, 2)


def evaluate_weight_triplets(classifiers, embeddings, y_test):
    """Find the best triple of blend weights for three classifiers.

    Args:
        classifiers: Sequence of fitted models exposing ``predict_proba``;
            the first three are blended.
        embeddings: Feature matrix passed to each ``predict_proba``.
        y_test: True binary labels for ``embeddings``.

    Returns:
        The ``(w0, w1, w2)`` tuple (multiples of 0.1 summing to 1) with the
        highest accuracy; the earliest such tuple on ties.

    Raises:
        ValueError: If fewer than three classifiers are given.

    NOTE(review): weights chosen on the same labels that are later used for
    reporting give optimistic scores; prefer a separate validation split.
    """
    return _search_ensemble_weights(classifiers, embeddings, y_test, 3)


In [None]:
# Grid-search the blend weights of the two TF-IDF models.
# NOTE(review): the search is scored on the test split.
best_weights_for_tfidf_models = evaluate_weight_doublets(
    classifiers=tfidf_classifiers,
    embeddings=tfidfEmbeddingsTest,
    y_test=y_test,
)

In [None]:
# Grid-search the blend weights of the two BERT models.
# NOTE(review): the search is scored on the test split.
best_weights_for_bert_models = evaluate_weight_doublets(
    classifiers=bert_classifiers,
    embeddings=bertEmbeddingsTest,
    y_test=y_test,
)

In [None]:
# Blend each family's positive-class probabilities with the weights found
# above, then store the result as a column vector of shape (n_samples, 1).
tfidf_predictions = np.sum(
    [
        weight * clf.predict_proba(tfidfEmbeddingsTest)[:, 1]
        for clf, weight in zip(tfidf_classifiers, best_weights_for_tfidf_models)
    ],
    axis=0,
).reshape(-1, 1)

bert_predictions = np.sum(
    [
        weight * clf.predict_proba(bertEmbeddingsTest)[:, 1]
        for clf, weight in zip(bert_classifiers, best_weights_for_bert_models)
    ],
    axis=0,
).reshape(-1, 1)

In [None]:
def weighted_ensemble(tfidf_preds, bert_preds, w_tfidf, w_bert):
    """Return the weighted sum of the TF-IDF and BERT prediction scores.

    Args:
        tfidf_preds: Scores from the TF-IDF models (scalar or array).
        bert_preds: Scores from the BERT models (scalar or array).
        w_tfidf: Weight applied to ``tfidf_preds``.
        w_bert: Weight applied to ``bert_preds``.

    Returns:
        ``w_tfidf * tfidf_preds + w_bert * bert_preds``.
    """
    tfidf_part = w_tfidf * tfidf_preds
    bert_part = w_bert * bert_preds
    return tfidf_part + bert_part

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, roc_auc_score)

# One metrics record is collected per TF-IDF/BERT weight setting.
results = []

# Sweep the TF-IDF weight over eleven evenly spaced values from 0 to 1;
# the BERT weight is always the remainder.
for w_tfidf in np.linspace(0, 1, 11):
    w_bert = 1 - w_tfidf

    # Blend the two model families, then threshold the blend at 0.5.
    combined_predictions = weighted_ensemble(tfidf_predictions, bert_predictions, w_tfidf, w_bert)
    binary_predictions = (combined_predictions >= 0.5).astype(int)

    # Threshold-based metrics use the hard predictions; AUC uses the scores.
    accuracy, precision, recall, f1 = (
        score_fn(y_test, binary_predictions)
        for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
    )
    auc_score = roc_auc_score(y_test, combined_predictions)

    # Confusion matrix normalized over the true labels (each row sums to 1).
    cm = confusion_matrix(y_test, binary_predictions, normalize='true')

    results.append(dict([
        ('Weight TF-IDF', w_tfidf),
        ('Weight BERT', w_bert),
        ('Accuracy', accuracy),
        ('Precision', precision),
        ('Recall', recall),
        ('F1 Score', f1),
        ('AUC Score', auc_score),
    ]))

    # Report this weight setting.
    summary = (
        f"w_tfidf: {w_tfidf:.3f}, w_bert: {w_bert:.3f}, "
        f"Accuracy: {accuracy * 100:.3f}%, "
        f"Precision: {precision * 100:.3f}%, "
        f"Recall: {recall * 100:.3f}%, "
        f"F1 Score: {f1 * 100:.3f}%, "
        f"AUC Score: {auc_score * 100:.3f}%"
    )
    print(summary)
    print(f"Normalized Confusion Matrix:\n{cm}\n")

# Tabulate the sweep: one row per weight setting.
results_df = pd.DataFrame(results)

# Render every metric column as a percentage string with two decimals.
# The weight columns stay numeric.
for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC Score'):
    results_df[metric_name] = results_df[metric_name].mul(100).round(2).astype(str) + '%'

# Pick the record with the highest accuracy from the raw (numeric) results;
# max() keeps the earliest record on ties.
best_result = max(results, key=lambda record: record['Accuracy'])
best_w_tfidf, best_w_bert = best_result['Weight TF-IDF'], best_result['Weight BERT']
best_accuracy, best_precision, best_recall, best_f1, best_auc_score = (
    best_result[metric_key]
    for metric_key in ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC Score')
)

# Report the winning setting. These metrics are raw fractions, not the
# percentage strings stored in results_df.
print(
    "\nBest Result:",
    f"Best Weights -> TF-IDF: {best_w_tfidf:.3f}, BERT: {best_w_bert:.3f}",
    f"Best Accuracy: {best_accuracy}",
    f"Best Precision: {best_precision}",
    f"Best Recall: {best_recall}",
    f"Best F1 Score: {best_f1}",
    f"Best AUC Score: {best_auc_score}",
    sep="\n",
)

# Persist the formatted sweep table without the index column.
results_df.to_csv('ensemble_results.csv', index=False)