In [None]:
!pip install flaml

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the dataset. Later cells read the 'translated_source',
# 'translated_plagiarism' and 'label' columns from this frame.
df = pd.read_csv('marathiData.csv')

# Preview the first rows only.
df.head()

In [None]:
def getStopWords(path='./stopwords-mr.txt'):
  """Load a stop-word list stored as one word per line.

  Args:
    path: Location of the stop-word file. Defaults to './stopwords-mr.txt',
      so existing zero-argument calls behave as before.

  Returns:
    list[str]: The stop words in file order, stripped of surrounding
    whitespace and with blank lines removed.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  # Decode explicitly as UTF-8: relying on the platform default encoding
  # breaks Devanagari text on systems whose default is not UTF-8.
  # 'utf-8-sig' additionally discards a leading BOM if one is present.
  with open(path, 'r', encoding='utf-8-sig') as f:
    # strip() removes stray '\r' / spaces that would otherwise stop a word
    # from ever matching a token; the filter drops the empty entries that a
    # trailing newline or blank line would produce.
    return [word for word in (line.strip() for line in f) if word]

# Load the stop-word list once and expose it under both names:
# `stopWords` (original spelling) and `stop_words`, which preprocess_text reads.
stop_words = stopWords = getStopWords()
suffixes = ['ता', 'ते', 'तो', 'ल', 'ना', 'णे', 'त', 'य']
def stem_marathi_word(word):
    """Strip the first matching suffix from `suffixes` off a single word.

    Args:
        word: One whitespace-free token.

    Returns:
        The word without its suffix, or the word unchanged when no suffix
        applies or when stripping would leave nothing.
    """
    for suffix in suffixes:
        # Require a non-empty stem: a token that *is* a suffix (e.g. 'त')
        # used to be reduced to '' and silently vanish from the text.
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return word

noun_suffixes = ['आणि', 'े', 'ा', 'नी', 'ची', 'मधील', 'हवे', 'ची', 'चा']
verb_suffixes = ['त', 'तो', 'ते', 'ली', 'ला', 'ले', 'णार', 'त आहे', 'त असतील']

def _strip_longest_suffix(word, candidates):
    """Return `word` minus its longest matching suffix, or None if none applies."""
    # Longest first: with plain list order the one-character 'ा' / 'े' shadowed
    # 'चा' and 'हवे', so those entries could never match. sorted() is stable,
    # so equally long suffixes keep their original relative order.
    for suffix in sorted(candidates, key=len, reverse=True):
        # Never strip a word down to the empty string.
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return None

def lemmatize_marathi(word):
    """Rule-based suffix stripping: verb suffixes are tried before noun suffixes.

    Args:
        word: The string to reduce. Entries of `verb_suffixes` that contain a
            space can only match if a multi-word string is passed in;
            preprocess_text passes whitespace-split tokens.

    Returns:
        The input with one suffix removed (the longest matching verb suffix,
        otherwise the longest matching noun suffix), or the input unchanged
        when nothing matches or stripping would leave an empty string.
    """
    for candidates in (verb_suffixes, noun_suffixes):
        stripped = _strip_longest_suffix(word, candidates)
        if stripped is not None:
            return stripped
    return word

def preprocess_text(text, use_stemming=False, use_lemmatization=False):
    """Reduce a text to space-separated Devanagari word tokens.

    Steps: lowercase, keep only Devanagari letters/signs, drop stop words
    (module-level `stop_words`), then optionally stem or lemmatize each token.

    Args:
        text: The raw text. None and float NaN (missing CSV cells) are treated
            as empty; any other non-string value is converted with str().
        use_stemming: Apply stem_marathi_word to every token.
        use_lemmatization: Apply lemmatize_marathi to every token. Ignored
            when use_stemming is also true (stemming takes precedence).

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Missing values would otherwise fail on .lower() with an AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()

    kept = []
    for char in text:
        if char in '\u200c\u200d':
            # Zero-width (non-)joiners occur inside words; drop them without
            # introducing a word break.
            continue
        in_devanagari_block = '\u0900' <= char <= '\u097F'
        # U+0964-U+0970 are danda, double danda, the digits and the
        # abbreviation sign: they sit inside the Devanagari block but are
        # exactly the "numbers and special characters" this step removes.
        is_digit_or_punctuation = '\u0964' <= char <= '\u0970'
        # Rejected characters become a space rather than disappearing, so that
        # words separated only by punctuation are not fused into one token.
        kept.append(char if in_devanagari_block and not is_digit_or_punctuation else ' ')
    cleaned_text = ''.join(kept)

    # Remove stop words
    tokens = [word for word in cleaned_text.split() if word not in stop_words]

    # Apply stemming or lemmatization if specified (mutually exclusive)
    if use_stemming:
        tokens = [stem_marathi_word(word) for word in tokens]
    elif use_lemmatization:
        tokens = [lemmatize_marathi(word) for word in tokens]

    return ' '.join(tokens)

# Clean both text columns in place (default options: no stemming/lemmatization).
for column in ('translated_source', 'translated_plagiarism'):
    df[column] = df[column].apply(preprocess_text)

# Derive the stemmed variants from the cleaned columns. Both flags are passed,
# as before; preprocess_text checks use_stemming first, so stemming is what runs.
stem_options = {'use_stemming': True, 'use_lemmatization': True}
df['stemmed_srcText'] = df['translated_source'].apply(preprocess_text, **stem_options)
df['stemmed_plagText'] = df['translated_plagiarism'].apply(preprocess_text, **stem_options)

In [None]:
# Display the frame after preprocessing; it now also carries the
# 'stemmed_srcText' and 'stemmed_plagText' columns added above.
df

In [None]:
# TF-IDF vectorizer capped at the 256 highest-frequency terms.
# The default token_pattern, r"(?u)\b\w\w+\b", is unsuitable for Devanagari:
# Python's \w does not match combining vowel signs (matras) or the virama, so
# words are chopped into meaningless fragments. The input text has already
# been reduced to whitespace-separated tokens, so split on whitespace instead.
tfidf_vectorizer256 = TfidfVectorizer(max_features=256, token_pattern=r'(?u)\S+')

In [None]:
# Fit ONE vocabulary on both text columns, then project each column onto it.
# Calling fit_transform() once per column refits the vectorizer the second
# time, so column j of the two matrices would stand for different terms (and
# the matrices could even differ in width), making the subtraction below
# meaningless.
# NOTE(review): the vocabulary/IDF are still learned from all rows, including
# those that later end up in the test split.
combined_corpus = df['stemmed_srcText'].tolist() + df['stemmed_plagText'].tolist()
tfidf_vectorizer256.fit(combined_corpus)

tfidf_embeddings_source256 = tfidf_vectorizer256.transform(df['stemmed_srcText'].tolist()).toarray()
tfidf_embeddings_plag256 = tfidf_vectorizer256.transform(df['stemmed_plagText'].tolist()).toarray()

# Pair feature: element-wise difference of the two aligned TF-IDF vectors.
tfidf_embeddings256 = tfidf_embeddings_source256 - tfidf_embeddings_plag256

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 80/20 split, stratified on the label, with a fixed seed.
X_train256, X_test256, y_train256, y_test256 = train_test_split(
    tfidf_embeddings256,
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train256)
X_train256, X_test256 = scaler.transform(X_train256), scaler.transform(X_test256)

# **XGBOOST**

In [None]:
from flaml import AutoML
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Search configuration: FLAML restricted to the "xgboost" estimator.
automl_settings = dict(
    time_budget=1000,      # total running time in seconds
    metric='accuracy',     # evaluation metric
    task='classification', # task type
    n_jobs=-1,
    estimator_list=["xgboost"],
    early_stop=True,
)

# Run the search on the training split.
automl = AutoML()
automl.fit(X_train256, y_train256, **automl_settings)

# Report what the search selected.
print("Best model:", automl.best_estimator)

# Score the held-out split.
y_test_pred = automl.predict(X_test256)

# Scalar scores are stored as percentages; the confusion matrix is appended last.
metrics_test = {
    label: scorer(y_test256, y_test_pred) * 100
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
}
metrics_test["Confusion Matrix"] = confusion_matrix(y_test256, y_test_pred)

print("\nTest Set Metrics:")
for metric, value in metrics_test.items():
    if metric == "Confusion Matrix":
        print(f"{metric}:\n{value}")
    else:
        print(f"{metric}: {value:.2f}%")


# **LGBM**

In [None]:
from flaml import AutoML
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Search configuration: FLAML restricted to the "lgbm" estimator.
automl_settings = dict(
    time_budget=1000,      # total running time in seconds
    metric='accuracy',     # evaluation metric
    task='classification', # task type
    n_jobs=-1,
    estimator_list=["lgbm"],
    early_stop=True,
)

# Run the search on the training split.
automl = AutoML()
automl.fit(X_train256, y_train256, **automl_settings)

# Report what the search selected.
print("Best model:", automl.best_estimator)

# Score the held-out split.
y_test_pred = automl.predict(X_test256)

# Scalar scores are stored as percentages; the confusion matrix is appended last.
metrics_test = {
    label: scorer(y_test256, y_test_pred) * 100
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
}
metrics_test["Confusion Matrix"] = confusion_matrix(y_test256, y_test_pred)

print("\nTest Set Metrics:")
for metric, value in metrics_test.items():
    if metric == "Confusion Matrix":
        print(f"{metric}:\n{value}")
    else:
        print(f"{metric}: {value:.2f}%")


# **Random Forest**

In [None]:
from flaml import AutoML
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Search configuration: FLAML restricted to the "rf" estimator.
automl_settings = dict(
    time_budget=1000,      # total running time in seconds
    metric='accuracy',     # evaluation metric
    task='classification', # task type
    n_jobs=-1,
    estimator_list=["rf"],
    early_stop=True,
)

# Run the search on the training split.
automl = AutoML()
automl.fit(X_train256, y_train256, **automl_settings)

# Report what the search selected.
print("Best model:", automl.best_estimator)

# Score the held-out split.
y_test_pred = automl.predict(X_test256)

# Scalar scores are stored as percentages; the confusion matrix is appended last.
metrics_test = {
    label: scorer(y_test256, y_test_pred) * 100
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
}
metrics_test["Confusion Matrix"] = confusion_matrix(y_test256, y_test_pred)

print("\nTest Set Metrics:")
for metric, value in metrics_test.items():
    if metric == "Confusion Matrix":
        print(f"{metric}:\n{value}")
    else:
        print(f"{metric}: {value:.2f}%")


# **Logistic Regression lrl2**

In [None]:
from flaml import AutoML
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Search configuration: FLAML restricted to the "lrl2" estimator.
automl_settings = dict(
    time_budget=1000,      # total running time in seconds
    metric='accuracy',     # evaluation metric
    task='classification', # task type
    n_jobs=-1,
    estimator_list=["lrl2"],
    early_stop=True,
)

# Run the search on the training split.
automl = AutoML()
automl.fit(X_train256, y_train256, **automl_settings)

# Report what the search selected.
print("Best model:", automl.best_estimator)

# Score the held-out split.
y_test_pred = automl.predict(X_test256)

# Scalar scores are stored as percentages; the confusion matrix is appended last.
metrics_test = {
    label: scorer(y_test256, y_test_pred) * 100
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
}
metrics_test["Confusion Matrix"] = confusion_matrix(y_test256, y_test_pred)

print("\nTest Set Metrics:")
for metric, value in metrics_test.items():
    if metric == "Confusion Matrix":
        print(f"{metric}:\n{value}")
    else:
        print(f"{metric}: {value:.2f}%")


# **SVC**

In [None]:
from flaml import AutoML
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Search configuration: FLAML restricted to the "svc" estimator.
automl_settings = dict(
    time_budget=1000,      # total running time in seconds
    metric='accuracy',     # evaluation metric
    task='classification', # task type
    n_jobs=-1,
    estimator_list=["svc"],
    early_stop=True,
)

# Run the search on the training split.
automl = AutoML()
automl.fit(X_train256, y_train256, **automl_settings)

# Report what the search selected.
print("Best model:", automl.best_estimator)

# Score the held-out split.
y_test_pred = automl.predict(X_test256)

# Scalar scores are stored as percentages; the confusion matrix is appended last.
metrics_test = {
    label: scorer(y_test256, y_test_pred) * 100
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
}
metrics_test["Confusion Matrix"] = confusion_matrix(y_test256, y_test_pred)

print("\nTest Set Metrics:")
for metric, value in metrics_test.items():
    if metric == "Confusion Matrix":
        print(f"{metric}:\n{value}")
    else:
        print(f"{metric}: {value:.2f}%")


# **KNN**

In [None]:
from flaml import AutoML
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Search configuration: FLAML restricted to the "kneighbor" estimator.
automl_settings = dict(
    time_budget=1000,      # total running time in seconds
    metric='accuracy',     # evaluation metric
    task='classification', # task type
    n_jobs=-1,
    estimator_list=["kneighbor"],
    early_stop=True,
)

# Run the search on the training split.
automl = AutoML()
automl.fit(X_train256, y_train256, **automl_settings)

# Report what the search selected.
print("Best model:", automl.best_estimator)

# Score the held-out split.
y_test_pred = automl.predict(X_test256)

# Scalar scores are stored as percentages; the confusion matrix is appended last.
metrics_test = {
    label: scorer(y_test256, y_test_pred) * 100
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
}
metrics_test["Confusion Matrix"] = confusion_matrix(y_test256, y_test_pred)

print("\nTest Set Metrics:")
for metric, value in metrics_test.items():
    if metric == "Confusion Matrix":
        print(f"{metric}:\n{value}")
    else:
        print(f"{metric}: {value:.2f}%")


# **Naive Bayes**

In [None]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Tune GaussianNB's only searched knob, var_smoothing, over 100
# log-spaced values from 1e0 down to 1e-9.
model = GaussianNB()
param_grid = dict(var_smoothing=np.logspace(0, -9, num=100))

# 5-fold cross-validated grid search scored on accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train256, y_train256)

print("Best Hyperparameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate the refitted best model on the held-out split.
# precision_score / recall_score / f1_score come from the notebook's earlier imports.
y_pred = best_model.predict(X_test256)
accuracy, precision, recall, f1 = (
    scorer(y_test256, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test256, y_pred)

for heading, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(heading, score)
print("Confusion Matrix:\n", conf_matrix)

# **Decision Tree**

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Seed the tree: the grid below includes max_features='sqrt'/'log2', which
# samples features at every split, so an unseeded tree yields different
# "best" hyperparameters and test scores on every run. 42 matches the seed
# already used for the train/test split.
model = DecisionTreeClassifier(random_state=42)

# Set up the parameter grid
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

# Configure Grid Search: 5-fold cross-validation, scored on accuracy
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

# Fit on the training split only
grid_search.fit(X_train256, y_train256)

# Get the best model
best_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)

# Predict on the test set
y_pred = best_model.predict(X_test256)

# Evaluate the model
accuracy = accuracy_score(y_test256, y_pred)
precision = precision_score(y_test256, y_pred)
recall = recall_score(y_test256, y_pred)
f1 = f1_score(y_test256, y_pred)
conf_matrix = confusion_matrix(y_test256, y_pred)

# Print the metrics (fractions in [0, 1], not percentages)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

# **AdaBoost**

In [None]:
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Define the model. The seed makes the boosting run repeatable; 42 matches
# the seed already used for the train/test split.
model = AdaBoostClassifier(random_state=42)

# Set up the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0, 10],
    # Decision stump. The grid used to list `None` as a second option, but
    # AdaBoostClassifier's default estimator is this same depth-1 tree, so
    # every configuration was fitted twice for no gain.
    'estimator': [DecisionTreeClassifier(max_depth=1)]
}

# Configure Grid Search: 5-fold cross-validation, scored on accuracy
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

# Fit on the training split only
grid_search.fit(X_train256, y_train256)

# Get the best model
best_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)

# Predict on the test set
y_pred = best_model.predict(X_test256)

# Evaluate the model
accuracy = accuracy_score(y_test256, y_pred)
precision = precision_score(y_test256, y_pred)
recall = recall_score(y_test256, y_pred)
f1 = f1_score(y_test256, y_pred)
conf_matrix = confusion_matrix(y_test256, y_pred)

# Print the metrics (fractions in [0, 1], not percentages)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)