In [None]:
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Read the Marathi sentence-pair dataset and preview its first rows.
df = pd.read_csv('marathiData.csv')
df.head()

Unnamed: 0,label,translated_source,translated_plagiarism
0,0,घोड्यावर असलेला माणूस तुटलेल्या विमानावर उडी म...,"एखादी व्यक्ती डायनरमध्ये आहे, ऑमलेट मागवत आहे."
1,1,घोड्यावर असलेला माणूस तुटलेल्या विमानावर उडी म...,"एखादी व्यक्ती घराबाहेर, घोड्यावर असते."
2,1,मुलांना हसताना आणि कॅमेऱ्यावर हात हलवताना,मुले उपस्थित आहेत.
3,0,मुलांना हसताना आणि कॅमेऱ्यावर हात हलवताना,मुले भुरळत आहेत.
4,0,लाल पुलाच्या मधोमध एक मुलगा स्केटबोर्डवर उडी म...,मुलगा फूटपाथवर स्केट करतो.


In [None]:
def getStopWords(path='./stopwords-mr.txt'):
  """Load the Marathi stop-word list, one word per line.

  Args:
    path: Location of the stop-word file. Defaults to the file this
      notebook has always read.

  Returns:
    list[str]: Stop words in file order, stripped of surrounding
    whitespace, with blank lines dropped.

  Raises:
    OSError: If the file cannot be opened.
  """
  # The file holds Devanagari text, so decode it explicitly instead of relying
  # on the platform default encoding; 'utf-8-sig' also tolerates a leading BOM.
  with open(path, 'r', encoding='utf-8-sig') as f:
    # Iterating the file handles both \n and \r\n endings. Stripping matters
    # because a stray '\r' or trailing space would stop a word from ever
    # matching the whitespace-split tokens it is compared against.
    stripped_lines = (line.strip() for line in f)
    return [word for word in stripped_lines if word]

# Load the stop-word list once; both names are bound to the same list object.
stop_words = stopWords = getStopWords()
# Endings removed by the rule-based stemmer, tried in this order.
suffixes = ['ता', 'ते', 'तो', 'ल', 'ना', 'णे', 'त', 'य']
def stem_marathi_word(word):
    """Strip the first matching entry of ``suffixes`` from the end of ``word``.

    Args:
        word: A single token.

    Returns:
        The word without the matched suffix, or the word unchanged when no
        suffix matches or when stripping would leave nothing.
    """
    for suffix in suffixes:
        # Require a non-empty stem: a token that *is* a suffix (e.g. 'त')
        # used to collapse to '', leaving empty tokens in the joined text.
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return word

# Suffix inventories for the rule-based lemmatizer. Entries containing a space
# can never match a single whitespace-split token; they are kept as-is.
noun_suffixes = ['आणि', 'े', 'ा', 'नी', 'ची', 'मधील', 'हवे', 'ची', 'चा']
verb_suffixes = ['त', 'तो', 'ते', 'ली', 'ला', 'ले', 'णार', 'त आहे', 'त असतील']

def _strip_longest_suffix(word, candidates):
    """Return ``word`` minus its longest matching suffix, or None if none match."""
    # Longest first: in list order the one-character 'े' / 'ा' matched before
    # 'हवे' / 'चा' and made those longer suffixes unreachable. sorted() is
    # stable, so equal-length suffixes keep their original relative order.
    for suffix in sorted(candidates, key=len, reverse=True):
        # Never strip the whole token; an empty lemma carries no information.
        if len(word) > len(suffix) and word.endswith(suffix):
            return word[:-len(suffix)]
    return None

def lemmatize_marathi(word):
    """Strip one verb suffix, or failing that one noun suffix, from ``word``.

    Verb suffixes take priority over noun suffixes. Within each group the
    longest matching suffix is removed.

    Args:
        word: A single token.

    Returns:
        The token with one suffix removed, or the token unchanged when nothing
        matches or when stripping would leave an empty string.
    """
    for candidates in (verb_suffixes, noun_suffixes):
        stripped = _strip_longest_suffix(word, candidates)
        if stripped is not None:
            return stripped
    return word

def preprocess_text(text, use_stemming=False, use_lemmatization=False):
    """Clean a Marathi sentence and optionally reduce its tokens.

    Steps: lowercase, keep only Devanagari letters/signs and whitespace, drop
    tokens found in ``stop_words``, then optionally stem or lemmatize.

    Args:
        text: Raw sentence. Non-string values (e.g. NaN from an empty CSV
            cell, or None) are treated as empty text.
        use_stemming: Apply ``stem_marathi_word`` to every token.
        use_lemmatization: Apply ``lemmatize_marathi`` to every token. Only
            takes effect when ``use_stemming`` is False; if both flags are
            True, stemming alone is applied.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Missing cells arrive from pandas as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()

    # Remove numbers and special characters. U+0964-U+096F (danda, double
    # danda and the digits ०-९) sit inside the Devanagari block, so a plain
    # block-range test would let them through; exclude them explicitly.
    cleaned_text = ''.join(
        char for char in text
        if char.isspace()
        or ('\u0900' <= char <= '\u097F' and not ('\u0964' <= char <= '\u096F'))
    )

    # Remove stop words
    tokens = [word for word in cleaned_text.split() if word not in stop_words]

    # Apply stemming or lemmatization if specified (stemming takes precedence)
    if use_stemming:
        tokens = [stem_marathi_word(word) for word in tokens]
    elif use_lemmatization:
        tokens = [lemmatize_marathi(word) for word in tokens]

    return ' '.join(tokens)

# Clean both text columns in place (script filtering and stop-word removal).
df['translated_source'] = df['translated_source'].apply(preprocess_text)
df['translated_plagiarism'] = df['translated_plagiarism'].apply(preprocess_text)

# Stemmed variants used for feature extraction. preprocess_text applies
# stemming only when both flags are set (lemmatization sits in an `elif`), so
# the previously passed use_lemmatization=True had no effect and is dropped to
# keep the call honest. Series.apply forwards the keyword to preprocess_text.
# NOTE(review): if stem-then-lemmatize was intended, preprocess_text must change.
df['stemmed_srcText'] = df['translated_source'].apply(preprocess_text, use_stemming=True)
df['stemmed_plagText'] = df['translated_plagiarism'].apply(preprocess_text, use_stemming=True)

In [None]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from fuzzywuzzy import fuzz
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import cosine
import nltk
from difflib import SequenceMatcher
from gensim.models import KeyedVectors

# Ensure necessary downloads
import gzip
import os
import shutil

import requests

nltk.download('punkt')
# Recent NLTK releases load the word tokenizer's tables from 'punkt_tab'
# rather than 'punkt'; fetch both so word_tokenize works on a fresh environment.
nltk.download('punkt_tab')

# URL for the FastText Marathi embedding file
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.vec.gz"

# Local file names for the compressed download and the decompressed vectors
output_file = "cc.mr.300.vec.gz"
vec_file = "cc.mr.300.vec"

# Skip the download and decompression when a previous run already produced
# the files, so that Restart & Run All stays feasible.
if not os.path.exists(vec_file):
    if not os.path.exists(output_file):
        print("Downloading FastText embeddings...")
        partial_download = output_file + ".part"
        # The context manager releases the streamed connection; the timeout
        # stops a stalled server from hanging the notebook.
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail loudly on a bad status instead of continuing on to
            # decompress a file that was never written.
            response.raise_for_status()
            with open(partial_download, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        # Rename only after a complete write so an interrupted download is
        # never mistaken for a cached file on the next run.
        os.replace(partial_download, output_file)
        print(f"Download completed! File saved as {output_file}")

    partial_vec = vec_file + ".part"
    with gzip.open(output_file, "rb") as f_in:
        with open(partial_vec, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.replace(partial_vec, vec_file)
    print("Decompression completed! File saved as cc.mr.300.vec")

embedding_model = KeyedVectors.load_word2vec_format(vec_file, binary=False)




In [None]:
import spacy
# Word-level semantic similarity from the loaded embedding model
def embedding_similarity(word1, word2):
    """Cosine similarity between the embedding vectors of two words.

    Returns 0.0 when either word is missing from ``embedding_model``.
    """
    # Out-of-vocabulary words contribute no similarity.
    if word1 not in embedding_model or word2 not in embedding_model:
        return 0.0
    # scipy's cosine() is a distance, so subtract it from 1.
    return 1 - cosine(embedding_model[word1], embedding_model[word2])

# Text-level similarity: mean over all cross-text word pairs
def text_embedding_similarity(text1, text2):
    """Average ``embedding_similarity`` over every (word1, word2) pair.

    Both texts are tokenized with ``nltk.word_tokenize``. Pairs involving an
    out-of-vocabulary word count as 0.0 in the average.
    """
    tokens_a = nltk.word_tokenize(text1)
    tokens_b = nltk.word_tokenize(text2)

    running_sum = 0.0
    for token_a in tokens_a:
        for token_b in tokens_b:
            running_sum += embedding_similarity(token_a, token_b)

    pair_count = len(tokens_a) * len(tokens_b)
    # The epsilon keeps the division defined when a text has no tokens.
    return running_sum / (pair_count + 1e-9)

# Word n-gram overlap between two texts
def ngram_overlap(text1, text2, n=2):
    """Ratio of shared to total distinct word n-grams (n defaults to 2)."""
    def ngram_set(text):
        # Distinct n-grams of the text, each rendered as a space-joined string.
        return {" ".join(gram) for gram in nltk.ngrams(nltk.word_tokenize(text), n)}

    first, second = ngram_set(text1), ngram_set(text2)
    shared = first & second
    combined = first | second
    # The epsilon avoids dividing by zero when neither text yields an n-gram.
    return len(shared) / (len(combined) + 1e-9)

# Character-level sequence similarity (difflib ratio)
def levenshtein_distance(text1, text2):
    """Return ``difflib.SequenceMatcher``'s similarity ratio for two strings.

    Despite the name, this is a similarity in [0, 1] (1.0 means identical),
    not an edit distance.
    """
    matcher = SequenceMatcher(None, text1, text2)
    return matcher.ratio()

# Shared tokenization for the bag-of-tokens similarities below
_MR_NLP = None

def _spacy_tokenizer(text):
    """Tokenize ``text`` with a blank Marathi spaCy pipeline, built once."""
    global _MR_NLP
    # Building the pipeline is pure setup; doing it on every similarity call
    # (twice per text pair) was wasted work.
    if _MR_NLP is None:
        _MR_NLP = spacy.blank("mr")
    return [token.text for token in _MR_NLP(text)]

def _binary_token_vectors(text1, text2):
    """Return the two texts as binary token-presence rows, or None if no tokens."""
    vectorizer = CountVectorizer(binary=True, tokenizer=_spacy_tokenizer, token_pattern=None)
    try:
        return vectorizer.fit_transform([text1, text2]).toarray()
    except ValueError:
        # CountVectorizer raises ValueError on an empty vocabulary, which
        # happens when both texts are empty after preprocessing.
        return None

# Compute Jaccard similarity
def jaccard_similarity(text1, text2):
    """Jaccard index of the two texts' token sets; 0.0 when both are empty."""
    text_vectors = _binary_token_vectors(text1, text2)
    if text_vectors is None:
        return 0.0
    return jaccard_score(text_vectors[0], text_vectors[1])

# Compute fuzzy string similarity
def fuzzy_similarity(text1, text2):
    """fuzzywuzzy ``fuzz.ratio`` of the raw strings, rescaled from 0-100 to 0-1."""
    return fuzz.ratio(text1, text2) / 100.0

# Compute cosine similarity (on bag-of-words representation)
def cosine_similarity(text1, text2):
    """Cosine similarity of binary token-presence vectors.

    Returns 0.0 when either text has no tokens, because the cosine of a
    zero vector is undefined.
    """
    text_vectors = _binary_token_vectors(text1, text2)
    if text_vectors is None:
        return 0.0
    # An all-zero row means that text contributed no tokens; scipy would
    # otherwise be asked to divide by a zero norm.
    if not text_vectors[0].any() or not text_vectors[1].any():
        return 0.0
    return 1 - cosine(text_vectors[0], text_vectors[1])

# Build the similarity feature vector for one text pair
def compute_features(text1, text2):
    """Return a 1-D array of six similarity scores for ``(text1, text2)``.

    Column order: word-embedding similarity, n-gram overlap, difflib ratio,
    fuzzy ratio, Jaccard similarity, cosine similarity.
    """
    # The order here fixes the column layout of the feature matrix.
    scorers = (
        text_embedding_similarity,
        ngram_overlap,
        levenshtein_distance,
        fuzzy_similarity,
        jaccard_similarity,
        cosine_similarity,
    )
    return np.array([scorer(text1, text2) for scorer in scorers])


In [None]:
# Pair each stemmed source sentence with its stemmed candidate sentence.
stemmedSrc = df['stemmed_srcText'].to_list()
stemmedPlag = df['stemmed_plagText'].to_list()
text_pairs = list(zip(stemmedSrc, stemmedPlag))
text_pairs[:5]

# One row of similarity features per sentence pair.
feature_matrix = np.array(
    [compute_features(src_text, plag_text) for src_text, plag_text in text_pairs]
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# classification_report and roc_auc_score are used below; importing them in
# this cell keeps it from raising NameError on a fresh top-to-bottom run.
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Labels come from the dataframe's `label` column, aligned with feature_matrix rows
labels=df['label'].to_list()
# Split data into training and testing sets using train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=42)

# Train the classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(train_features, train_labels)

# Predict and evaluate
predictions = classifier.predict(test_features)
# Column 1 of predict_proba is passed to roc_auc_score as the score for AUC
probabilities = classifier.predict_proba(test_features)[:, 1]
print(f"Accuracy: {accuracy_score(test_labels, predictions)}")
# Print classification report (Precision, Recall, F1-score)
print(f"Classification Report:\n{classification_report(test_labels, predictions)}")

# Print AUC-ROC score
print(f"AUC-ROC: {roc_auc_score(test_labels, probabilities)}")

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Targets are read from the dataframe's `label` column.
labels = df['label'].to_list()

# 80/20 train/test partition with a fixed seed.
train_features, test_features, train_labels, test_labels = train_test_split(
    feature_matrix,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit an XGBoost classifier on the training portion.
classifier = xgb.XGBClassifier(random_state=42)
classifier.fit(train_features, train_labels)

# Hard predictions for accuracy and the report; column 1 of predict_proba for AUC.
predictions = classifier.predict(test_features)
probabilities = classifier.predict_proba(test_features)[:, 1]

# Report accuracy, per-class precision/recall/F1, and AUC-ROC.
print(f"Accuracy: {accuracy_score(test_labels, predictions)}")
print(f"Classification Report:\n{classification_report(test_labels, predictions)}")
print(f"AUC-ROC: {roc_auc_score(test_labels, probabilities)}")


In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Targets are read from the dataframe's `label` column.
labels = df['label'].to_list()

# 80/20 train/test partition with a fixed seed.
train_features, test_features, train_labels, test_labels = train_test_split(
    feature_matrix,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit a LightGBM classifier on the training portion.
classifier = lgb.LGBMClassifier(random_state=42)
classifier.fit(train_features, train_labels)

# Hard predictions for accuracy and the report; column 1 of predict_proba for AUC.
predictions = classifier.predict(test_features)
probabilities = classifier.predict_proba(test_features)[:, 1]

# Report accuracy, per-class precision/recall/F1, and AUC-ROC.
print(f"Accuracy: {accuracy_score(test_labels, predictions)}")
print(f"Classification Report:\n{classification_report(test_labels, predictions)}")
print(f"AUC-ROC: {roc_auc_score(test_labels, probabilities)}")


In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.naive_bayes import GaussianNB

# Rebuild the seeded 80/20 split here, as the other model cells do, so this
# cell does not depend on variables and imports left behind by an earlier cell.
labels = df['label'].to_list()
train_features, test_features, train_labels, test_labels = train_test_split(
    feature_matrix, labels, test_size=0.2, random_state=42)

# Train the Gaussian Naive Bayes classifier
classifier = GaussianNB()
classifier.fit(train_features, train_labels)

# Predict and evaluate
predictions = classifier.predict(test_features)
# Column 1 of predict_proba is passed to roc_auc_score as the score for AUC
probabilities = classifier.predict_proba(test_features)[:, 1]

# Print accuracy
print(f"Accuracy: {accuracy_score(test_labels, predictions)}")

# Print classification report (Precision, Recall, F1-score)
print(f"Classification Report:\n{classification_report(test_labels, predictions)}")

# Print AUC-ROC score
print(f"AUC-ROC: {roc_auc_score(test_labels, probabilities)}")