In [16]:
!pip install nltk



In [19]:
import nltk
import difflib
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from google.colab import files
import re

# Ensure necessary NLTK resources are downloaded
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [25]:

# ============================
# Helper Functions
# ============================

def read_file(file_path):
    """
    Reads the content of a file and returns it as a string.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except UnicodeDecodeError:
        with open(file_path, "r", encoding="latin-1") as file:
            return file.read()

def calculate_similarity(text1, text2):
    """
    Computes similarity between two texts using TF-IDF and cosine similarity.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return round(similarity[0][0] * 100, 2)

def detect_plagiarism(document, references):
    """
    Detects plagiarism by comparing the document against reference texts.
    Returns plagiarized sentences and their similarity scores.
    """
    sentences = nltk.sent_tokenize(document)
    plagiarized_sentences = []

    for sentence in sentences:
        for reference in references:
            similarity = calculate_similarity(sentence, reference)
            if similarity > 70:  # Threshold for plagiarism
                plagiarized_sentences.append((sentence, similarity))
                break

    return plagiarized_sentences

def analyze_sentiment(text):
    """
    Analyzes sentiment using TextBlob.
    Returns polarity, subjectivity, and overall sentiment.
    """
    blob = TextBlob(text)
    polarity = round(blob.sentiment.polarity, 2)
    subjectivity = round(blob.sentiment.subjectivity, 2)
    sentiment = "Positive" if polarity > 0 else "Negative" if polarity < 0 else "Neutral"
    return {"Polarity": polarity, "Subjectivity": subjectivity, "Sentiment": sentiment}

def assign_grade(similarity_score):
    """
    Assigns a grade based on similarity score.
    """
    if similarity_score < 30:
        return "Grade A (Highly Original)"
    elif 30 <= similarity_score < 60:
        return "Grade B (Fairly Original)"
    elif 60 <= similarity_score < 80:
        return "Grade C (Moderate Similarity)"
    else:
        return "Grade D (High Similarity)"

def analyze_writing_style(text):
    """
    Analyzes sentence length, vocabulary richness, and part-of-speech distribution.
    """
    words = nltk.word_tokenize(text)
    sentences = nltk.sent_tokenize(text)
    pos_tags = nltk.pos_tag(words)
    word_count = len(words)
    sentence_count = len(sentences)
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
    vocabulary_richness = len(set(words)) / word_count if word_count > 0 else 0
    pos_distribution = Counter(tag for _, tag in pos_tags)

    return {
        "Word Count": word_count,
        "Sentence Count": sentence_count,
        "Average Sentence Length": round(avg_sentence_length, 2),
        "Vocabulary Richness": round(vocabulary_richness, 2),
        "POS Distribution": pos_distribution
    }

def extract_topics(text, num_topics=2, num_words=5):
    """
    Performs topic modeling using Latent Dirichlet Allocation (LDA).
    Extracts main topics from the text.
    """
    vectorizer = CountVectorizer(stop_words="english")
    doc_term_matrix = vectorizer.fit_transform([text])
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(doc_term_matrix)
    topics = lda.components_
    feature_names = vectorizer.get_feature_names_out()

    extracted_topics = []
    for topic_idx, topic in enumerate(topics):
        top_words = [feature_names[i] for i in topic.argsort()[:-num_words - 1:-1]]
        extracted_topics.append(f"Topic {topic_idx + 1}: " + ", ".join(top_words))

    return extracted_topics

def generate_feedback(grade, sentiment, writing_style):
    """
    Generates short, point-wise feedback based on analysis.
    """
    grade_feedback = {
        "Grade A (Highly Original)": "✔ Original content with excellent creativity.",
        "Grade B (Fairly Original)": "✔ Good effort, but some areas need refinement.",
        "Grade C (Moderate Similarity)": "⚠ Noticeable similarities detected. Revise overlapping sections.",
        "Grade D (High Similarity)": "❌ High similarity. Major revisions are required."
    }

    sentiment_feedback = {
        "Positive": "✔ Positive tone adds an engaging feel to the document.",
        "Neutral": "⚠ Neutral tone. Adding emotional depth can enhance readability.",
        "Negative": "❌ Negative tone. Consider making the text more constructive."
    }

    style_feedback = [
        f"✔ Average sentence length: {writing_style['Average Sentence Length']} words.",
        f"✔ Vocabulary richness: {round(writing_style['Vocabulary Richness'] * 100, 2)}% unique words.",
        "⚠ Simplify complex sentences." if writing_style['Average Sentence Length'] > 20 else "✔ Sentence structure is clear and concise."
    ]

    # Combine feedback in a short and point-wise format
    feedback = [
        grade_feedback[grade],
        sentiment_feedback[sentiment["Sentiment"]],
        *style_feedback
    ]

    return "\n".join(feedback)

# ============================
# Main Program
# ============================
def main():
    print("Please upload a document for analysis:")
    uploaded_file = files.upload()

    if uploaded_file:
        # Read uploaded file
        file_name = list(uploaded_file.keys())[0]
        file_content = read_file(file_name)

        # Reference content for comparison
        reference_texts = [
            "Artificial intelligence is a fascinating field of study.",
            "Machine learning is a subset of AI that focuses on algorithms."
        ]

        # Plagiarism Detection
        plagiarized_sentences = detect_plagiarism(file_content, reference_texts)
        if plagiarized_sentences:
            print("\nPlagiarism Detected!")
            for sentence, similarity in plagiarized_sentences:
                print(f"Plagiarized: '{sentence}' (Similarity: {similarity}%)")
        else:
            print("\nNo Plagiarism Detected!")

        # Similarity Score
        similarity_score = calculate_similarity(file_content, reference_texts[0])
        print(f"\nSimilarity Score: {similarity_score}%")

        # Grading
        grade = assign_grade(similarity_score)
        print(f"Grade: {grade}")

        # Sentiment Analysis
        sentiment = analyze_sentiment(file_content)
        print(f"\nSentiment Analysis:\n  Polarity: {sentiment['Polarity']}\n"
              f"  Subjectivity: {sentiment['Subjectivity']}\n"
              f"  Sentiment: {sentiment['Sentiment']}")

        # Writing Style Analysis
        writing_style = analyze_writing_style(file_content)
        print(f"\nWriting Style Analysis:\n  Word Count: {writing_style['Word Count']}\n"
              f"  Sentence Count: {writing_style['Sentence Count']}\n"
              f"  Average Sentence Length: {writing_style['Average Sentence Length']} words\n"
              f"  Vocabulary Richness: {writing_style['Vocabulary Richness']}\n")

        # Topic Modeling
        topics = extract_topics(file_content)
        print("\nExtracted Topics:")
        for topic in topics:
            print(topic)

        # Feedback
        feedback = generate_feedback(grade, sentiment, writing_style)
        print(f"\nFeedback:\n{feedback}")


# Run the Program
if __name__ == "__main__":
    main()


Please upload a document for analysis:


Saving report file.pdf to report file (1).pdf

No Plagiarism Detected!

Similarity Score: 0.22%
Grade: Grade A (Highly Original)

Sentiment Analysis:
  Polarity: 0.74
  Subjectivity: 1.0
  Sentiment: Positive

Writing Style Analysis:
  Word Count: 46026
  Sentence Count: 199
  Average Sentence Length: 231.29 words
  Vocabulary Richness: 0.47


Extracted Topics:
Topic 1: endobj, 00000, obj, 500, type
Topic 2: endobj, 00000, obj, 500, type

Feedback:
✔ Original content with excellent creativity.
✔ Positive tone adds an engaging feel to the document.
✔ Average sentence length: 231.29 words.
✔ Vocabulary richness: 47.0% unique words.
⚠ Simplify complex sentences.
