In [None]:
!pip install openpyxl pandas nltk scikit-learn spacy matplotlib PyPDF2 PyMuPDF

In [None]:
import nltk

# Fetch the NLTK data packages one after another.
for resource_name in ('vader_lexicon', 'stopwords', 'wordnet'):
    nltk.download(resource_name)
# Left as the cell's final expression so its return value is still echoed.
nltk.download('punkt')

In [None]:
import csv
import os

import fitz
import matplotlib.pyplot as plt
import openpyxl
import pandas as pd
import spacy
from nltk.sentiment import SentimentIntensityAnalyzer
from reportlab.graphics import renderPDF
from reportlab.lib.pagesizes import letter
from reportlab.lib.utils import simpleSplit
from reportlab.pdfgen import canvas
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy import displacy
from svglib.svglib import svg2rlg

# Function to convert Excel to CSV
def excel_to_csv(input_file, output_file):
    """Write the active sheet of an Excel workbook to a CSV file.

    Args:
        input_file: Path of the workbook to read.
        output_file: Path of the CSV file to create (overwritten if present).

    Returns:
        None. A missing input file or any error raised while reading or
        writing is reported with ``print`` instead of being raised.
    """
    if not os.path.exists(input_file):
        print(f"Error: {input_file} does not exist.")
        return

    # Bound before the try so the finally clause is safe even when
    # load_workbook itself raises and never assigns the name.
    workbook = None
    try:
        workbook = openpyxl.load_workbook(input_file, read_only=True)
        sheet = workbook.active

        # Explicit UTF-8 keeps the file readable by pandas.read_csv, whose
        # default decoding is UTF-8 regardless of the platform locale.
        with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
            writer = csv.writer(csv_file)
            for row in sheet.iter_rows(values_only=True):
                writer.writerow(row)
    except Exception as e:
        print(f"Error reading {input_file}: {e}")
    finally:
        if workbook is not None:
            workbook.close()

# Produce the CSV copy of the Task 1 workbook.
excel_to_csv(
    input_file="AI_Engineer_Dataset_Task_1.xlsx",
    output_file="AI_Engineer_Dataset_Task_1.csv",
)


In [None]:
# Load the second Excel file
df_courses = pd.read_excel("AI_Engineer_Dataset_Task_2.xlsx")

# Define chunk size for reading large datasets
chunk_size = 100000

# Mapping for ParticipantResponse. Values missing from this mapping get a NaN
# 'Score' from Series.map.
# NOTE(review): 'no' is scored 3, the same as 'Neutral' - presumably a
# non-answer; confirm this is intended.
response_mapping = {
    'Strongly Disagree': 1,
    'Disagree': 2,
    'Neutral': 3,
    'Agree': 4,
    'Strongly Agree': 5,
    'no': 3
}

# Define column data types for memory efficiency; only these columns are read.
col_types = {
    'CourseCode': str,
    'CourseName': str,
    'ParticipantResponse': str
}

# Read and process the CSV in chunks. Processed chunks are collected in a list
# and concatenated once at the end; concatenating inside the loop would re-copy
# all previously accumulated rows on every iteration.
# NOTE(review): the merge keys are read as str here; assumes df_courses holds
# 'CourseCode'/'CourseName' as strings too - confirm.
processed_chunks = []
for chunk in pd.read_csv("AI_Engineer_Dataset_Task_1.csv", dtype=col_types, usecols=list(col_types.keys()), chunksize=chunk_size):
    chunk['Score'] = chunk['ParticipantResponse'].map(response_mapping)
    processed_chunks.append(chunk.merge(df_courses, on=['CourseCode', 'CourseName'], how='left'))

# pd.concat rejects an empty list, so fall back to an empty frame when the CSV
# yielded no chunks.
if processed_chunks:
    df_responses = pd.concat(processed_chunks, ignore_index=True)
else:
    df_responses = pd.DataFrame()

# Now, df_responses contains the scored responses joined with the course data


In [None]:
# Part 3: Handle non-string and missing values

# Rows whose 'ParticipantResponse' is not a Python str. Nothing below uses this
# frame; it is kept so the affected rows can be inspected.
non_string_rows = df_responses[df_responses['ParticipantResponse'].apply(lambda x: not isinstance(x, str))]

# Option 1: Fill NaN values with a default string.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df[col] is chained assignment, which pandas warns about and which leaves the
# frame unchanged when Copy-on-Write is enabled.
df_responses['ParticipantResponse'] = df_responses['ParticipantResponse'].fillna('No Response')

# Option 2 (alternative to Option 1, if you prefer to drop rows):
# df_responses = df_responses.dropna(subset=['ParticipantResponse'])

# Now, df_responses has no missing 'ParticipantResponse' values


In [None]:
# Part 4: Data Preprocessing, Sentiment Analysis, and Topic Modeling

# Initialize spaCy model
nlp = spacy.load('en_core_web_sm')

def preprocess_texts(texts):
    """Reduce each text to its space-joined lemmas.

    Every text is run through the module-level spaCy pipeline ``nlp`` in
    batches of 500. Only tokens that are alphabetic and not stop words are
    kept, and their lemmas are joined with single spaces.

    Args:
        texts: Iterable of strings to process.

    Returns:
        A list with one processed string per input text, in input order.
    """
    cleaned = []
    for doc in nlp.pipe(texts, batch_size=500):
        kept_lemmas = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        cleaned.append(" ".join(kept_lemmas))
    return cleaned

# Process 'ParticipantResponse' column
df_responses['ProcessedResponse'] = preprocess_texts(df_responses['ParticipantResponse'])

# Sentiment Analysis using NLTK's VADER sentiment intensity analyzer.
# The 'compound' score is computed on the processed (lemmatized, stop-word
# stripped) text.
# NOTE(review): stop-word removal presumably drops negations such as "not"
# before VADER sees the text - confirm this is the intended input.
sia = SentimentIntensityAnalyzer()
df_responses['Sentiment'] = df_responses['ProcessedResponse'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Persist the scored responses: the report cell loads "processed_responses.csv",
# so the file has to exist for a clean top-to-bottom run.
df_responses.to_csv("processed_responses.csv", index=False)

# Feature Extraction using TF-IDF
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = vectorizer.fit_transform(df_responses['ProcessedResponse'])

# Topic Modeling using NMF (Non-Negative Matrix Factorization)
n_topics = 5  # You can change this number based on your specific needs
nmf = NMF(n_components=n_topics, random_state=42).fit(tfidf)
feature_names = vectorizer.get_feature_names_out()

# Display topics. argsort is ascending, so the ten highest-weight terms are
# printed with the strongest term last.
for topic_idx, topic in enumerate(nmf.components_):
    print(f"Topic #{topic_idx + 1}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[-10:]]))
    print("\n")

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.graphics import renderPDF
from svglib.svglib import svg2rlg
import matplotlib.pyplot as plt
import fitz 

# Reload the scored responses from disk.
df_responses = pd.read_csv("processed_responses.csv")

# Initialize variables
pdf_file = "sentiment_analysis_report.pdf"
c = canvas.Canvas(pdf_file, pagesize=letter)
c.setFont("Helvetica", 12)  # Set initial font size to 12

# Vertical layout in points; a letter page is 612 x 792 pt.
page_top_y = 750     # first text baseline below the title
page_bottom_y = 50   # lowest baseline allowed before starting a new page

# Add Title. The baseline must be below the 792 pt page height to be visible.
c.drawString(100, 770, "Sentiment Analysis Report")

# Mean sentiment per group; the per-course means are computed once and reused.
course_sentiment = df_responses.groupby('CourseName')['Sentiment'].mean()
best_courses = course_sentiment.sort_values(ascending=False).head(5)
worst_courses = course_sentiment.sort_values().head(5)
college_feedback = df_responses.groupby('College')['Sentiment'].mean().sort_values(ascending=False)
degree_feedback = df_responses.groupby('DegreeName')['Sentiment'].mean().sort_values(ascending=False)

# Directory for the intermediate SVG files (created once, not per row).
os.makedirs("feedback_images", exist_ok=True)

# Add results to the PDF
y_position = page_top_y
for section_title, data in [("Courses with Best Feedback", best_courses),
                            ("Courses with Worst Feedback", worst_courses),
                            ("Feedback by College", college_feedback),
                            ("Feedback by Degree", degree_feedback)]:
    # Start a new page when the heading plus one entry (20 + 25 pt) won't fit.
    if y_position - 45 < page_bottom_y:
        c.showPage()
        y_position = page_top_y
    c.setFont("Helvetica", 10)
    c.drawString(100, y_position, section_title + ":")
    y_position -= 20
    for idx, (label, value) in enumerate(data.items(), start=1):
        if y_position < page_bottom_y:
            c.showPage()
            c.setFont("Helvetica", 10)  # showPage resets the canvas font
            y_position = page_top_y
        c.drawString(120, y_position, f"{idx}. {label}: {value:.2f}")
        y_position -= 15

        # Dependency-parse the group label (course, college or degree name)
        # and render it with displacy.
        doc = nlp(str(label))

        # Save the visualization as an SVG image file. The index restarts in
        # every section, so files are overwritten; each is read back right
        # after being written.
        image_file = f"feedback_images/feedback_{idx - 1}.svg"
        svg = displacy.render(doc, style="dep", jupyter=False, options={'compact': True})
        with open(image_file, "w", encoding="utf-8") as svg_file:
            svg_file.write(svg)

        # Embed the SVG next to the text line; skip it if svglib could not
        # turn the file into a drawing.
        drawing = svg2rlg(image_file)
        if drawing is not None:
            drawing.scale(0.2, 0.2)
            renderPDF.draw(drawing, c, 400, y_position + 15)

        y_position -= 10

c.showPage()

# Add section for NLP Techniques
y_position = 750  # Starting Y-position
c.setFont("Helvetica", 8)
c.drawString(100, y_position, "NLP Techniques Used:")
y_position -= 20
nlp_techniques = [
    "Data Preprocessing: Lemmatization, removal of stop words, and filtering to alphabetic tokens.",
    "Sentiment Analysis: Used VADER sentiment analyzer to evaluate the polarity of the processed text.",
    "Feature Extraction: Employed TF-IDF to transform the text data into feature vectors.",
    "Topic Modeling: Applied NMF for extracting the underlying topics."
]

# The 8 pt font set above is still active for these lines.
for technique in nlp_techniques:
    c.drawString(120, y_position, technique)
    y_position -= 20

# Add a space between sections
y_position -= 20

# Add section for Findings
c.setFont("Helvetica", 10)
c.drawString(100, y_position, "Findings:")
y_position -= 20
# NOTE(review): these sentences are hard-coded and are not derived from the
# computed aggregates - replace the placeholders with the actual results.
findings = [
    "Best and Worst Courses: 'Course A' received the highest sentiment score, suggesting positive feedback.",
    "Feedback by College: 'College X' had the most positive feedback, whereas 'College Y' had the most negative.",
    "Feedback by Degree: Feedback for 'Master's Degree' courses was generally more positive than that for 'Bachelor's Degree' courses."
]

# Wrap each finding to the printable width (page width minus the 120 pt indent
# and a 40 pt right margin) so long sentences are not clipped at the page edge.
findings_max_width = letter[0] - 120 - 40
for finding in findings:
    for wrapped_line in simpleSplit(finding, "Helvetica", 10, findings_max_width):
        c.drawString(120, y_position, wrapped_line)
        y_position -= 12
    # 12 + 8 keeps the 20 pt step for findings that fit on one line.
    y_position -= 8

# Save the PDF
c.save()

# Plot the distribution of sentiment scores and write it to its own PDF file.
fig, ax = plt.subplots()
ax.hist(df_responses['Sentiment'], bins=30, alpha=0.75)
ax.set_title('Distribution of Sentiment Scores')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Number of Responses')
plot_pdf_file = "sentiment_distribution.pdf"
fig.savefig(plot_pdf_file, format='pdf')
plt.close(fig)

# Combine the main PDF report and the plot PDF using PyMuPDF.
# Every document is closed afterwards, even if merging or saving fails, so the
# files are not left open.
main_pdf = fitz.open(pdf_file)
try:
    plot_pdf = fitz.open(plot_pdf_file)
    try:
        output_pdf = fitz.open()  # new, empty PDF
        try:
            output_pdf.insert_pdf(main_pdf)
            output_pdf.insert_pdf(plot_pdf)
            output_pdf.save("combined_report.pdf")
        finally:
            output_pdf.close()
    finally:
        plot_pdf.close()
finally:
    main_pdf.close()
