In [1]:
import openpyxl
import csv
import os
import pandas as pd 
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import spacy
import matplotlib.pyplot as plt
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from io import BytesIO
import fitz 
from spacy import displacy
from reportlab.lib.utils import ImageReader
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPDF
!pip install svglib
!pip install reportlab
nltk.download('vader_lexicon')


def excel_to_csv(input_file, output_file):
    """Convert the active sheet of an Excel workbook to a CSV file.

    The workbook is opened in openpyxl's read-only mode and its rows are
    written to the CSV one at a time, so the sheet is not loaded into memory
    as a whole.

    Args:
        input_file: Path of the Excel workbook to read.
        output_file: Path of the CSV file to write (truncated if it exists).

    Returns:
        None. Failures are reported with ``print`` instead of being raised:
        a missing input file returns early, and any exception raised during
        the conversion is caught and printed.
    """
    if not os.path.exists(input_file):
        print(f"Error: {input_file} does not exist.")
        return

    # Bind the name before the ``try`` so ``finally`` can tell whether the
    # workbook was ever opened. Without this, a failed load raised
    # UnboundLocalError from ``finally`` and hid the error printed above it.
    workbook = None
    try:
        workbook = openpyxl.load_workbook(input_file, read_only=True)
        sheet = workbook.active

        # Explicit UTF-8 so non-ASCII cell text is written the same way on
        # every platform instead of depending on the locale's default encoding.
        with open(output_file, "w", newline="", encoding="utf-8") as csv_file:
            writer = csv.writer(csv_file)
            for row in sheet.iter_rows(values_only=True):
                writer.writerow(row)
    except Exception as e:
        print(f"Error reading {input_file}: {e}")
    finally:
        if workbook is not None:
            workbook.close()

# Produce the CSV copy of the Task 1 workbook that the next cell reads in chunks.
excel_to_csv(
    input_file="AI_Engineer_Dataset_Task_1.xlsx",
    output_file="AI_Engineer_Dataset_Task_1.csv",
)




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/abu/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# The course table is read once and merged into every response chunk below.
df_courses = pd.read_excel("AI_Engineer_Dataset_Task_2.xlsx")

# Number of CSV rows parsed per chunk; adjust to the memory available.
chunk_size = 100000

# Numerical score for each ParticipantResponse value. Values that are not a
# key of this mapping end up as NaN in the 'Score' column.
response_mapping = {
    'Strongly Disagree': 1,
    'Disagree': 2,
    'Neutral': 3,
    'Agree': 4,
    'Strongly Agree': 5,
    'no': 3  # Assuming "no" is a neutral response.
}

# Columns to read from the CSV, all parsed as strings to keep memory usage low.
col_types = {
    'CourseCode': str,
    'CourseName': str,
    'ParticipantResponse': str  # Assuming this is the column you're mapping
}

# Collect the processed chunks and concatenate them once at the end.
# Re-concatenating a growing frame on every iteration copies all rows read so
# far each time, and seeding it with an empty DataFrame relies on concat
# behaviour that pandas has deprecated.
merged_chunks = []
for chunk in pd.read_csv(
    "AI_Engineer_Dataset_Task_1.csv",
    dtype=col_types,
    usecols=list(col_types.keys()),
    chunksize=chunk_size,
):
    chunk['Score'] = chunk['ParticipantResponse'].map(response_mapping)
    merged_chunks.append(
        chunk.merge(df_courses, on=['CourseCode', 'CourseName'], how='left')
    )

# pd.concat raises on an empty list, so fall back to an empty frame when the
# CSV produced no chunks.
if merged_chunks:
    df_responses = pd.concat(merged_chunks, ignore_index=True)
else:
    df_responses = pd.DataFrame()
    


In [3]:
# Rows whose response is not a string (for example NaN from an empty cell).
# Kept for ad-hoc inspection; display `non_string_rows` to look at them.
non_string_rows = df_responses[df_responses['ParticipantResponse'].apply(lambda x: not isinstance(x, str))]

# Option 1: Fill NaN values with a default string.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: that chained in-place form is deprecated and does not
# update the DataFrame when pandas Copy-on-Write is enabled.
df_responses['ParticipantResponse'] = df_responses['ParticipantResponse'].fillna('No Response')

# Option 2 (alternative to Option 1, if we prefer to drop rows):
# df_responses = df_responses.dropna(subset=['ParticipantResponse'])


In [4]:
# Initialize spaCy model
# Loaded once at module level: preprocess_texts() below and the report cell
# both use this global `nlp` object.
nlp = spacy.load('en_core_web_sm')

def preprocess_texts(texts):
    """Lemmatize texts, keeping only alphabetic tokens that are not stop words.

    Args:
        texts: Iterable of strings, processed with the module-level ``nlp``
            pipeline in batches of 500.

    Returns:
        A list with one string per input text, in input order: the lemmas of
        the retained tokens joined by single spaces (an empty string when no
        token is retained).
    """
    # Only lemmas and the is_stop / is_alpha flags are read below, so the
    # dependency parser and entity recognizer are skipped for this pass; they
    # are the expensive components and do not influence those attributes.
    docs = nlp.pipe(texts, batch_size=500, disable=["parser", "ner"])
    return [
        " ".join(
            token.lemma_
            for token in doc
            if not token.is_stop and token.is_alpha
        )
        for doc in docs
    ]

# Lemmatized, stop-word-free version of every response.
df_responses['ProcessedResponse'] = preprocess_texts(df_responses['ParticipantResponse'])

# VADER compound score per processed response.
# NOTE(review): scoring runs on the stop-word-stripped text; if spaCy's stop
# list contains negations such as "not", their effect on the score is lost —
# confirm this is intended.
sia = SentimentIntensityAnalyzer()
df_responses['Sentiment'] = df_responses['ProcessedResponse'].apply(
    lambda text: sia.polarity_scores(text)['compound']
)

# Persist the enriched frame; the reporting cell reloads it from this file.
df_responses.to_csv("processed_responses.csv", index=False)

# TF-IDF features over the processed responses.
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = vectorizer.fit_transform(df_responses['ProcessedResponse'])

# Topic model: Non-Negative Matrix Factorization with a fixed seed.
n_topics = 5
nmf = NMF(n_components=n_topics, random_state=42)
nmf.fit(tfidf)
feature_names = vectorizer.get_feature_names_out()

# Print the ten highest-weighted terms of each topic. argsort is ascending,
# so the terms appear from lowest to highest weight within that top ten.
for topic_number, weights in enumerate(nmf.components_, start=1):
    top_term_ids = weights.argsort()[-10:]
    print(f"Topic #{topic_number}:")
    print(" ".join(feature_names[term_id] for term_id in top_term_ids))
    print("\n")



Topic #1:
الدكتور method الشرح شرحه وايد مفهوم number statement agree strongly


Topic #2:
مفهوم وايد write right far evaluate teaching method survey disagree


Topic #3:
شافي كل لدي المساق تعليق توجد شكرا يوجد لا neutral


Topic #4:
instructor semester number statement write right far evaluate survey agree


Topic #5:
شافي كل لدي المساق تعليق توجد شكرا يوجد لا response




In [5]:
# Reload the frame that the sentiment cell wrote to disk.
df_responses = pd.read_csv("processed_responses.csv")

# Mean sentiment per course, computed once and ranked in both directions.
course_sentiment_mean = df_responses.groupby('CourseName')['Sentiment'].mean()
best_courses = course_sentiment_mean.sort_values(ascending=False).head(5)
worst_courses = course_sentiment_mean.sort_values().head(5)

# Mean sentiment per college and per degree, highest first.
college_feedback = (
    df_responses.groupby('College')['Sentiment']
    .mean()
    .sort_values(ascending=False)
)
degree_feedback = (
    df_responses.groupby('DegreeName')['Sentiment']
    .mean()
    .sort_values(ascending=False)
)

# Create a PDF report
pdf_file = "sentiment_analysis_report.pdf"
page_width, page_height = letter  # page size in points
top_margin = 50
bottom_margin = 60

c = canvas.Canvas(pdf_file, pagesize=letter)
c.setFont("Helvetica", 12)  # Title font size
# Position the title relative to the page height. The previous fixed y=800
# lies above the top edge of a letter page (792 pt), so the title was never
# visible.
c.drawString(100, page_height - top_margin, "Sentiment Analysis Report")

# Directory for the intermediate SVG renderings; created once up front.
os.makedirs("feedback_images", exist_ok=True)

# Add results to the PDF
y_position = page_height - top_margin - 50
for section_title, data in [("Courses with Best Feedback", best_courses),
                            ("Courses with Worst Feedback", worst_courses),
                            ("Feedback by College", college_feedback),
                            ("Feedback by Degree", degree_feedback)]:
    # Start a new page when the heading plus one entry would not fit, so a
    # heading is never left alone at the bottom of a page.
    if y_position < bottom_margin + 45:
        c.showPage()
        y_position = page_height - top_margin

    c.setFont("Helvetica", 10)
    c.drawString(100, y_position, section_title + ":")
    y_position -= 20

    for idx, (label, value) in enumerate(data.items(), start=1):
        # Continue on a fresh page instead of drawing below the page edge.
        if y_position < bottom_margin:
            c.showPage()
            c.setFont("Helvetica", 10)  # font settings do not carry over to a new page
            y_position = page_height - top_margin

        c.drawString(120, y_position, f"{idx}. {label}: {value:.2f}")

        # Dependency-parse visualization of the group label (course, college
        # or degree name). str() guards against non-string group keys, which
        # nlp() cannot process.
        doc = nlp(str(label))

        # Save the visualization as an SVG image file
        image_file = f"feedback_images/feedback_{idx - 1}.svg"
        svg = displacy.render(doc, style="dep", jupyter=False, options={'compact': True})
        with open(image_file, "w", encoding="utf-8") as f:
            f.write(svg)

        # Convert the SVG to a ReportLab drawing and place it on the same line
        # as the entry text. svg2rlg returns None when it cannot load the file,
        # in which case the picture is skipped and the text line is kept.
        drawing = svg2rlg(image_file)
        if drawing is not None:
            drawing.scale(0.2, 0.2)
            renderPDF.draw(drawing, c, 400, y_position)

        y_position -= 25

# Save the PDF
c.save()

# Histogram of the sentiment scores, written to its own PDF file.
fig, ax = plt.subplots()
ax.hist(df_responses['Sentiment'], bins=30, alpha=0.75)
ax.set_title('Distribution of Sentiment Scores')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Number of Responses')

plot_pdf_file = "sentiment_distribution.pdf"
fig.savefig(plot_pdf_file, format='pdf')
plt.close(fig)  # release the figure once it is on disk

# Combine the main PDF report and the plot PDF using PyMuPDF.
# The documents are opened as context managers so all three are closed (and
# their file handles released) even if inserting or saving fails; previously
# none of them was ever closed.
with fitz.open(pdf_file) as main_pdf, \
        fitz.open(plot_pdf_file) as plot_pdf, \
        fitz.open() as output_pdf:  # fitz.open() with no argument: new empty PDF
    output_pdf.insert_pdf(main_pdf)
    output_pdf.insert_pdf(plot_pdf)

    output_pdf.save("combined_report.pdf")
