In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
file_path = "QS World University Rankings 2025.csv"  # Replace this with the dataset's path
try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    print("Dataset file not found. Please make sure it is in the same directory as this notebook.")

# Clean dataset
data_cleaned = data.iloc[3:]
data_cleaned.columns = data.iloc[2]
data_cleaned = data_cleaned.rename(columns=lambda x: x.strip() if isinstance(x, str) else x)
data_cleaned.reset_index(drop=True, inplace=True)

columns_to_keep = [
    "rank display", "institution", "location", "size", "focus", "research",
    "irn score", "ger score", "SUS SCORE", "Overall Score"
]
data_cleaned = data_cleaned[columns_to_keep]
data_cleaned = data_cleaned.dropna(how="all")

numerical_columns = ["irn score", "ger score", "SUS SCORE", "Overall Score"]
data_cleaned[numerical_columns] = data_cleaned[numerical_columns].apply(pd.to_numeric, errors="coerce")

# Visualization 1: Top 10 Universities by Overall Score
top_10 = data_cleaned.nlargest(10, "Overall Score")
plt.figure(figsize=(12, 6))
sns.barplot(x="Overall Score", y="institution", data=top_10, palette="viridis")
plt.title("Top 10 Universities by Overall Score", fontsize=14)
plt.xlabel("Overall Score", fontsize=12)
plt.ylabel("Institution", fontsize=12)
plt.show()

# Visualization 2: Distribution of Sustainability Scores (SUS SCORE)
plt.figure(figsize=(10, 6))
sns.histplot(data_cleaned["SUS SCORE"].dropna(), kde=True, color="blue", bins=20)
plt.title("Distribution of Sustainability Scores (SUS SCORE)", fontsize=14)
plt.xlabel("SUS SCORE", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.show()

# Visualization 3: Average IRN Score by Region
average_irn_by_region = data_cleaned.groupby("location")["irn score"].mean().sort_values(ascending=False)
plt.figure(figsize=(12, 6))
average_irn_by_region.plot(kind="bar", color="teal")
plt.title("Average IRN Score by Region", fontsize=14)
plt.xlabel("Region", fontsize=12)
plt.ylabel("Average IRN Score", fontsize=12)
plt.xticks(rotation=45)
plt.show()

# Insights:
print("\nInsights:")
print("1. The top 10 universities by overall score are dominated by well-known institutions, with MIT ranked first.")
print("2. The distribution of sustainability scores shows that most universities have scores above 80, indicating high sustainability awareness.")
print("3. Regions such as North America and Europe lead in average IRN scores, reflecting strong international research networks.")
