# Big Data Project - Student City Scorer

Student City Scorer is a digital tool designed to help prospective university students choose the best city to study in. By comparing cities across multiple dimensions — university quality, affordability, safety, and weather — the platform delivers a personalized and data-driven recommendation experience.

In [1]:
#Load Datasets
import pandas as pd

# University Rank dataset
uni_rank = pd.read_csv('Datasets/UniRanks.csv')
# To get only the last year available
uni_rank = uni_rank[uni_rank['year'] == 2015]
uni_rank = uni_rank.reset_index(drop=True)

# Cost of living dataset
cost_of_living = pd.read_csv('Datasets/cost-of-living_V2.csv')
unique_cities = cost_of_living['city'].unique()

# Safety dataset
safety = pd.read_csv('Datasets/Quality_of_life.csv')
safety['country'] = safety['country'].replace('United States', 'USA')

## The code beneath does not need to be run again as the runtime is very long. The code below was used to retrieve all the cities for our dataset. The created dataset was then saved as a new csv file.


In [2]:
# # University ranking dataset does not have the city of the uni but only the country, we are going to use an API to extract the address of each university and add the city into a new column.

# # Extract the cities from the cost_of_living dataset, to get a list of key words to look for in the address we get from the API and make sure the datasets are comparable.
# unique_cities = cost_of_living['city'].unique()
# print(unique_cities)

# from geopy.geocoders import Nominatim
# import time
# import requests

# # Initialize geolocator with a custom user agent
# geolocator = Nominatim(user_agent="uni_city_locator_v3")

# # List of city keywords to look for in the address (optional, for better matching)
# city_keywords = unique_cities

# # Function to get city from the university name and country
# def get_city_from_address(institution, country, retries=3, delay=10):
#     attempt = 0
#     while attempt < retries:
#         try:
#             # Concatenate institution and country for better matching
#             location = geolocator.geocode(f"{institution}, {country}")
#             if location:
#                 address = location.address
#                 # Check for city keyword in address
#                 for city in city_keywords:
#                     if city.lower() in address.lower():  # case-insensitive match
#                         return city
#             return None
#         except Exception as e:
#             print(f"Error with {institution}: {e}. Retrying {attempt+1}/{retries}...")
#             attempt += 1
#             time.sleep(delay)  # Delay between retries
#     return None  # Return None if max retries are exhausted

# # Apply the function to each row in your DataFrame
# uni_rank['City'] = uni_rank.apply(lambda row: get_city_from_address(row['institution'], row['country']), axis=1)

# # Print the updated dataframe with the 'City' column
# print(uni_rank.head())

In [3]:
#uni_rank

In [4]:
# So we dont have to run the code above every time, we'll save the new dataset
# uni_rank.to_csv('uni_rank2.csv', index=False)

In [5]:
import pandas as pd
# Load the new dataset created above
uni_rank = pd.read_csv('Datasets/uni_rank2.csv')
uni_rank

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year,City
0,1,Harvard University,USA,1,1,1,1,1,1,1,1.0,3,100.00,2015,Riverside
1,2,Stanford University,USA,2,9,2,4,5,3,3,4.0,10,98.66,2015,Cali
2,3,Massachusetts Institute of Technology,USA,3,3,11,2,15,2,2,2.0,1,97.54,2015,Itu
3,4,University of Cambridge,United Kingdom,1,2,10,5,11,6,12,13.0,48,96.81,2015,Homs
4,5,University of Oxford,United Kingdom,2,7,13,10,7,12,7,9.0,15,96.46,2015,Oxford
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,University of the Algarve,Portugal,7,367,567,218,926,845,812,969.0,816,44.03,2015,
996,997,Alexandria University,Egypt,4,236,566,218,997,908,645,981.0,871,44.03,2015,
997,998,Federal University of Ceará,Brazil,18,367,549,218,830,823,812,975.0,824,44.03,2015,Jos
998,999,University of A Coruña,Spain,40,367,567,218,886,974,812,975.0,651,44.02,2015,


In [6]:
# Want to save the safety value, safety category, normalized safety - from the safety dataset
# Normalized Meal, Normalized Beer, Normalized Apartment, Normalized Cost of Living, x1, x4, x48 - from the cost of living dataset
# World rank, institution, country, quality_of education, normalized rank - from the uni_rank dataset
final_dataset = uni_rank[['world_rank', 'institution', 'country', 'City', 'quality_of_education']]
final_dataset.rename(columns={'City': 'city'}, inplace=True)

# merging by the 'city' column
final_dataset = pd.merge(final_dataset, cost_of_living[['city', 'x1', 'x4', 'x48']], 
                     left_on='city', right_on='city', how='left')

# Merge safety dataset based on the 'ountry' column
final_dataset = pd.merge(final_dataset, safety[['country', 'Safety Value', 'Safety Category']], 
                    left_on='country', right_on='country', how='left')

final_dataset.rename(columns={'x1': 'Meal Price', 'x4': 'Beer Price', 'x48': 'Apartment Price' }, inplace=True)

# final_dataset.to_csv('final_dataset.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset.rename(columns={'City': 'city'}, inplace=True)


In [7]:
final_dataset

Unnamed: 0,world_rank,institution,country,city,quality_of_education,Meal Price,Beer Price,Apartment Price,Safety Value,Safety Category
0,1,Harvard University,USA,Riverside,1,17.00,7.00,1856.50,50.74,'Moderate'
1,2,Stanford University,USA,Cali,9,2.20,0.84,208.30,50.74,'Moderate'
2,3,Massachusetts Institute of Technology,USA,Itu,3,5.75,2.01,263.44,50.74,'Moderate'
3,4,University of Cambridge,United Kingdom,Homs,2,5.98,2.49,76.67,51.88,'Moderate'
4,5,University of Oxford,United Kingdom,Oxford,7,15.38,6.15,1437.56,51.88,'Moderate'
...,...,...,...,...,...,...,...,...,...,...
1052,996,University of the Algarve,Portugal,,367,,,,68.17,'High'
1053,997,Alexandria University,Egypt,,236,,,,52.64,'Moderate'
1054,998,Federal University of Ceará,Brazil,Jos,367,1.58,0.68,675.42,35.36,'Low'
1055,999,University of A Coruña,Spain,,367,,,,63.51,'High'


In [8]:
# create a normalized score so we can then multiply with the others score that we going to create in the different datasets
import pandas as pd
from sklearn.preprocessing import MinMaxScaler # for normalization

final_dataset = pd.read_csv('Datasets/final_dataset.csv')

# Initialize a MinMaxScaler for normalization
scaler = MinMaxScaler()

# Normalize the university rank (assuming 'world_rank' column exists and lower is better)
final_dataset['Norm Rank'] = 1 - scaler.fit_transform(final_dataset[['world_rank']])

# Normalize the safety (assuming 'safety' column exists)
final_dataset['Norm Safety'] = scaler.fit_transform(final_dataset[['Safety Value']])


# For the effect of the of cost of living, we will use this 3 metrics, price of a meal at a restaurant, price of a beer and the cost of a 1 bedroom apartment:
    # x1	Meal, Inexpensive Restaurant (USD)
    # x4	Domestic Beer (0.5 liter draught, in restaurants) (USD)
    # x48	Apartment (1 bedroom) in City Centre (USD)

final_dataset['Norm Meal'] = 1 - scaler.fit_transform(final_dataset[['Meal Price']])
final_dataset['Norm Beer'] = 1 - scaler.fit_transform(final_dataset[['Beer Price']])
final_dataset['Norm Apartment'] = 1 - scaler.fit_transform(final_dataset[['Apartment Price']])

# can then calculate the average of these, or give weights based on the importance of each
final_dataset['Norm Cost of Living'] = (
    final_dataset['Norm Meal'] + 
    final_dataset['Norm Beer'] + 
    final_dataset['Norm Apartment']
) / 3
final_dataset


Unnamed: 0,world_rank,institution,country,city,quality_of_education,Meal Price,Beer Price,Apartment Price,Safety Value,Safety Category,Norm Rank,Norm Safety,Norm Meal,Norm Beer,Norm Apartment,Norm Cost of Living
0,1,Harvard University,USA,Riverside,1,17.00,7.00,1856.50,50.74,'Moderate',1.000000,0.429660,0.459340,0.330645,0.774089,0.521358
1,2,Stanford University,USA,Cali,9,2.20,0.84,208.30,50.74,'Moderate',0.998999,0.429660,0.962913,0.951613,0.981239,0.965255
2,3,Massachusetts Institute of Technology,USA,Itu,3,5.75,2.01,263.44,50.74,'Moderate',0.997998,0.429660,0.842123,0.833669,0.974309,0.883367
3,4,University of Cambridge,United Kingdom,Homs,2,5.98,2.49,76.67,51.88,'Moderate',0.996997,0.448959,0.834297,0.785282,0.997783,0.872454
4,5,University of Oxford,United Kingdom,Oxford,7,15.38,6.15,1437.56,51.88,'Moderate',0.995996,0.448959,0.514461,0.416331,0.826742,0.585845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1052,996,University of the Algarve,Portugal,,367,,,,68.17,'High',0.004004,0.724733,,,,
1053,997,Alexandria University,Egypt,,236,,,,52.64,'Moderate',0.003003,0.461825,,,,
1054,998,Federal University of Ceará,Brazil,Jos,367,1.58,0.68,675.42,35.36,'Low',0.002002,0.169291,0.984008,0.967742,0.922530,0.958093
1055,999,University of A Coruña,Spain,,367,,,,63.51,'High',0.001001,0.645844,,,,


# Weather API

Used the open weather map API, to get the weather for each city

In [9]:
#pip install requests

In [10]:
import requests

# Replace with your OpenWeatherMap API key
api_key = 'YOUR_API_KEY_HERE'  # Replace this with your actual API key
base_url = 'http://api.openweathermap.org/data/2.5/weather?'

# Function to get weather data
def get_weather(city):
    # Construct the URL to make the API request
    url = f'{base_url}q={city}&appid={api_key}&units=metric'
    
    # Send the request and get the response
    response = requests.get(url)
    
    # Print the status code and response to debug
    print(f"Request URL: {url}")
    print(f"Response Status Code: {response.status_code}")
    
    if response.status_code == 200:
        data = response.json()
        # Extract relevant weather information
        temperature = data['main']['temp']
        weather_condition = data['weather'][0]['description']
        return f"{temperature}°C, {weather_condition}"
    else:
        # Print the response to understand what went wrong
        print(f"Error response: {response.json()}")
        return "Data not available"

# Iterate over the cities in the dataset and get the weather for each
final_dataset['weather'] = final_dataset['city'].apply(get_weather)

# Print the updated dataset
final_dataset

Request URL: http://api.openweathermap.org/data/2.5/weather?q=Riverside&appid=YOUR_API_KEY_HERE&units=metric
Response Status Code: 401
Error response: {'cod': 401, 'message': 'Invalid API key. Please see https://openweathermap.org/faq#error401 for more info.'}
Request URL: http://api.openweathermap.org/data/2.5/weather?q=Cali&appid=YOUR_API_KEY_HERE&units=metric
Response Status Code: 401
Error response: {'cod': 401, 'message': 'Invalid API key. Please see https://openweathermap.org/faq#error401 for more info.'}
Request URL: http://api.openweathermap.org/data/2.5/weather?q=Itu&appid=YOUR_API_KEY_HERE&units=metric
Response Status Code: 401
Error response: {'cod': 401, 'message': 'Invalid API key. Please see https://openweathermap.org/faq#error401 for more info.'}
Request URL: http://api.openweathermap.org/data/2.5/weather?q=Homs&appid=YOUR_API_KEY_HERE&units=metric
Response Status Code: 401
Error response: {'cod': 401, 'message': 'Invalid API key. Please see https://openweathermap.org/fa

KeyboardInterrupt: 

# This is with the real-time weather conditions

To get like the annual average, we need the premium plan, because it allows us to get more data. We can extrapolate with this data and then say that it would be possible to do this with the annual average or even by season.


This accounts for: 

1. Volume: Access to a large number of weather data from thousands of cities around the world. Real-time, historical and forecasts. This constitutes as big volume.

2. Variety: Temperature, perceived temperature, weather conditions (clear sky, light rain), humidity, wind, atmospheric pressure, cloudiness, sunrise and sunset. 

3. Velocity: Real-time data and forecast data, enabling fast data collection for analysis. 

4. Veracity: Comes from reliable sources and is continuously updated

5. Value: Valuable for our research and will add value to the user experience of the product. 

In [None]:
# Extract the temperature from the 'weather' column
final_dataset['temperature'] = final_dataset['weather'].str.extract(r'([+-]?\d+\.\d+)').astype(float)
final_dataset

Unnamed: 0,world_rank,institution,country,city,quality_of_education,Meal Price,Beer Price,Apartment Price,Safety Value,Safety Category,Norm Rank,Norm Safety,Norm Meal,Norm Beer,Norm Apartment,Norm Cost of Living,weather,temperature
0,1,Harvard University,USA,Riverside,1,17.00,7.00,1856.50,50.74,'Moderate',1.000000,0.429660,0.459340,0.330645,0.774089,0.521358,"14.63°C, clear sky",14.63
1,2,Stanford University,USA,Cali,9,2.20,0.84,208.30,50.74,'Moderate',0.998999,0.429660,0.962913,0.951613,0.981239,0.965255,"17.06°C, broken clouds",17.06
2,3,Massachusetts Institute of Technology,USA,Itu,3,5.75,2.01,263.44,50.74,'Moderate',0.997998,0.429660,0.842123,0.833669,0.974309,0.883367,"17.17°C, clear sky",17.17
3,4,University of Cambridge,United Kingdom,Homs,2,5.98,2.49,76.67,51.88,'Moderate',0.996997,0.448959,0.834297,0.785282,0.997783,0.872454,"23.2°C, scattered clouds",23.20
4,5,University of Oxford,United Kingdom,Oxford,7,15.38,6.15,1437.56,51.88,'Moderate',0.995996,0.448959,0.514461,0.416331,0.826742,0.585845,"15.13°C, clear sky",15.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1052,996,University of the Algarve,Portugal,,367,,,,68.17,'High',0.004004,0.724733,,,,,"28.58°C, overcast clouds",28.58
1053,997,Alexandria University,Egypt,,236,,,,52.64,'Moderate',0.003003,0.461825,,,,,"28.58°C, overcast clouds",28.58
1054,998,Federal University of Ceará,Brazil,Jos,367,1.58,0.68,675.42,35.36,'Low',0.002002,0.169291,0.984008,0.967742,0.922530,0.958093,"34.06°C, broken clouds",34.06
1055,999,University of A Coruña,Spain,,367,,,,63.51,'High',0.001001,0.645844,,,,,"28.58°C, overcast clouds",28.58


In [None]:
# Normalize temperature
final_dataset['Norm Temperature'] = scaler.fit_transform(final_dataset[['temperature']])

# Apply penalty for too high temperatures (>30°C)
penalty_factor = 0.7
final_dataset.loc[final_dataset['temperature'] > 30, 'Norm Temperature'] *= penalty_factor

final_dataset

Unnamed: 0,world_rank,institution,country,city,quality_of_education,Meal Price,Beer Price,Apartment Price,Safety Value,Safety Category,Norm Rank,Norm Safety,Norm Meal,Norm Beer,Norm Apartment,Norm Cost of Living,weather,temperature,Norm Temperature
0,1,Harvard University,USA,Riverside,1,17.00,7.00,1856.50,50.74,'Moderate',1.000000,0.429660,0.459340,0.330645,0.774089,0.521358,"14.63°C, clear sky",14.63,0.376623
1,2,Stanford University,USA,Cali,9,2.20,0.84,208.30,50.74,'Moderate',0.998999,0.429660,0.962913,0.951613,0.981239,0.965255,"17.06°C, broken clouds",17.06,0.428359
2,3,Massachusetts Institute of Technology,USA,Itu,3,5.75,2.01,263.44,50.74,'Moderate',0.997998,0.429660,0.842123,0.833669,0.974309,0.883367,"17.17°C, clear sky",17.17,0.430700
3,4,University of Cambridge,United Kingdom,Homs,2,5.98,2.49,76.67,51.88,'Moderate',0.996997,0.448959,0.834297,0.785282,0.997783,0.872454,"23.2°C, scattered clouds",23.20,0.559080
4,5,University of Oxford,United Kingdom,Oxford,7,15.38,6.15,1437.56,51.88,'Moderate',0.995996,0.448959,0.514461,0.416331,0.826742,0.585845,"15.13°C, clear sky",15.13,0.387268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1052,996,University of the Algarve,Portugal,,367,,,,68.17,'High',0.004004,0.724733,,,,,"28.58°C, overcast clouds",28.58,0.673621
1053,997,Alexandria University,Egypt,,236,,,,52.64,'Moderate',0.003003,0.461825,,,,,"28.58°C, overcast clouds",28.58,0.673621
1054,998,Federal University of Ceará,Brazil,Jos,367,1.58,0.68,675.42,35.36,'Low',0.002002,0.169291,0.984008,0.967742,0.922530,0.958093,"34.06°C, broken clouds",34.06,0.553204
1055,999,University of A Coruña,Spain,,367,,,,63.51,'High',0.001001,0.645844,,,,,"28.58°C, overcast clouds",28.58,0.673621


In [None]:
# pip install pycountry-convert

Collecting pycountry-convert
  Downloading pycountry_convert-0.7.2-py3-none-any.whl.metadata (7.2 kB)
Collecting pprintpp>=0.3.0 (from pycountry-convert)
  Downloading pprintpp-0.4.0-py2.py3-none-any.whl.metadata (7.9 kB)
Collecting pycountry>=16.11.27.1 (from pycountry-convert)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Collecting pytest>=3.4.0 (from pycountry-convert)
  Downloading pytest-8.3.5-py3-none-any.whl.metadata (7.6 kB)
Collecting pytest-mock>=1.6.3 (from pycountry-convert)
  Downloading pytest_mock-3.14.0-py3-none-any.whl.metadata (3.8 kB)
Collecting pytest-cov>=2.5.1 (from pycountry-convert)
  Downloading pytest_cov-6.1.1-py3-none-any.whl.metadata (28 kB)
Collecting repoze.lru>=0.7 (from pycountry-convert)
  Downloading repoze.lru-0.7-py3-none-any.whl.metadata (1.1 kB)
Collecting iniconfig (from pytest>=3.4.0->pycountry-convert)
  Downloading iniconfig-2.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting pluggy<2,>=1.5 (from pytest>=3.4.0->pycountry-co


[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Add the continent to the dataset- will be used as filter option for the users
# To get the continent from the country, we can use the pycountry_convert library

import pycountry_convert as pc

def country_to_continent(country):
    try:
        country_code = pc.country_name_to_country_alpha2(country)
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except:
        return "Unknown"

final_dataset["continent"] = final_dataset["country"].apply(country_to_continent)

In [None]:
final_dataset

Unnamed: 0,world_rank,institution,country,city,quality_of_education,Meal Price,Beer Price,Apartment Price,Safety Value,Safety Category,Norm Rank,Norm Safety,Norm Meal,Norm Beer,Norm Apartment,Norm Cost of Living,weather,temperature,Norm Temperature,continent
0,1,Harvard University,USA,Riverside,1,17.00,7.00,1856.50,50.74,'Moderate',1.000000,0.429660,0.459340,0.330645,0.774089,0.521358,"14.63°C, clear sky",14.63,0.376623,North America
1,2,Stanford University,USA,Cali,9,2.20,0.84,208.30,50.74,'Moderate',0.998999,0.429660,0.962913,0.951613,0.981239,0.965255,"17.06°C, broken clouds",17.06,0.428359,North America
2,3,Massachusetts Institute of Technology,USA,Itu,3,5.75,2.01,263.44,50.74,'Moderate',0.997998,0.429660,0.842123,0.833669,0.974309,0.883367,"17.17°C, clear sky",17.17,0.430700,North America
3,4,University of Cambridge,United Kingdom,Homs,2,5.98,2.49,76.67,51.88,'Moderate',0.996997,0.448959,0.834297,0.785282,0.997783,0.872454,"23.2°C, scattered clouds",23.20,0.559080,Europe
4,5,University of Oxford,United Kingdom,Oxford,7,15.38,6.15,1437.56,51.88,'Moderate',0.995996,0.448959,0.514461,0.416331,0.826742,0.585845,"15.13°C, clear sky",15.13,0.387268,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1052,996,University of the Algarve,Portugal,,367,,,,68.17,'High',0.004004,0.724733,,,,,"28.58°C, overcast clouds",28.58,0.673621,Europe
1053,997,Alexandria University,Egypt,,236,,,,52.64,'Moderate',0.003003,0.461825,,,,,"28.58°C, overcast clouds",28.58,0.673621,Africa
1054,998,Federal University of Ceará,Brazil,Jos,367,1.58,0.68,675.42,35.36,'Low',0.002002,0.169291,0.984008,0.967742,0.922530,0.958093,"34.06°C, broken clouds",34.06,0.553204,South America
1055,999,University of A Coruña,Spain,,367,,,,63.51,'High',0.001001,0.645844,,,,,"28.58°C, overcast clouds",28.58,0.673621,Europe


In [None]:
# Set the weights for each factor - Should add to one
w_temp = 0.2
w_cost = 0.3
w_rank = 0.4
w_safety = 0.1

if (w_temp + w_cost + w_rank + w_safety) != 1:
    print('Warning: The sum of the weights do not add to 1!')

In [None]:
# Calculate the final score using the weights and the scores done

final_dataset['Final Score'] = (
    w_cost * final_dataset['Norm Cost of Living'] +
    w_safety * final_dataset['Norm Safety'] +
    w_rank * final_dataset['Norm Rank'] +
    w_temp * final_dataset['Norm Temperature']
)

final_dataset

Unnamed: 0,world_rank,institution,country,city,quality_of_education,Meal Price,Beer Price,Apartment Price,Safety Value,Safety Category,...,Norm Safety,Norm Meal,Norm Beer,Norm Apartment,Norm Cost of Living,weather,temperature,Norm Temperature,continent,Final Score
0,1,Harvard University,USA,Riverside,1,17.00,7.00,1856.50,50.74,'Moderate',...,0.429660,0.459340,0.330645,0.774089,0.521358,"14.63°C, clear sky",14.63,0.376623,North America,0.674698
1,2,Stanford University,USA,Cali,9,2.20,0.84,208.30,50.74,'Moderate',...,0.429660,0.962913,0.951613,0.981239,0.965255,"17.06°C, broken clouds",17.06,0.428359,North America,0.817814
2,3,Massachusetts Institute of Technology,USA,Itu,3,5.75,2.01,263.44,50.74,'Moderate',...,0.429660,0.842123,0.833669,0.974309,0.883367,"17.17°C, clear sky",17.17,0.430700,North America,0.793315
3,4,University of Cambridge,United Kingdom,Homs,2,5.98,2.49,76.67,51.88,'Moderate',...,0.448959,0.834297,0.785282,0.997783,0.872454,"23.2°C, scattered clouds",23.20,0.559080,Europe,0.817247
4,5,University of Oxford,United Kingdom,Oxford,7,15.38,6.15,1437.56,51.88,'Moderate',...,0.448959,0.514461,0.416331,0.826742,0.585845,"15.13°C, clear sky",15.13,0.387268,Europe,0.696501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1052,996,University of the Algarve,Portugal,,367,,,,68.17,'High',...,0.724733,,,,,"28.58°C, overcast clouds",28.58,0.673621,Europe,
1053,997,Alexandria University,Egypt,,236,,,,52.64,'Moderate',...,0.461825,,,,,"28.58°C, overcast clouds",28.58,0.673621,Africa,
1054,998,Federal University of Ceará,Brazil,Jos,367,1.58,0.68,675.42,35.36,'Low',...,0.169291,0.984008,0.967742,0.922530,0.958093,"34.06°C, broken clouds",34.06,0.553204,South America,0.415799
1055,999,University of A Coruña,Spain,,367,,,,63.51,'High',...,0.645844,,,,,"28.58°C, overcast clouds",28.58,0.673621,Europe,


In [None]:
#sort final dataset by final score
final_dataset = final_dataset.sort_values(by='Final Score', ascending=False)
final_dataset

Unnamed: 0,world_rank,institution,country,city,quality_of_education,Meal Price,Beer Price,Apartment Price,Safety Value,Safety Category,...,Norm Safety,Norm Meal,Norm Beer,Norm Apartment,Norm Cost of Living,weather,temperature,Norm Temperature,continent,Final Score
1,2,Stanford University,USA,Cali,9,2.20,0.84,208.30,50.74,'Moderate',...,0.429660,0.962913,0.951613,0.981239,0.965255,"17.06°C, broken clouds",17.06,0.428359,North America,0.817814
3,4,University of Cambridge,United Kingdom,Homs,2,5.98,2.49,76.67,51.88,'Moderate',...,0.448959,0.834297,0.785282,0.997783,0.872454,"23.2°C, scattered clouds",23.20,0.559080,Europe,0.817247
6,7,"University of California, Berkeley",USA,Cali,5,2.20,0.84,208.30,50.74,'Moderate',...,0.429660,0.962913,0.951613,0.981239,0.965255,"17.06°C, broken clouds",17.06,0.428359,North America,0.815812
79,77,University of Notre Dame,USA,Jos,221,1.58,0.68,675.42,50.74,'Moderate',...,0.429660,0.984008,0.967742,0.922530,0.958093,"34.06°C, broken clouds",34.06,0.553204,North America,0.810604
43,42,McGill University,Canada,Nis,34,6.28,1.35,255.25,54.34,'Moderate',...,0.490604,0.824090,0.900202,0.975338,0.899877,"21.91°C, clear sky",21.91,0.531616,North America,0.808930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1051,995,King Abdulaziz University,Saudi Arabia,,367,,,,76.23,'High',...,0.861182,,,,,"28.58°C, overcast clouds",28.58,0.673621,Asia,
1052,996,University of the Algarve,Portugal,,367,,,,68.17,'High',...,0.724733,,,,,"28.58°C, overcast clouds",28.58,0.673621,Europe,
1053,997,Alexandria University,Egypt,,236,,,,52.64,'Moderate',...,0.461825,,,,,"28.58°C, overcast clouds",28.58,0.673621,Africa,
1055,999,University of A Coruña,Spain,,367,,,,63.51,'High',...,0.645844,,,,,"28.58°C, overcast clouds",28.58,0.673621,Europe,


In [None]:
# Step 1: Sort by city and world_rank (ascending = better rank first)
dataset_sorted = final_dataset.sort_values(by=["city", "world_rank"])

# Step 2: Get top 3 universities per city
top3_unis = dataset_sorted.groupby("city").head(3)

# Step 3: Create a new column combining university name and world rank
top3_unis.loc[:, "institution_with_rank"] = (
    top3_unis["institution"] + " (#" + top3_unis["world_rank"].astype(str) + ")"
)

# Step 4: Aggregate data by city
city_df = top3_unis.groupby("city").agg({
    "country": "first",
    "continent": "first",                #
    "Norm Rank": "mean",
    "Norm Cost of Living": "first",
    "Norm Safety": "first",
    "temperature": "first",
    "Norm Temperature": "first",
    "institution_with_rank": lambda x: ', '.join(x)
}).rename(columns={"institution_with_rank": "Top Universities"})

# Step 5: Reset index so 'city' becomes a column
city_df = city_df.reset_index()

# Step 6: Fill NaN with 0 before calculating final score
city_df = city_df.fillna(0)


# Step 7: Calculate the final score for each city
# Compute final score for each city
# Set the weights for each factor (sum should ideally be 1)
w_rank = 0.25     
w_cost = 0.25      
w_safety = 0.25
w_temperature = 0.25


# Compute the final score as a weighted sum of the normalized values
city_df['Final Score'] = (w_rank * city_df['Norm Rank'] +
                           w_cost * city_df['Norm Cost of Living'] +
                           w_safety * city_df['Norm Safety'] + 
                           w_temperature * city_df['Norm Temperature'])

# Step 8: Sort cities by Final Score descending
city_df = city_df.sort_values(by="Final Score", ascending=False)

# Display
city_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top3_unis.loc[:, "institution_with_rank"] = (


Unnamed: 0,city,country,continent,Norm Rank,Norm Cost of Living,Norm Safety,temperature,Norm Temperature,Top Universities,Final Score
268,Singapore,Singapore,Asia,0.935936,0.538498,0.797698,28.83,0.678944,National University of Singapore (#65),0.737769
135,Homs,United Kingdom,Europe,0.996997,0.872454,0.448959,23.20,0.559080,University of Cambridge (#4),0.719373
132,Hiroshima,Japan,Asia,0.710711,0.829528,0.875402,17.64,0.440707,Hiroshima University (#290),0.714087
64,Campo Grande,Portugal,Europe,0.743744,0.905478,0.724733,19.53,0.480945,University of Lisbon (#257),0.713725
216,Nis,Canada,North America,0.914248,0.899877,0.490604,21.91,0.531616,"McGill University (#42), Technical University ...",0.709086
...,...,...,...,...,...,...,...,...,...,...
108,Fairbanks,USA,North America,0.409409,0.520169,0.429660,-3.06,0.000000,University of Alaska Fairbanks (#591),0.339810
102,Elmont,Australia,Oceania,0.405405,0.000000,0.463010,14.83,0.380881,Curtin University (#595),0.312324
305,Webster,USA,North America,0.332332,0.000000,0.429660,10.75,0.294017,"University of Maine, Orono (#668)",0.264002
182,Lowell,USA,North America,0.116116,0.000000,0.429660,7.91,0.233553,University of Massachusetts Lowell (#884),0.194832


In [None]:
# Export the city_df to a CSV file
city_df.to_csv('city_df.csv', index=False)