In [211]:
import pandas as pd
import numpy as np
import networkx as nx
import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler



In [212]:


def load_data():
    fpath='/home/adityapandey/Downloads/project-sem5/Data Sets/'
    indian_students = pd.read_csv(fpath+"IndianStudentsAbroad.csv")
    cost_living = pd.read_csv(fpath+"Cost_of_Living_Index_by_Country_2024.csv")
    tuition = pd.read_csv(fpath+"International_Education_Costs.csv")
    reputation = pd.read_csv(fpath+"QS World University Rankings 2025 (Top global universities).csv",encoding='latin1')
    return indian_students, cost_living, tuition, reputation

indian_students, cost_living, tuition, reputation = load_data()
indian_students.drop(columns=['index'], inplace=True)
print(indian_students.head())



       Country  No of Indian Students  Percentage  Unnamed: 3
0  US(2015-16)                 165918   37.134985         NaN
1    Australia                  66886   14.970109   52.105095
2       Canada                  50000   11.190764   63.295859
3  New Zealand                  32000    7.162089   70.457948
4      Bahrain                  27000    6.043013   76.500961


In [213]:
order = indian_students.sort_values('No of Indian Students')['Country'].unique().tolist()
encoder = OrdinalEncoder(categories=[order])
indian_students['Country_Encoded'] = encoder.fit_transform(indian_students[['Country']])

num_cols = ['Country_Encoded','No of Indian Students','Percentage','Unnamed: 3']
scaler = MinMaxScaler()
indian_students[num_cols] = scaler.fit_transform(indian_students[num_cols])
indian_students.drop(columns=['Unnamed: 3'], inplace=True)
print(indian_students)

               Country  No of Indian Students  Percentage  Country_Encoded
0          US(2015-16)               1.000000    1.000000         1.000000
1            Australia               0.403123    0.403123         0.986486
2               Canada               0.301349    0.301349         0.972973
3          New Zealand               0.192861    0.192861         0.959459
4              Bahrain               0.162726    0.162726         0.945946
..                 ...                    ...         ...              ...
70  Bosnia Herzegovina               0.000006    0.000006         0.040541
71             Lebanon               0.000006    0.000006         0.054054
72               Chile               0.000000    0.000000         0.013514
73            Mongolia               0.000000    0.000000         0.027027
74              Serbia               0.000000    0.000000         0.000000

[75 rows x 4 columns]


In [214]:
tuition.columns = tuition.columns.str.strip().str.lower().str.replace(" ", "_")

tuition["total_cost"] = (tuition["tuition_usd"] * tuition["duration_years"]) + \
                   (tuition["rent_usd"] *tuition["duration_years"]) + \
                tuition["visa_fee_usd"] +tuition["insurance_usd"]
nums_col2=['exchange_rate','living_cost_index','tuition_usd','duration_years','rent_usd','visa_fee_usd','insurance_usd','total_cost']  
tuition[nums_col2]=scaler.fit_transform(tuition[nums_col2]) 
print(tuition.head())


     country       city                      university  \
0        USA  Cambridge              Harvard University   
1         UK     London         Imperial College London   
2     Canada    Toronto           University of Toronto   
3  Australia  Melbourne         University of Melbourne   
4    Germany     Munich  Technical University of Munich   

                  program   level  duration_years  tuition_usd  \
0        Computer Science  Master            0.25     0.893548   
1            Data Science  Master            0.00     0.664516   
2      Business Analytics  Master            0.25     0.620968   
3             Engineering  Master            0.25     0.677419   
4  Mechanical Engineering  Master            0.25     0.008065   

   living_cost_index  rent_usd  visa_fee_usd  insurance_usd  exchange_rate  \
0           0.588795  0.872340      0.266667       1.000000       0.000020   
1           0.507400  0.702128      0.988889       0.461538       0.000015   
2           0.

In [215]:
cost_living.columns = cost_living.columns.str.strip().str.lower().str.replace(" ", "_")
nums_col3=['cost_of_living_index','rent_index','cost_of_living_plus_rent_index','groceries_index','restaurant_price_index','local_purchasing_power_index']
cost_living[nums_col3]=scaler.fit_transform(cost_living[nums_col3]) 
print(cost_living.head())


   rank      country  cost_of_living_index  rent_index  \
0     1  Switzerland              1.000000    0.680556   
1     2      Bahamas              0.804374    0.529321   
2     3      Iceland              0.780073    0.567901   
3     4    Singapore              0.703524    1.000000   
4     5     Barbados              0.702309    0.256173   

   cost_of_living_plus_rent_index  groceries_index  restaurant_price_index  \
0                        1.000000         1.000000                1.000000   
1                        0.794671         0.699782                0.837292   
2                        0.797806         0.774017                0.878860   
3                        0.956113         0.623362                0.446556   
4                        0.592476         0.691048                0.672209   

   local_purchasing_power_index  
0                      0.867925  
1                      0.290233  
2                      0.654828  
3                      0.603774  
4           

In [216]:

reputation.columns = reputation.columns.str.strip()
nums_col4=['RANK_2024','Region','SIZE','FOCUS','RES.','STATUS','Academic_Reputation_Score', 'Academic_Reputation_Rank', 'Employer_Reputation_Score', 'Employer_Reputation_Rank', 'Faculty_Student_Score', 'Faculty_Student_Rank', 'Citations_per_Faculty_Score', 'Citations_per_Faculty_Rank', 'International_Faculty_Score', 'International_Faculty_Rank', 'International_Students_Score', 'International_Students_Rank', 'International_Research_Network_Score', 'International_Research_Network_Rank', 'Employment_Outcomes_Score', 'Employment_Outcomes_Rank', 'Sustainability_Score', 'Sustainability_Rank']
reputation.drop(columns=nums_col4, inplace=True)
nums_col5=['Overall_Score']
# Ensure numeric dtype, then scale
reputation[nums_col5] = scaler.fit_transform(
    reputation[nums_col5].apply(pd.to_numeric, errors='coerce')
)
print(reputation.head())
print(reputation.columns.tolist())




  RANK_2025                              Institution_Name        Location  \
0         1  Massachusetts Institute of Technology (MIT)    United States   
1         2                       Imperial College London  United Kingdom   
2         3                          University of Oxford  United Kingdom   
3         4                            Harvard University   United States   
4         5                       University of Cambridge  United Kingdom   

   Overall_Score  
0       1.000000  
1       0.981061  
2       0.960859  
3       0.959596  
4       0.958333  
['RANK_2025', 'Institution_Name', 'Location', 'Overall_Score']


In [217]:
reputation.columns = reputation.columns.str.strip().str.lower().str.replace(" ", "_")
indian_students.columns = indian_students.columns.str.strip().str.lower().str.replace(" ", "_")
df = tuition.merge(cost_living, on="country", how="left") \
            .merge(indian_students, on="country", how="left") \
            .merge(reputation, left_on="university", right_on="institution_name", how="left")
# Example: Drop rows where 'rank' is NaN (missing)
df_cleaned = df.dropna(subset=['rank_2025'])

print(df_cleaned.head())
df_cleaned.to_csv('/home/adityapandey/Downloads/project-sem5/Data Sets/merged_indian_students_abroad.csv', index=False)

       country       city                      university  \
0          USA  Cambridge              Harvard University   
1           UK     London         Imperial College London   
2       Canada    Toronto           University of Toronto   
4      Germany     Munich  Technical University of Munich   
6  Netherlands  Amsterdam         University of Amsterdam   

                   program   level  duration_years  tuition_usd  \
0         Computer Science  Master            0.25     0.893548   
1             Data Science  Master            0.00     0.664516   
2       Business Analytics  Master            0.25     0.620968   
4   Mechanical Engineering  Master            0.25     0.008065   
6  Artificial Intelligence  Master            0.00     0.254839   

   living_cost_index  rent_usd  visa_fee_usd  ...  groceries_index  \
0           0.588795  0.872340      0.266667  ...              NaN   
1           0.507400  0.702128      0.988889  ...              NaN   
2           0.472516