In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import statsmodels.api as sm


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory
for folder, _subfolders, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-world-university-rankings-2011-2023/2017_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2020_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2023_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2015_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2018_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2022_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2012_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2013_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2014_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2016_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2021_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2011_rankings.csv
/kaggle/input/the-world-university-rankings-2011-2023/2019_rankings.csv


In [28]:
# Paths of the yearly ranking files, one per year from 2011 through 2023, in chronological order
file_paths = [
    f"/kaggle/input/the-world-university-rankings-2011-2023/{year}_rankings.csv"
    for year in range(2011, 2024)
]

In [30]:

def transform(df):
    '''
    Clean the 'rank' column of a rankings frame so that it holds plain integers.

    Tied ranks are written with a leading '=' (e.g. '=12'); the marker is
    removed. Any value that is missing or still not numeric after that is
    stored as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'rank' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with 'rank' converted to an integer dtype.
    '''

    # Work on text so the '=' marker can be stripped whatever dtype pandas inferred
    rank_text = df['rank'].astype(str).str.replace('=', '', regex=False)

    # Missing values turn into text such as 'nan' above, so coerce here instead
    # of calling fillna on the strings: anything non-numeric becomes NaN
    rank_numeric = pd.to_numeric(rank_text, errors='coerce')

    # Missing / unparseable ranks are recorded as 0, then everything is cast to int
    df['rank'] = rank_numeric.fillna(0).astype(int)

    return df

 



def mergeData(file_paths, top_n=100):
    '''
    Read each yearly rankings file, keep its first rows, and stack them.

    Parameters
    ----------
    file_paths : iterable of str
        CSV paths whose file name starts with the four-digit year,
        e.g. '.../2011_rankings.csv'.
    top_n : int, default 100
        Number of leading rows to keep from every file.

    Returns
    -------
    pandas.DataFrame
        All yearly frames concatenated in the order given, each passed
        through transform() and carrying a 'year' column (kept as a string).
        The per-file row index is preserved. An empty frame is returned when
        file_paths is empty.
    '''

    yearly_frames = []

    for file_path in file_paths:
        # .copy() so the column added below is not written into a slice of the full file
        yearly = pd.read_csv(file_path).head(top_n).copy()

        # The year is the first four characters of the file name
        yearly['year'] = os.path.basename(file_path)[:4]

        # transform data using the notebook's cleaning function
        yearly_frames.append(transform(yearly))

    # pd.concat raises on an empty list, so handle "no files" explicitly
    if not yearly_frames:
        return pd.DataFrame()

    # One concat at the end: DataFrame.append was removed in pandas 2.0 and
    # re-copied the accumulated frame on every iteration
    return pd.concat(yearly_frames)

In [31]:
def filter_uni(df):
    '''
    Keep only universities whose name appears on more than one row of the
    merged frame, ordered by name and then by year.
    '''
    # how many rows each university name has
    occurrences = df['name'].value_counts()

    # names seen at least twice
    repeated_names = list(occurrences.index[occurrences > 1])

    # restrict to those names, then order by name and year
    has_repeated_name = df['name'].isin(repeated_names)
    return df[has_repeated_name].sort_values(['name', 'year'])
    
    
    



def modeling(df_sorted, min_obs=10):
    '''
    Fit one OLS line of rank against year per university and collect the slopes.

    Parameters
    ----------
    df_sorted : pandas.DataFrame
        Frame with 'name', 'year' and 'rank' columns. 'year' values must be
        convertible with int().
    min_obs : int, default 10
        Minimum number of rows a university needs to be modelled; smaller
        groups are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns 'name' and 'beta', one row per modelled university, where
        'beta' is the fitted coefficient on year.
    '''

    # Collect plain records and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop
    records = []

    # iterate through the groups of rows with the same 'name'
    for name, group in df_sorted.groupby('name'):
        # too few observations for this university
        if len(group) < min_obs:
            continue

        # extract the 'year' and 'rank' columns as arrays
        year = group['year'].values
        rank = group['rank'].values

        # run a time series OLS using the 'year' as independent (t) and 'rank' as dependent (Y)
        X = sm.add_constant(np.array([int(y) for y in year]))
        results = sm.OLS(rank, X).fit()
        print(f"OLS Modeling for {name} , {year}")

        # add_constant puts the intercept column first, so params[1] is the slope on year
        records.append({'name': name, 'beta': results.params[1]})

    print("OLS Modeling Done!!!")

    return pd.DataFrame(records, columns=['name', 'beta'])


# Driver: runs the full pipeline (merge -> filter -> per-university OLS) and prints the result.
if __name__ == "__main__":
    # read every yearly file listed in file_paths and stack them into one frame
    data = mergeData(file_paths)
    # keep universities whose name appears on more than one row, sorted by name then year
    data_f = filter_uni(data)
    # one OLS fit of rank on year per university; 'beta' is the coefficient on year
    beta_df = modeling(data_f)
    # ascending order lists the smallest (most negative) betas first, i.e. the
    # universities whose rank number fell the most per year; show the first 50
    print(beta_df.sort_values('beta', ascending = True).head(50))

OLS Modeling for Australian National University , ['2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019' '2020'
 '2021' '2022' '2023']
OLS Modeling for Boston University , ['2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019' '2020'
 '2021' '2022' '2023']
OLS Modeling for Brown University , ['2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019' '2020'
 '2021' '2022' '2023']
OLS Modeling for California Institute of Technology , ['2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019' '2020'
 '2021' '2022' '2023']
OLS Modeling for Carnegie Mellon University , ['2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019' '2020'
 '2021' '2022' '2023']
OLS Modeling for Columbia University , ['2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019' '2020'
 '2021' '2022' '2023']
OLS Modeling for Cornell University , ['2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019' '2020'
 '2021' '2022' '2023']
OLS Modeling for Delft University of Techn