### Datasets:

 https://www.kaggle.com/datasets/flyingwombat/us-news-and-world-reports-college-data?resource=download

 Andrew G. Reiter, “U.S. News & World Report Historical Liberal Arts College and University Rankings,” available at: http://andyreiter.com/datasets/

In [7]:
import pandas as pd
from math import isnan

import pickle

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing

## Loading Data into Memory

In [8]:
# Path of the CSV file that holds the college feature table
COLLEGE_DATA_FILE = './data/College.csv'

# Reads the comma-separated feature table into a dataframe
college_features = pd.read_csv(COLLEGE_DATA_FILE, sep=',')

In [9]:
# Spreadsheet with the USNEWS rankings
## Universities only - liberal arts schools may be added later
RANKING_DATA_FILE = 'data/US-News-Rankings.xlsx'

# Ranking year to use; it is also the label of the spreadsheet column that is read
YEAR = 2022

# Reads the rankings into a dataframe and keeps only the name of each
# university and its ranking for YEAR
college_rankings = pd.read_excel(RANKING_DATA_FILE)[['University Name', YEAR]]

## Combining Datasets to a Master Dataframe

In [54]:
# Master dataframe: one row per ranked university that is also in the feature
# dataset, holding its USNEWS rank followed by its feature columns

# Keeps only the universities that have a name and a numeric rank for YEAR
ranked = college_rankings[['University Name', YEAR]].copy()
ranked[YEAR] = pd.to_numeric(ranked[YEAR], errors='coerce')
ranked = ranked.dropna(subset=['University Name', YEAR])

# Keeps the first feature row of each name so a university matches at most one row
unique_features = college_features.drop_duplicates(subset='Name', keep='first')

# Inner join skips universities whose name is not in the feature dataset
college_data = ranked.merge(
    unique_features,
    how='inner',
    left_on='University Name',
    right_on='Name',
)
college_data = college_data.drop(columns=['University Name'])
college_data = college_data.rename(columns={YEAR: 'Rank'})

# Encodes private column into numeric
college_data['Private'] = college_data['Private'].map({'Yes': 1, 'No': 0})

# Sorts colleges by rank and replaces the rank with the position in that order
# (starting at 0), which removes all ties; the stable sort keeps tied colleges
# in the order of the ranking file
college_data = college_data.sort_values(by='Rank', kind='stable').reset_index(drop=True)
college_data['Rank'] = college_data.index

In [55]:
# Writes the master dataframe to disk as CSV, leaving out the row index
college_data.to_csv(path_or_buf='data/college_data', index=False)

In [2]:
# Reads the saved master dataframe back from its comma-separated file
college_data = pd.read_csv('data/college_data', sep=',')

## Training Linear Regression Model

In [56]:
# Target values: the rank of each college, as a numpy array
y_train = college_data['Rank'].to_numpy()

# Inputs: the encoded features of each college, i.e. every column except the
# name and the rank, as a numpy array
x_train = college_data.drop(columns=['Name', 'Rank']).to_numpy()

In [57]:
# Builds a pipeline that rescales every feature to [0, 1] before the linear
# regression, and trains it (fit returns the fitted pipeline itself)
model = make_pipeline(preprocessing.MinMaxScaler(), LinearRegression()).fit(x_train, y_train)

# Score of the fitted model on the same data it was trained on
r_squared = model.score(x_train, y_train)

## Predicting with Model

In [59]:
def prepare_x(index, data=None):
    """Return the feature row of one college, ready to be given to the model.

    Args:
        index: Index label of the college's row (looked up with ``.loc``).
        data: Dataframe to take the row from. When omitted, the module-level
            ``college_data`` dataframe is used.

    Returns:
        A pandas Series with every column of the selected row except
        'Name' and 'Rank'.

    Raises:
        KeyError: If ``index`` is not a label of the dataframe, or the
            dataframe has no 'Name' or 'Rank' column.
    """
    if data is None:
        data = college_data

    # drop() returns a new dataframe, so the caller's dataframe is left untouched
    return data.drop(columns=['Name', 'Rank']).loc[index]

In [65]:
# Predicts the ranking of the university stored under index label 2 of college_data;
# the feature row is reshaped into a single-sample 2D array for the model
x = prepare_x(2)
model.predict(x.to_numpy().reshape(1, -1))

array([2.87769133])

## Saving and Loading Model 

In [66]:
# Save the model to disk
MODEL_FILE = 'model.sav'

# The with-block closes the file as soon as the model has been written
with open(MODEL_FILE, 'wb') as model_file:
    pickle.dump(model, model_file)

In [3]:
# Load the model from disk
MODEL_FILE = 'model.sav'

# NOTE: unpickling can run arbitrary code, so only load a model file that you
# saved yourself. The with-block closes the file once the model is loaded.
with open(MODEL_FILE, 'rb') as model_file:
    model = pickle.load(model_file)