In [1]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, make_scorer 



In [2]:

# Load preprocessed dataset
dataset = pd.read_csv('../preprocessing/combined_data_filled.csv')

In [3]:
# Numeric variables for matrix factorization
numeric_vars = ['Average Fee per semester in AUD', 'IELTS overall', 'IELTS reading', 'IELTS listening',
                'IELTS writing', 'IELTS speaking', 'TOEFL IBT overall', 'TOEFL IBT listening',
                'TOEFL IBT reading', 'TOEFL IBT speaking', 'TOEFL IBT writing', 'PTE overall',
                'PTE speaking & writing', 'PTE reading', 'PTE listening', 'minimum IELTS overall',
                'minimum TOEFL IBT overall', 'minimum PTE overall']

In [4]:
# Categorical variables for one-hot encoding
categorical_vars = ['Uni Name', 'State']

In [5]:
# Split dataset into train and test sets
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

In [6]:
# Apply matrix factorization (SVD) on numeric variables
numeric_svd = TruncatedSVD(n_components=18, random_state=42)
numeric_factors = numeric_svd.fit_transform(train_data[numeric_vars])


In [7]:
# Apply one-hot encoding on categorical variables
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
categorical_factors = one_hot_encoder.fit_transform(train_data[categorical_vars])



In [8]:
# Combine numeric and categorical factors
combined_factors = pd.concat([pd.DataFrame(numeric_factors), pd.DataFrame(categorical_factors)], axis=1)
print(combined_factors)

              0          1          2          3          4          5   \
0   33400.365325 -12.515928  10.162114  -2.193213  -1.651234  -3.153555   
1   30240.445880  10.646442   0.048547   1.200317  -1.500256   0.403903   
2   33000.414959  -2.329283  -0.371086  -0.185715  -0.572725  -0.600111   
3   31868.186373 -10.293545  -8.655096  -9.042144   2.359315   9.413422   
4   32500.416748  -0.716724   0.709012   0.241605  -0.708053  -0.206635   
5   31868.245444   0.806143   0.307084   3.240619  -6.093123  -0.965328   
6   38432.340033 -30.581399   2.416424  -2.287110   1.361235  -1.842209   
7   31868.279504   8.195445   4.218427   1.327226  -4.147975  -1.498575   
8   26400.471523  24.930660   2.210618   4.155271  -5.742290   2.587574   
9   41040.305919 -43.778544   3.128879  -3.426310   2.292660  -2.601621   
10  33000.391201  -6.934714   3.069422   0.890432  -0.879020  -2.101262   
11  31868.250656   2.403735   0.492167   0.485200  -0.905635  -0.068469   
12  31868.207963  -5.4470

In [9]:
# Calculate similarity matrix
similarity_matrix = cosine_similarity(combined_factors)
print(similarity_matrix)


[[1.         0.99999966 0.99999988 0.99999969 0.99999987 0.99999982
  0.99999986 0.99999976 0.99999903 0.99999971 0.99999994 0.99999983
  0.99999984 0.99999786 0.99999984 0.99999429 0.99999983 0.9999995
  0.99999804 0.99999983 0.99999929 0.99999994 0.99999992 0.99999984
  0.99999983 0.99999983 0.9999971  0.99999963 0.99999983 0.99999927]
 [0.99999966 1.         0.99999991 0.99999962 0.99999993 0.99999993
  0.99999933 0.99999997 0.99999979 0.99999897 0.99999983 0.99999996
  0.9999998  0.99999841 0.9999996  0.99999658 0.99999996 0.99999976
  0.99999918 0.99999996 0.99999989 0.99999967 0.99999952 0.99999995
  0.99999996 0.99999996 0.99999866 0.99999993 0.99999996 0.999999  ]
 [0.99999988 0.99999991 1.         0.99999983 1.         0.99999997
  0.99999973 0.99999992 0.99999943 0.99999949 0.99999998 0.99999999
  0.99999995 0.99999798 0.99999987 0.99999539 0.99999999 0.99999966
  0.9999987  0.99999999 0.99999963 0.99999993 0.99999985 0.99999999
  0.99999999 0.99999999 0.99999801 0.9999998  0

In [10]:
sorted(list(enumerate(similarity_matrix[0])), reverse= True, key= lambda x: x[1])[1:6]

[(10, 0.9999999449974507),
 (21, 0.9999999415814274),
 (22, 0.999999916829523),
 (2, 0.9999998760283137),
 (4, 0.9999998738177653)]

In [11]:
# Function to recommend courses for a given course ID
def recommend_uni(uni):
    uni_index= dataset[dataset['Uni Name']== uni].index[0]
    distances= similarity_matrix[uni_index]
    uni_list = sorted(list(enumerate(distances)), reverse= True, key= lambda x: x[1])[1:6]
    c = 1
    for i in uni_list:
        print(f"{c}) {dataset.iloc[i[0]]['Uni Name']}")
        c += 1

In [12]:
# Example usage: Recommend courses for any university
print("The top 5 university recommendations are:")
print("\n")
recommended_courses = recommend_uni('The Australian National University')


The top 5 university recommendations are:


1) University of Technology Sydney
2) University of the Sunshine Coast
3) Flinders University
4) Avondale University
5) Macquarie University


In [13]:
# Test the model
test_numeric_factors = numeric_svd.transform(test_data[numeric_vars])
test_categorical_factors = one_hot_encoder.transform(test_data[categorical_vars])
test_combined_factors = pd.concat([pd.DataFrame(test_numeric_factors), pd.DataFrame(test_categorical_factors)], axis=1)


In [14]:
# Calculate similarity matrix for test data
test_similarity_matrix = cosine_similarity(test_combined_factors)

In [15]:
# Function to recommend courses for a given course ID
def predict_uni(uni):
    uni_index= dataset[dataset['Uni Name']== uni].index[0]
    distances= test_similarity_matrix[uni_index]
    uni_list = sorted(list(enumerate(distances)), reverse= True, key= lambda x: x[1])[1:6]
    
    c = 1
    for i in uni_list:
        print(f"{c}) {dataset.iloc[i[0]]['Uni Name']}")
        c += 1

In [16]:
print("The top 5 university recommendations are:")
print("\n")

predicted_courses = predict_uni('The Australian National University')

The top 5 university recommendations are:


1) University of New England
2) Avondale University
3) University of New South Wales
4) Macquarie University
5) University of Canberra


In [17]:
# Now for the simple user-interface to showcase my project

In [18]:
pickle.dump(dataset.to_dict(), open('dataset.pkl', 'wb'))

In [19]:
pickle.dump(similarity_matrix, open('similarity.pkl', 'wb'))