In [54]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the survey responses from a CSV file
file_path = "Data Gathering Survey (Responses).csv"
df = pd.read_csv(file_path)

# Column holding the label the model is trained to predict
target_column = "Which university are you in?"

# Survey questions that are one-hot encoded as categories.
# These lists are defined BEFORE the missing-column check below so the cell
# runs on a fresh kernel (the check reads them).
# NOTE(review): the GPA/grade question is listed as categorical, so every
# distinct answer string becomes its own category — confirm this is intended.
categorical_features = [
    "What is your gender?", 
    "What is your Nationality?", 
    "Are you a domestic or International Student?", 
    "What is your highest academic qualification before joining university?",
    "How would you describe your personality?", 
    "Are you involved in any leadership roles (e.g., student council, club president)?",
    "What type of extracurricular activities do you prioritize?",
    "What sources did you rely on to fund your university education?",
    "What was your GPA/Grade of your highest qualification? (E.g. 3.6/4.0 GPA, 70/90 Rank Points)"
]

# Survey questions that are used as numeric inputs
numerical_features = [
    "How many hours per week do you dedicate to extracurricular activities?",
    "How significant was the availability of financial aid or scholarships in your university choice?"
]

# Quick sanity check that the target column exists and looks as expected
print(target_column in df.columns)
print(df[target_column].head())

# Report every expected feature column (categorical AND numerical) that the
# CSV does not contain, so a renamed survey question is easy to spot before
# the column selection below fails with a KeyError.
missing_cols = [
    col for col in categorical_features + numerical_features
    if col not in df.columns
]
if missing_cols:
    print(f"Warning: The following columns are missing: {missing_cols}")

# Prepare X (model inputs) and y (target labels)
X = df[categorical_features + numerical_features]  # Select relevant columns
y = df[target_column]  # University each respondent attends

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and batch shuffling start from a fixed state on every run.
# NOTE(review): bit-for-bit repeatability can still depend on hardware/backend.
SEED = 42
keras.utils.set_random_seed(SEED)

# Scale the numeric answers and one-hot encode the categorical answers.
# handle_unknown='ignore' lets transform() accept categories not seen in fit.
# sparse_threshold=0 makes the transformer always return a dense array, so the
# model receives the same array type regardless of how many categories exist.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    sparse_threshold=0
)

# Learn the label vocabulary from the FULL target column. Fitting on y_train
# only would make transform(y_test) raise ValueError whenever a university
# appears solely in the test split.
y_encoder = LabelEncoder()
y_encoder.fit(y)

# Train-test split (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Fit the preprocessor on the training rows only, then apply it to both splits
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Encode target labels as integer class indices
y_train = y_encoder.transform(y_train)
y_test = y_encoder.transform(y_test)

# Define neural network model: two hidden ReLU layers and a softmax output
# with one unit per class known to the label encoder, so the output width
# always matches the indices y_encoder can produce.
model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(len(y_encoder.classes_), activation='softmax')
])

# Compile the model; the sparse loss takes integer class indices directly
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics and the final test score use the same rows.
model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test))

# Evaluate the model on the held-out split
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc:.4f}')

True
0    National University of Singapore (NUS)
1    National University of Singapore (NUS)
2    Nanyang Technological University (NTU)
3     Singapore Management University (SMU)
4    Nanyang Technological University (NTU)
Name: Which university are you in?, dtype: object
Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 0.5000


In [61]:
# Function to predict university from user input
def predict_university(user_input, top_k=3):
    """Return the most likely universities for one survey response.

    Relies on the module-level ``preprocessor``, ``model`` and ``y_encoder``
    objects.

    Args:
        user_input: Mapping of column name -> answer for a single respondent.
            Presumably it must contain the columns the preprocessor was
            fitted on — verify against the training cell.
        top_k: Maximum number of universities to return (default 3). If the
            model has fewer classes than ``top_k``, all classes are returned.

    Returns:
        A list of ``(university, score)`` tuples ordered from the highest to
        the lowest predicted score.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    # Wrap the single response in a one-row DataFrame. Column order is taken
    # from the dict as-is; no reordering is done here.
    user_df = pd.DataFrame([user_input])

    # Apply the already-fitted preprocessing to the response
    user_processed = preprocessor.transform(user_df)

    # The model returns one row of class scores per input row; keep the only row
    class_scores = model.predict(user_processed)[0]

    # Class indices sorted from highest to lowest score, truncated to top_k
    top_indices = class_scores.argsort()[::-1][:top_k]

    # Map class indices back to university names
    top_universities = y_encoder.inverse_transform(top_indices)

    # Scores that belong to the selected classes, in the same order
    top_scores = class_scores[top_indices]

    return list(zip(top_universities, top_scores))

# Sample survey response used to try out the predictor (edit values as needed).
# Keys are the survey column names; they are grouped here as text answers
# first and numeric answers last.
user_input_example = {
    # Text answers
    "What is your gender?": "Male",
    "What is your Nationality?": "Singaporean",
    "Are you a domestic or International Student?": "Domestic",
    "What is your highest academic qualification before joining university?": "A-Level",
    "How would you describe your personality?": "Extrovert",
    "Are you involved in any leadership roles (e.g., student council, club president)?": "Yes",
    "What type of extracurricular activities do you prioritize?": "Sports",
    "What sources did you rely on to fund your university education?": "Scholarships",
    "What was your GPA/Grade of your highest qualification? (E.g. 3.6/4.0 GPA, 70/90 Rank Points)": "3.6/4.0",
    # Numeric answers
    "How many hours per week do you dedicate to extracurricular activities?": 0,
    "How significant was the availability of financial aid or scholarships in your university choice?": 10,
}

# Run the predictor on the sample response and list each returned
# university with its score, one per line, to four decimal places.
predicted_universities = predict_university(user_input_example)
print("Top 3 predicted universities:")
for university_name, probability in predicted_universities:
    print(f"{university_name}: {probability:.4f}")


Top 3 predicted universities:
Singapore Management University (SMU): 0.4782
Singapore Institute of Technology (SIT): 0.3557
Nanyang Technological University (NTU): 0.0759
