<a href="https://colab.research.google.com/github/AshikaKannan/AshikaKannan/blob/main/College_Recommendation_Model_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Generate Synthetic Student Data
def generate_synthetic_students(num_students=200):
    np.random.seed(42)  # For reproducibility
    students = pd.DataFrame({
        'student_id': range(1, num_students + 1),
        'IELTS': np.random.randint(5, 9, num_students),
        'GRE': np.random.randint(300, 340, num_students),
        'TOEFL': np.random.randint(80, 120, num_students),
        'GPA': np.round(np.random.uniform(2.5, 4.0, num_students), 2),
        'Financial_Affordability': np.random.choice(['Low', 'Medium', 'High'], num_students),
        'Preferred_Courses': np.random.choice(['Engineering', 'Business', 'Arts', 'Science'], num_students),
        'Scholarship_Availability': np.random.choice(['Yes', 'No'], num_students)
    })
    return students

# Generate Synthetic College Data
def generate_synthetic_colleges(num_colleges=30):
    np.random.seed(24)  # Different seed for diversity
    colleges = pd.DataFrame({
        'college_id': range(1, num_colleges + 1),
        'college_name': [f'College_{i}' for i in range(1, num_colleges + 1)],
        'Minimum_IELTS': np.random.randint(5, 7, num_colleges),
        'Minimum_GRE': np.random.randint(300, 320, num_colleges),
        'Minimum_TOEFL': np.random.randint(80, 100, num_colleges),
        'Minimum_GPA': np.round(np.random.uniform(3.0, 3.5, num_colleges), 2),
        'Available_Scholarships': np.random.choice(['Yes', 'No'], num_colleges),
        'Available_Courses': np.random.choice(['Engineering', 'Business', 'Arts', 'Science'], num_colleges),
        'Financial_Affordability_Rating': np.random.randint(1, 5, num_colleges)  # 1 (Low) to 4 (High)
    })
    return colleges

# Create Synthetic Data
students = generate_synthetic_students(num_students=200)  # Increased to 200 for better diversity
colleges = generate_synthetic_colleges(num_colleges=30)  # Increased to 30 colleges

# Display Sample Data
print("Sample Student Data:")
print(students.head())
print("\nSample College Data:")
print(colleges.head())

# 2. Data Merging and Feature Engineering

# Cross join students and colleges
students['key'] = 1
colleges['key'] = 1
student_college = pd.merge(students, colleges, on='key').drop('key', axis=1)

# Feature Engineering: Check if student meets college minimum criteria
student_college['Meets_IELTS'] = student_college['IELTS'] >= student_college['Minimum_IELTS']
student_college['Meets_GRE'] = student_college['GRE'] >= student_college['Minimum_GRE']
student_college['Meets_TOEFL'] = student_college['TOEFL'] >= student_college['Minimum_TOEFL']
student_college['Meets_GPA'] = student_college['GPA'] >= student_college['Minimum_GPA']
student_college['Meets_Scholarship'] = (
    (student_college['Scholarship_Availability'] == 'Yes') &
    (student_college['Available_Scholarships'] == 'Yes')
)
student_college['Meets_Course'] = student_college['Preferred_Courses'] == student_college['Available_Courses']


Sample Student Data:
   student_id  IELTS  GRE  TOEFL   GPA Financial_Affordability  \
0           1      7  331     88  3.76                    High   
1           2      8  338    118  3.80                  Medium   
2           3      5  331    108  3.76                    High   
3           4      7  303    105  3.14                    High   
4           5      7  329    114  2.83                     Low   

  Preferred_Courses Scholarship_Availability  
0       Engineering                      Yes  
1       Engineering                      Yes  
2       Engineering                       No  
3       Engineering                       No  
4              Arts                       No  

Sample College Data:
   college_id college_name  Minimum_IELTS  Minimum_GRE  Minimum_TOEFL  \
0           1    College_1              5          312             84   
1           2    College_2              6          301             93   
2           3    College_3              5          301     

In [6]:
# Target Variable: Acceptance (for synthetic purposes, adjusted to prevent probabilities >1)
def assign_acceptance(row):
    base_prob = 0.2  # Base acceptance probability
    if row['Meets_IELTS']:
        base_prob += 0.1
    if row['Meets_GRE']:
        base_prob += 0.1
    if row['Meets_TOEFL']:
        base_prob += 0.1
    if row['Meets_GPA']:
        base_prob += 0.2
    if row['Meets_Scholarship']:
        base_prob += 0.1
    if row['Meets_Course']:
        base_prob += 0.3
    base_prob = min(base_prob, 0.95)  # Cap the probability to prevent exceeding 1
    return np.random.choice([0,1], p=[1 - base_prob, base_prob])

student_college['Acceptance'] = student_college.apply(assign_acceptance, axis=1)

# Display Sample Merged Data
print("\nSample Merged Student-College Data:")
print(student_college.head())



Sample Merged Student-College Data:
   student_id  IELTS  GRE  TOEFL   GPA Financial_Affordability  \
0           1      7  331     88  3.76                    High   
1           1      7  331     88  3.76                    High   
2           1      7  331     88  3.76                    High   
3           1      7  331     88  3.76                    High   
4           1      7  331     88  3.76                    High   

  Preferred_Courses Scholarship_Availability  college_id college_name  ...  \
0       Engineering                      Yes           1    College_1  ...   
1       Engineering                      Yes           2    College_2  ...   
2       Engineering                      Yes           3    College_3  ...   
3       Engineering                      Yes           4    College_4  ...   
4       Engineering                      Yes           5    College_5  ...   

   Available_Scholarships  Available_Courses  Financial_Affordability_Rating  \
0                

In [7]:
# 3. Data Preprocessing

# Select Features
features = [
    'IELTS', 'GRE', 'TOEFL', 'GPA',
    'Financial_Affordability', 'Preferred_Courses',
    'Scholarship_Availability', 'Minimum_IELTS',
    'Minimum_GRE', 'Minimum_TOEFL', 'Minimum_GPA',
    'Available_Scholarships', 'Available_Courses',
    'Financial_Affordability_Rating'
]

X = student_college[features]
y = student_college['Acceptance']

# Encode Categorical Variables using One-Hot Encoding
categorical_features = [
    'Financial_Affordability', 'Preferred_Courses',
    'Scholarship_Availability', 'Available_Scholarships', 'Available_Courses'
]

X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

# Feature Scaling for Numerical Features
scaler = StandardScaler()
numerical_features = [
    'IELTS', 'GRE', 'TOEFL', 'GPA',
    'Minimum_IELTS', 'Minimum_GRE', 'Minimum_TOEFL',
    'Minimum_GPA', 'Financial_Affordability_Rating'
]
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Display Preprocessed Features
print("\nPreprocessed Features:")
print(X.head())



Preprocessed Features:
      IELTS       GRE     TOEFL      GPA  Minimum_IELTS  Minimum_GRE  \
0  0.376625  0.832089 -1.122877  1.23501      -1.658312     0.917317   
1  0.376625  0.832089 -1.122877  1.23501       0.603023    -1.048362   
2  0.376625  0.832089 -1.122877  1.23501      -1.658312    -1.048362   
3  0.376625  0.832089 -1.122877  1.23501       0.603023    -0.154872   
4  0.376625  0.832089 -1.122877  1.23501       0.603023    -1.048362   

   Minimum_TOEFL  Minimum_GPA  Financial_Affordability_Rating  \
0      -0.753228    -0.453331                        1.240347   
1       1.202267     0.124025                       -1.550434   
2      -1.622337     1.214585                       -0.620174   
3      -1.187782    -1.223138                       -1.550434   
4       0.767713    -1.479741                       -0.620174   

   Financial_Affordability_Low  Financial_Affordability_Medium  \
0                        False                           False   
1                   

In [8]:
# 4. Model Training and Evaluation

# Split the data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize the Model (Random Forest Classifier)
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Train the Model
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

# Evaluate the Model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, y_prob))


Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.30      0.36       436
           1       0.66      0.79      0.72       764

    accuracy                           0.61      1200
   macro avg       0.55      0.54      0.54      1200
weighted avg       0.58      0.61      0.59      1200

AUC-ROC Score: 0.6049221864642874


In [9]:
# 5. Recommendation Function

def recommend_colleges(student_profile, top_n=5, specific_college=None):
    """
    Recommends colleges based on the student's profile.

    Parameters:
    - student_profile (dict): Dictionary containing student's exam scores and preferences.
    - top_n (int): Number of top recommendations to return.
    - specific_college (str): If provided, filters recommendations to this college.

    Returns:
    - DataFrame with recommended colleges and acceptance probabilities.
    """
    # Convert student_profile to DataFrame
    student_df = pd.DataFrame([student_profile])

    # Cross join with colleges
    student_df['key'] = 1
    colleges_filtered = colleges.copy()

    if specific_college:
        colleges_filtered = colleges_filtered[colleges_filtered['college_name'] == specific_college]

    student_college_input = pd.merge(student_df, colleges_filtered, on='key').drop('key', axis=1)

    # Feature Engineering: Check if student meets college minimum criteria
    student_college_input['Meets_IELTS'] = student_college_input['IELTS'] >= student_college_input['Minimum_IELTS']
    student_college_input['Meets_GRE'] = student_college_input['GRE'] >= student_college_input['Minimum_GRE']
    student_college_input['Meets_TOEFL'] = student_college_input['TOEFL'] >= student_college_input['Minimum_TOEFL']
    student_college_input['Meets_GPA'] = student_college_input['GPA'] >= student_college_input['Minimum_GPA']
    student_college_input['Meets_Scholarship'] = (
        (student_college_input['Scholarship_Availability'] == 'Yes') &
        (student_college_input['Available_Scholarships'] == 'Yes')
    )
    student_college_input['Meets_Course'] = student_college_input['Preferred_Courses'] == student_college_input['Available_Courses']

    # Select Features for Prediction
    input_features = [
        'IELTS', 'GRE', 'TOEFL', 'GPA',
        'Financial_Affordability', 'Preferred_Courses',
        'Scholarship_Availability', 'Minimum_IELTS',
        'Minimum_GRE', 'Minimum_TOEFL', 'Minimum_GPA',
        'Available_Scholarships', 'Available_Courses',
        'Financial_Affordability_Rating'
    ]

    X_input = student_college_input[input_features]

    # Encode Categorical Variables using One-Hot Encoding
    X_input = pd.get_dummies(X_input, columns=categorical_features, drop_first=True)

    # Align the input features with the training features
    X_input = X_input.reindex(columns=X.columns, fill_value=0)

    # Feature Scaling for Numerical Features
    X_input[numerical_features] = scaler.transform(X_input[numerical_features])

    # Predict Acceptance Probabilities
    probabilities = model.predict_proba(X_input)[:,1]
    student_college_input['Acceptance_Probability'] = probabilities

    # Sort and Select Top N Recommendations
    recommendations = student_college_input.sort_values(by='Acceptance_Probability', ascending=False).head(top_n)

    # Select Relevant Columns
    recommendations = recommendations[['college_name', 'Acceptance_Probability']]

    return recommendations.reset_index(drop=True)


In [11]:
# 6. Demonstration of the Recommendation Function

# Example Student Profile
example_student = {
    'IELTS': 9,
    'GRE': 350,
    'TOEFL': 120,
    'GPA': 8.5,
    'Financial_Affordability': 'Medium',
    'Preferred_Courses': 'Engineering',
    'Scholarship_Availability': 'Yes'
}

# Get Top 5 Recommendations
top_recommendations = recommend_colleges(example_student, top_n=5)
print("\nTop 5 College Recommendations:")
print(top_recommendations)

# Get Recommendations for a Specific College
specific_college_recommendation = recommend_colleges(example_student, specific_college='College_10')
print("\nRecommendation for College_10:")
print(specific_college_recommendation)



Top 5 College Recommendations:
  college_name  Acceptance_Probability
0   College_26                    0.93
1   College_20                    0.93
2   College_10                    0.89
3   College_25                    0.87
4   College_23                    0.87

Recommendation for College_10:
  college_name  Acceptance_Probability
0   College_10                    0.88


In [12]:
# 7. Additional Functionality: Financial and Educational Acceptance Probabilities

def recommend_colleges_detailed(student_profile, top_n=5, specific_college=None):
    """
    Recommends colleges with detailed probabilities based on the student's profile.

    Parameters:
    - student_profile (dict): Dictionary containing student's exam scores and preferences.
    - top_n (int): Number of top recommendations to return.
    - specific_college (str): If provided, filters recommendations to this college.

    Returns:
    - DataFrame with recommended colleges and detailed probabilities.
    """
    # Convert student_profile to DataFrame
    student_df = pd.DataFrame([student_profile])

    # Cross join with colleges
    student_df['key'] = 1
    colleges_filtered = colleges.copy()

    if specific_college:
        colleges_filtered = colleges_filtered[colleges_filtered['college_name'] == specific_college]

    student_college_input = pd.merge(student_df, colleges_filtered, on='key').drop('key', axis=1)

    # Feature Engineering: Check if student meets college minimum criteria
    student_college_input['Meets_IELTS'] = student_college_input['IELTS'] >= student_college_input['Minimum_IELTS']
    student_college_input['Meets_GRE'] = student_college_input['GRE'] >= student_college_input['Minimum_GRE']
    student_college_input['Meets_TOEFL'] = student_college_input['TOEFL'] >= student_college_input['Minimum_TOEFL']
    student_college_input['Meets_GPA'] = student_college_input['GPA'] >= student_college_input['Minimum_GPA']
    student_college_input['Meets_Scholarship'] = (
        (student_college_input['Scholarship_Availability'] == 'Yes') &
        (student_college_input['Available_Scholarships'] == 'Yes')
    )
    student_college_input['Meets_Course'] = student_college_input['Preferred_Courses'] == student_college_input['Available_Courses']

    # Select Features for Prediction
    input_features = [
        'IELTS', 'GRE', 'TOEFL', 'GPA',
        'Financial_Affordability', 'Preferred_Courses',
        'Scholarship_Availability', 'Minimum_IELTS',
        'Minimum_GRE', 'Minimum_TOEFL', 'Minimum_GPA',
        'Available_Scholarships', 'Available_Courses',
        'Financial_Affordability_Rating'
    ]

    X_input = student_college_input[input_features]

    # Encode Categorical Variables using One-Hot Encoding
    X_input = pd.get_dummies(X_input, columns=categorical_features, drop_first=True)

    # Align the input features with the training features
    X_input = X_input.reindex(columns=X.columns, fill_value=0)

    # Feature Scaling for Numerical Features
    X_input[numerical_features] = scaler.transform(X_input[numerical_features])

    # Predict Acceptance Probabilities
    probabilities = model.predict_proba(X_input)[:,1]
    student_college_input['Acceptance_Probability'] = probabilities

    # For demonstration, let's assume Financial Probability is based on Financial_Affordability_Rating
    # and Educational Probability is based on Acceptance_Probability
    # These can be further refined with more detailed data
    student_college_input['Financial_Probability'] = student_college_input['Financial_Affordability_Rating'] / 4  # Normalize to [0,1]
    student_college_input['Educational_Probability'] = student_college_input['Acceptance_Probability']
    student_college_input['Overall_Probability'] = (student_college_input['Financial_Probability'] +
                                                   student_college_input['Educational_Probability']) / 2

    # Sort and Select Top N Recommendations based on Overall Probability
    recommendations = student_college_input.sort_values(by='Overall_Probability', ascending=False).head(top_n)

    # Select Relevant Columns
    recommendations = recommendations[['college_name', 'Financial_Probability',
                                       'Educational_Probability', 'Overall_Probability']]

    return recommendations.reset_index(drop=True)

# Get Detailed Top 5 Recommendations
detailed_recommendations = recommend_colleges_detailed(example_student, top_n=5)
print("\nTop 5 College Recommendations with Detailed Probabilities:")
print(detailed_recommendations)

# Get Detailed Recommendations for a Specific College
detailed_specific_recommendation = recommend_colleges_detailed(example_student, specific_college='College_10')
print("\nDetailed Recommendation for College_10:")
print(detailed_specific_recommendation)


Top 5 College Recommendations with Detailed Probabilities:
  college_name  Financial_Probability  Educational_Probability  \
0   College_21                    1.0                     0.85   
1   College_18                    1.0                     0.83   
2    College_1                    1.0                     0.80   
3    College_8                    1.0                     0.80   
4   College_28                    1.0                     0.71   

   Overall_Probability  
0                0.925  
1                0.915  
2                0.900  
3                0.900  
4                0.855  

Detailed Recommendation for College_10:
  college_name  Financial_Probability  Educational_Probability  \
0   College_10                    0.5                     0.88   

   Overall_Probability  
0                 0.69  
