#server


In [None]:
!pip install pyngrok flask_cors

Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Collecting flask_cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Downloading flask_cors-6.0.1-py3-none-any.whl (13 kB)
Installing collected packages: pyngrok, flask_cors
Successfully installed flask_cors-6.0.1 pyngrok-7.3.0


In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib

# -------------------------------------------------
# 1. Define Districts, Streams, Government Univ., and Course Mappings
# -------------------------------------------------

# All 25 Sri Lankan districts. Used below as the pool from which each synthetic
# applicant's "District" value is drawn at random.
districts = [
    "Colombo", "Gampaha", "Kalutara", "Kandy", "Matale", "Nuwara Eliya",
    "Galle", "Matara", "Hambantota", "Jaffna", "Kilinochchi", "Mannar",
    "Vavuniya", "Mullaitivu", "Batticaloa", "Ampara", "Trincomalee",
    "Kurunegala", "Puttalam", "Anuradhapura", "Polonnaruwa", "Badulla",
    "Monaragala", "Ratnapura", "Kegalle"
]

# A-Level streams. These exact strings are also the inner keys of
# gov_univ_course_mapping, so the two must be kept in sync.
streams = ["Physical Science", "Biological Science", "Commerce", "Arts", "Technology"]

# Government universities covered by this notebook (7 entries). The names match
# the top-level keys of gov_univ_course_mapping and the values returned by
# assign_gov_university.
gov_universities = [
    "University of Colombo",
    "University of Peradeniya",
    "University of Moratuwa",
    "University of Sri Jayewardenepura",
    "University of Kelaniya",
    "University of Ruhuna",
    "University of Jaffna"
]

# Course catalogue for the synthetic data: university name -> A-Level stream ->
# list of candidate degree courses (two per stream). When the dataset is built,
# one course is drawn at random from the list matching the applicant's
# university and stream.
# NOTE(review): the entries look like placeholder data (e.g. "BSc in Computer
# Science" is listed under "Biological Science") — confirm against real intake
# rules before treating any recommendation as authoritative.
gov_univ_course_mapping = {
    "University of Colombo": {
        "Physical Science": ["BSc in Software Engineering", "BSc in Mechanical Engineering"],
        "Biological Science": ["BSc in Computer Science", "BSc in Environmental Science"],
        "Commerce": ["BSc in Business Analytics", "BSc in Finance"],
        "Arts": ["BA in Economics", "BA in History"],
        "Technology": ["BSc in Information Technology", "BSc in Computer Networking"]
    },
    "University of Peradeniya": {
        "Physical Science": ["BSc in Electrical Engineering", "BSc in Civil Engineering"],
        "Biological Science": ["BSc in Biotechnology", "BSc in Life Sciences"],
        "Commerce": ["BSc in Accounting", "BSc in Management"],
        "Arts": ["BA in Literature", "BA in Political Science"],
        "Technology": ["BSc in Chemical Engineering", "BSc in Environmental Engineering"]
    },
    "University of Moratuwa": {
        "Physical Science": ["BSc in Computer Engineering", "BSc in Industrial Engineering"],
        "Biological Science": ["BSc in Bioengineering", "BSc in Medical Imaging"],
        "Commerce": ["BSc in Marketing", "BSc in MIS"],
        "Arts": ["BA in Architecture", "BA in Design"],
        "Technology": ["BSc in Information Technology", "BSc in Software Engineering"]
    },
    "University of Sri Jayewardenepura": {
        "Physical Science": ["BSc in Chemistry", "BSc in Physics"],
        "Biological Science": ["BSc in Biology", "BSc in Biochemistry"],
        "Commerce": ["BSc in Economics", "BSc in Business Studies"],
        "Arts": ["BA in Sociology", "BA in Psychology"],
        "Technology": ["BSc in Information Systems", "BSc in Computer Science"]
    },
    "University of Kelaniya": {
        "Physical Science": ["BSc in Physics", "BSc in Mathematics"],
        "Biological Science": ["BSc in Environmental Studies", "BSc in Agricultural Science"],
        "Commerce": ["BSc in Tourism Management", "BSc in Hospitality Management"],
        "Arts": ["BA in Fine Arts", "BA in Performing Arts"],
        "Technology": ["BSc in Software Engineering", "BSc in Information Technology"]
    },
    "University of Ruhuna": {
        "Physical Science": ["BSc in Marine Engineering", "BSc in Geoscience"],
        "Biological Science": ["BSc in Fisheries Science", "BSc in Environmental Management"],
        "Commerce": ["BSc in Business Administration", "BSc in Logistics"],
        "Arts": ["BA in Cultural Studies", "BA in Mass Communication"],
        "Technology": ["BSc in Computer Science", "BSc in Electronics Engineering"]
    },
    "University of Jaffna": {
        "Physical Science": ["BSc in Civil Engineering", "BSc in Architecture"],
        "Biological Science": ["BSc in Biotechnology", "BSc in Zoology"],
        "Commerce": ["BSc in Economics", "BSc in Accountancy"],
        "Arts": ["BA in Tamil Studies", "BA in History"],
        "Technology": ["BSc in Information Technology", "BSc in Software Engineering"]
    }
}

# Deterministic assignment of government university based on Z_score
def assign_gov_university(z):
    """Return the government university assigned to Z-score ``z``.

    The cut-offs are checked from highest to lowest and each comparison is
    strict, so a score sitting exactly on a cut-off falls into the next
    lower band. Anything at or below 0.8 (including NaN, which fails every
    comparison) maps to the University of Jaffna.
    """
    cutoffs = (
        (2.7, "University of Moratuwa"),
        (2.3, "University of Colombo"),
        (1.9, "University of Peradeniya"),
        (1.5, "University of Sri Jayewardenepura"),
        (1.1, "University of Kelaniya"),
        (0.8, "University of Ruhuna"),
    )
    for lower_bound, university in cutoffs:
        if z > lower_bound:
            return university
    return "University of Jaffna"

# -------------------------------------------------
# 2. Generate the Synthetic Dataset for Government Univ. (1000 Rows)
# -------------------------------------------------
# Number of synthetic applicants to generate.
num_rows = 1000
# Seed both RNGs so the dataset is reproducible: NumPy drives the Z-scores,
# the stdlib `random` module drives every categorical draw below.
np.random.seed(42)
random.seed(42)

# Z-scores drawn uniformly from [0.5, 3.0) and rounded to two decimals.
z_scores = np.round(np.random.uniform(0.5, 3.0, num_rows), 2)
# NOTE: the order of the random.choice passes (districts, then streams, then
# courses) fixes which values each row receives under the seed above;
# reordering them changes the generated dataset.
district_choices = [random.choice(districts) for _ in range(num_rows)]
stream_choices = [random.choice(streams) for _ in range(num_rows)]

# The university depends on the Z-score only; the district plays no part in it.
assigned_gov_universities = [assign_gov_university(z) for z in z_scores]
# The course is a random pick among the options for (university, stream), so
# it is not determined by the row's other columns.
assigned_gov_courses = [
    random.choice(gov_univ_course_mapping[uni][stream])
    for uni, stream in zip(assigned_gov_universities, stream_choices)
]

# Column name -> values; these column names are what later cells read back
# from the CSV.
gov_data = {
    "Z_score": z_scores,
    "District": district_choices,
    "Stream": stream_choices,
    "Selected_University": assigned_gov_universities,
    "Course": assigned_gov_courses
}
gov_df = pd.DataFrame(gov_data)

# Save government synthetic dataset to CSV (written before the training-only
# columns are added to gov_df further down, so the file holds the five raw
# columns only).
gov_dataset_filename = "government_universities.csv"
gov_df.to_csv(gov_dataset_filename, index=False)
print(f"Government dataset saved to {gov_dataset_filename}")
print("Sample of Government Synthetic Dataset:")
print(gov_df.head(10))

# Composite label "<university>|<course>" so a single classifier predicts both.
gov_df["Target"] = gov_df["Selected_University"] + "|" + gov_df["Course"]

# One LabelEncoder per categorical column; all three are persisted with the
# model so inference can reproduce the same integer codes.
le_district, le_stream, le_target = (LabelEncoder() for _ in range(3))
gov_df["District_Enc"] = le_district.fit_transform(gov_df["District"])
gov_df["Stream_Enc"] = le_stream.fit_transform(gov_df["Stream"])
gov_df["Target_Enc"] = le_target.fit_transform(gov_df["Target"])

# Feature matrix / label vector, then an 80/20 hold-out split.
feature_columns = ["Z_score", "District_Enc", "Stream_Enc"]
X, y = gov_df[feature_columns], gov_df["Target_Enc"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the forest and report hold-out accuracy.
clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(f"\nGovernment Model accuracy on test set: {accuracy:.2f}")

# Persist the model together with its encoders in a single .pkl bundle.
gov_model_filename = "university_selection_model.pkl"
model_bundle = {
    "model": clf,
    "le_district": le_district,
    "le_stream": le_stream,
    "le_target": le_target,
}
joblib.dump(model_bundle, gov_model_filename)
print(f"\nGovernment model saved to {gov_model_filename}")

# -------------------------------------------------
# 3. Example Inference Using the Saved Government Model
# -------------------------------------------------
# Sample applicant used to smoke-test the freshly trained model.
test_z, test_district, test_stream = 2.3, "Kandy", "Physical Science"

# Encode the categorical inputs with the fitted encoders; the column names
# must match the ones the classifier was trained on.
encoded_district = le_district.transform([test_district])
encoded_stream = le_stream.transform([test_stream])
X_new = pd.DataFrame({
    "Z_score": [test_z],
    "District_Enc": encoded_district,
    "Stream_Enc": encoded_stream,
})

# Predict the composite class id, decode it, then split "<university>|<course>".
pred_enc = clf.predict(X_new)[0]
pred_composite = le_target.inverse_transform([pred_enc])[0]
gov_univ_pred, gov_course_pred = pred_composite.split("|")
print(
    f"\n[Government] Predicted University: {gov_univ_pred}",
    f"[Government] Predicted Course: {gov_course_pred}",
    sep="\n",
)




Government dataset saved to government_universities.csv
Sample of Government Synthetic Dataset:
   Z_score     District              Stream       Selected_University  \
0     1.44  Polonnaruwa  Biological Science    University of Kelaniya   
1     2.88        Kandy            Commerce    University of Moratuwa   
2     2.33      Colombo    Physical Science     University of Colombo   
3     2.00    Ratnapura          Technology  University of Peradeniya   
4     0.89   Hambantota  Biological Science      University of Ruhuna   
5     0.89       Matara            Commerce      University of Ruhuna   
6     0.65       Matara    Physical Science      University of Jaffna   
7     2.67       Matale    Physical Science     University of Colombo   
8     2.00    Ratnapura          Technology  University of Peradeniya   
9     2.27        Kandy            Commerce  University of Peradeniya   

                            Course  
0     BSc in Environmental Studies  
1                       BS

In [3]:
import pandas as pd
import numpy as np
import random

import joblib

# -------------------------------------------------
# 1. Load Data & Models
# -------------------------------------------------
# Restore the trained classifier and its three label encoders from the bundle.
# joblib.load unpickles arbitrary objects, so only load files you produced.
gov_model_data = joblib.load("university_selection_model.pkl")
gov_model, le_district, le_stream, le_target = (
    gov_model_data[key]
    for key in ("model", "le_district", "le_stream", "le_target")
)

# Career dataset used for skill matching; blank out missing text and force str
# so the columns can be fed straight to the sentence encoder.
career_df = pd.read_csv("Career Dataset.csv")
career_df[["Career", "Skill"]] = career_df[["Career", "Skill"]].fillna("").astype(str)

# Private universities, matched on their "Relevant_Field" text.
private_unis = pd.read_csv("private_universities.csv")
private_unis["Relevant_Field"] = private_unis["Relevant_Field"].fillna("").astype(str)

# Government universities (the synthetic CSV), matched on their "Stream" text.
gov_unis_df = pd.read_csv("government_universities.csv")
gov_unis_df["Stream"] = gov_unis_df["Stream"].fillna("").astype(str)

In [7]:
import pandas as pd
import numpy as np
import random
from flask import Flask, request, jsonify
from flask_cors import CORS
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model stored in the local ./model directory.
reloaded = SentenceTransformer("./model")
# One throw-away encode to confirm the model loads and runs.
test = reloaded.encode("I want to study computer science")

# Corpus embeddings are computed once at startup and reused by every request.
skill_texts = career_df["Skill"].tolist()
skill_embeddings = reloaded.encode(skill_texts, convert_to_tensor=True)

private_fields = private_unis["Relevant_Field"].tolist()
private_field_embeddings = reloaded.encode(private_fields, convert_to_tensor=True)

# ===============================
# 2. Create Flask App (CORS enabled for all routes)
# ===============================
app = Flask(__name__)
CORS(app)

@app.route('/')
def index():
    """Liveness endpoint: respond with a fixed plain-text banner."""
    banner = "Flask with pyngrok is running!"
    return banner

def _private_rec(row, score):
    """Build the JSON record for one private-university row and its score."""
    return {
        "University": row["University"],
        "Degree": row["Degree"],
        "Relevant_Field": row["Relevant_Field"],
        "Link": row["Link"],
        "Similarity_Score": round(float(score), 2),
    }


def _gov_rec(row, similarity=None):
    """Build the JSON record for one government-university row.

    With ``similarity=None`` the record is marked ``"Fallback": True``;
    otherwise it carries the rounded ``"Similarity_Score"``.
    """
    rec = {
        "University": row["Selected_University"],
        "Degree": row["Course"],
        # Always cast so the value is a plain float for JSON serialisation.
        "Z_score_Program": float(row["Z_score"]),
        "Stream": row["Stream"],
    }
    if similarity is None:
        rec["Fallback"] = True
    else:
        rec["Similarity_Score"] = round(float(similarity), 2)
    return rec


def _government_recs(raw_z, query_embedding):
    """Return the ``government_universities`` list for one request.

    ``raw_z`` is the client-supplied ``z_score`` (may be missing or
    malformed); ``query_embedding`` is the encoded skill text. Problems with
    the Z-score are reported as a single ``{"error": ...}`` entry instead of
    raising, so the rest of the response is still delivered.
    """
    if raw_z is None:
        return [{"error": "No z_score provided."}]
    try:
        user_z = float(raw_z)
    except (TypeError, ValueError):
        # Previously an unparsable z_score escaped as an HTTP 500.
        return [{"error": "Invalid z_score provided."}]

    # Keep only rows whose Z-score does not exceed the applicant's.
    filtered_gov = gov_unis_df[gov_unis_df["Z_score"] <= user_z]
    if filtered_gov.empty:
        return [_gov_rec(gov_unis_df.iloc[0])]

    # Encode each distinct stream once and fan the vectors back out to rows,
    # instead of encoding the same few strings once per row on every request.
    codes, unique_streams = pd.factorize(filtered_gov["Stream"])
    stream_embeddings = reloaded.encode(list(unique_streams), convert_to_tensor=True)
    gov_field_embeddings = stream_embeddings[codes.tolist()]

    gov_results = util.semantic_search(
        query_embedding, gov_field_embeddings, top_k=len(filtered_gov)
    )[0]
    if not gov_results:
        return [_gov_rec(filtered_gov.iloc[0])]
    # NOTE(review): rows sharing a stream have identical scores, so which of
    # them land in the top three is effectively arbitrary.
    return [
        _gov_rec(filtered_gov.iloc[res["corpus_id"]], similarity=res["score"])
        for res in gov_results[:3]
    ]


@app.route('/recommend', methods=['POST'])
def recommend():
    """Recommend a career and universities for a free-text skill description.

    Request JSON:
        skill_input (str, required): the user's skills / interests.
        gov_interest (optional): "yes", "y" or "true" (case-insensitive) to
            include government universities.
        z_score (optional): the applicant's Z-score; used only when
            gov_interest is set.

    Returns:
        200 with ``matched_skill_text``, ``matched_career``,
        ``matched_score``, ``private_universities`` (up to three) and,
        when requested, ``government_universities``;
        400 with ``{"error": ...}`` when ``skill_input`` is missing, blank,
        or not a string (including bodies that are not a JSON object).
    """
    # silent=True yields None for a missing or malformed JSON body, so the
    # request ends in the 400 below rather than a framework error page.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    raw_skill = data.get("skill_input", "")
    user_skill_input = raw_skill.strip() if isinstance(raw_skill, str) else ""
    if not user_skill_input:
        return jsonify({"error": "No skill_input provided"}), 400

    # Semantic search of the query against the precomputed career skills.
    query_embedding = reloaded.encode(user_skill_input, convert_to_tensor=True)
    search_results = util.semantic_search(query_embedding, skill_embeddings, top_k=5)[0]
    if search_results:
        top_match = search_results[0]
        career_row = career_df.iloc[top_match["corpus_id"]]
        matched_score = top_match["score"]
    else:
        # No hits: fall back to the first career row with a zero score.
        career_row = career_df.iloc[0]
        matched_score = 0.0

    response = {
        "matched_skill_text": career_row["Skill"],
        "matched_career": career_row["Career"],
        "matched_score": round(float(matched_score), 2),
    }

    # ================
    # Private Recs
    # ================
    # Reuse the module-level private_field_embeddings computed at startup
    # rather than re-encoding the whole table on every request.
    private_results = util.semantic_search(
        query_embedding, private_field_embeddings, top_k=len(private_unis)
    )[0]
    if private_results:
        response["private_universities"] = [
            _private_rec(private_unis.iloc[res["corpus_id"]], res["score"])
            for res in private_results[:3]
        ]
    else:
        response["private_universities"] = [_private_rec(private_unis.iloc[0], 0.0)]

    # ================
    # Government Recs
    # ================
    # str() guards against non-string JSON values (true, null, numbers).
    gov_interest = str(data.get("gov_interest", "no")).strip().lower()
    if gov_interest in ("yes", "y", "true"):
        response["government_universities"] = _government_recs(
            data.get("z_score"), query_embedding
        )

    return jsonify(response), 200



# NOTE: pyngrok is installed at the top of this notebook, but no tunnel is
# opened in the code visible here, so the server is reachable on localhost
# only. Set the ngrok auth token and open a tunnel before this point if
# public access is needed.

# Run the app on port 5000. The guard keeps the blocking development server
# from starting when this file is imported rather than executed (in a notebook
# or as a script, __name__ is "__main__", so behaviour there is unchanged).
if __name__ == "__main__":
    app.run(port=5000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [08/Sep/2025 15:40:38] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Sep/2025 15:40:38] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [08/Sep/2025 15:41:56] "OPTIONS /recommend HTTP/1.1" 200 -
127.0.0.1 - - [08/Sep/2025 15:41:57] "POST /recommend HTTP/1.1" 200 -
