# ***Load file from google drive***

In [1]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Path to your CSV file
file_path = '/content/drive/My Drive/slt_cv/all_correct.csv'



Mounted at /content/drive


# ***Put data from csv to Dataframe***

In [2]:
# Load the dataset
dataset = pd.read_csv(file_path)

# Display the first few rows
print(dataset.head())



   OL_Passes  OL_Credits OL_Math OL_English OL_Sinhala_Tamil  AL_Passes  \
0          9           7       S          S                C          0   
1          6           2       A          S                F          0   
2          8           3       S          S                A          0   
3          7           3       A          F                S          3   
4          7           4       C          F                F          2   

   fail/pass  
0          0  
1          0  
2          0  
3          0  
4          0  


# ***Check class imbalance***

In [3]:
# Check class balance
print(dataset['fail/pass'].value_counts(normalize=True) * 100)

fail/pass
0    50.0
1    50.0
Name: proportion, dtype: float64


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# ***Convert Grade letter to integers for the model to understand***

In [5]:
# Encoding categorical grades
grade_mapping = {'A': 4, 'B': 3, 'C': 2, 'S': 1, 'F': 0}
dataset['OL_Math'] = dataset['OL_Math'].map(grade_mapping)
dataset['OL_English'] = dataset['OL_English'].map(grade_mapping)
dataset['OL_Sinhala_Tamil'] = dataset['OL_Sinhala_Tamil'].map(grade_mapping)


# ***prepare data to feed to the model***

In [6]:
# Splitting data
X = dataset.drop(columns=['fail/pass'])
y = dataset['fail/pass']

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# ***Train model and get testing accuracy***

In [7]:
# Model training
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       1.00      1.00      1.00        47

    accuracy                           1.00        98
   macro avg       1.00      1.00      1.00        98
weighted avg       1.00      1.00      1.00        98



# ***Manul testing for a few records***

In [8]:
# Sample manual test data
test_data = pd.DataFrame({
    "OL_Passes": [6, 5, 8, 7, 6],
    "OL_Credits": [3, 2, 4, 3, 2],
    "OL_Math": ["A", "F", "B", "C", "S"],
    "OL_English": ["B", "S", "A", "C", "F"],
    "OL_Sinhala_Tamil": ["C", "S", "A", "B", "F"],
    "AL_Passes": [3, 2, 3, 1, 3],
    "fail/pass": [1, 0, 1, 0, 0]  # Ground truth for testing
})

# Grade Mapping
grade_mapping = {'A': 4, 'B': 3, 'C': 2, 'S': 1, 'F': 0}
test_data['OL_Math'] = test_data['OL_Math'].map(grade_mapping)
test_data['OL_English'] = test_data['OL_English'].map(grade_mapping)
test_data['OL_Sinhala_Tamil'] = test_data['OL_Sinhala_Tamil'].map(grade_mapping)

# Drop the ground truth column temporarily for predictions
X_test = test_data.drop(columns=["fail/pass"])

# Display data
print(X_test)

   OL_Passes  OL_Credits  OL_Math  OL_English  OL_Sinhala_Tamil  AL_Passes
0          6           3        4           3                 2          3
1          5           2        0           1                 1          2
2          8           4        3           4                 4          3
3          7           3        2           2                 3          1
4          6           2        1           0                 0          3


# ***result was 4/5 was correctn***

In [9]:
# Make predictions
predictions = rf_model.predict(X_test)

# Add predictions to the test data
test_data['predicted_fail/pass'] = predictions

# Compare predictions with ground truth
print(test_data)

   OL_Passes  OL_Credits  OL_Math  OL_English  OL_Sinhala_Tamil  AL_Passes  \
0          6           3        4           3                 2          3   
1          5           2        0           1                 1          2   
2          8           4        3           4                 4          3   
3          7           3        2           2                 3          1   
4          6           2        1           0                 0          3   

   fail/pass  predicted_fail/pass  
0          1                    0  
1          0                    0  
2          1                    1  
3          0                    0  
4          0                    0  


# ***Generate 100 records within our TTO Prmpt for testing with labels***

In [17]:
import numpy as np
from sklearn.metrics import accuracy_score

In [10]:

# Function to generate synthetic data for manual testing
def generate_manual_testing_data(num_records=100):
    data = []
    for _ in range(num_records):
        ol_passes = np.random.randint(4, 10)  # Between 4 to 9
        ol_credits = np.random.randint(0, ol_passes + 1)
        ol_math = np.random.choice([0, 1, 2, 3, 4])  # F (0) to A (4)
        ol_english = np.random.choice([0, 1, 2, 3, 4])  # F (0) to A (4)
        ol_sinhala_tamil = np.random.choice([0, 1, 2, 3, 4])  # F (0) to A (4)
        al_passes = np.random.randint(0, 5)  # Between 0 to 4 passes
        age = np.random.randint(18, 25)  # Random ages between 18 and 24

        # Check pass/fail eligibility based on your criteria
        is_eligible = (
            ol_passes >= 6 and
            ol_credits >= 3 and
            ol_math >= 2 and  # C or better
            ol_english >= 2 and  # C or better
            ol_sinhala_tamil >= 2 and  # C or better
            al_passes >= 3 and
            age > 20
        )

        data.append([ol_passes, ol_credits, ol_math, ol_english, ol_sinhala_tamil, al_passes, 1 if is_eligible else 0])

    columns = ["OL_Passes", "OL_Credits", "OL_Math", "OL_English", "OL_Sinhala_Tamil", "AL_Passes", "fail/pass"]
    return pd.DataFrame(data, columns=columns)



In [13]:
# Generate 100 records
manual_test_data = generate_manual_testing_data(100)

# Prepare X_test and y_test
X_test = manual_test_data.drop(columns=["fail/pass"])
y_test = manual_test_data["fail/pass"]

# ***Train and get Accuracy***

In [18]:
# Make predictions
predictions = rf_model.predict(X_test)

# Add predictions to the manual test data
manual_test_data["predicted_fail/pass"] = predictions

# Calculate and display accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Manual Testing Accuracy: {accuracy * 100:.2f}%")

# Display the manual test data for inspection
print(manual_test_data.head())

Manual Testing Accuracy: 98.00%
   OL_Passes  OL_Credits  OL_Math  OL_English  OL_Sinhala_Tamil  AL_Passes  \
0          5           1        3           0                 3          3   
1          8           7        1           2                 2          1   
2          4           4        0           4                 2          1   
3          4           2        1           0                 4          0   
4          9           0        2           4                 1          1   

   fail/pass  predicted_fail/pass  
0          0                    0  
1          0                    0  
2          0                    0  
3          0                    0  
4          0                    0  


# ***TESTING THE ACTUAL ACCURACY***

In [15]:
import requests
import pandas as pd

# Define the URL
url =

# Fetch data from the server
response = requests.get(url)
if response.status_code == 200:
    records = response.json()  # Assuming JSON response
    print("Data fetched successfully!")
else:
    print("Failed to fetch data:", response.status_code)

# Convert to DataFrame for easier manipulation
df = pd.DataFrame(records)

# Display the first few records to verify
print(df.head())

Data fetched successfully!
                        _id  ApplicationID   ApplicationCode   IntakeCode  \
0  6724a28c1714fb5e223f0237              2  TTO/2024/10/0002  TTO/2024/10   
1  6724a28c1714fb5e223f0238              3  TTO/2024/10/0003  TTO/2024/10   
2  6724a28c1714fb5e223f0239             10  TTO/2024/10/0010  TTO/2024/10   
3  6724a28c1714fb5e223f023a             11  TTO/2024/10/0011  TTO/2024/10   
4  6724a28c1714fb5e223f023b             12  TTO/2024/10/0012  TTO/2024/10   

  Salutation Initials Surname FullName   NIC                     DOB Overage  \
0       None     None    None     None  None   7/27/1994 12:00:00 AM    None   
1       None     None    None     None  None  10/27/1998 12:00:00 AM    None   
2       None     None    None     None  None   8/18/1994 12:00:00 AM    None   
3       None     None    None     None  None   11/3/1999 12:00:00 AM    None   
4       None     None    None     None  None   12/4/1995 12:00:00 AM    None   

  AgeYears AgeMonths AgeDays 

In [16]:
df.head()

Unnamed: 0,_id,ApplicationID,ApplicationCode,IntakeCode,Salutation,Initials,Surname,FullName,NIC,DOB,Overage,AgeYears,AgeMonths,AgeDays,AppliedDate,ol_sittings,al_sittings
0,6724a28c1714fb5e223f0237,2,TTO/2024/10/0002,TTO/2024/10,,,,,,7/27/1994 12:00:00 AM,,,,,10/3/2024 11:14:09 PM,"[{'sitting_number': 1, 'results': {'Buddhism':...","[{'sitting_number': 1, 'results': {'Chemistry'..."
1,6724a28c1714fb5e223f0238,3,TTO/2024/10/0003,TTO/2024/10,,,,,,10/27/1998 12:00:00 AM,,,,,10/3/2024 11:57:29 PM,"[{'sitting_number': 1, 'results': {'Art': 'A',...","[{'sitting_number': 1, 'results': {'Chemistry'..."
2,6724a28c1714fb5e223f0239,10,TTO/2024/10/0010,TTO/2024/10,,,,,,8/18/1994 12:00:00 AM,,,,,10/4/2024 9:57:45 AM,"[{'sitting_number': 1, 'results': {'Buddhism':...","[{'sitting_number': 1, 'results': {'Chemistry'..."
3,6724a28c1714fb5e223f023a,11,TTO/2024/10/0011,TTO/2024/10,,,,,,11/3/1999 12:00:00 AM,,,,,10/4/2024 10:13:39 AM,"[{'sitting_number': 1, 'results': {'Business &...","[{'sitting_number': 1, 'results': {'Combine Ma..."
4,6724a28c1714fb5e223f023b,12,TTO/2024/10/0012,TTO/2024/10,,,,,,12/4/1995 12:00:00 AM,,,,,10/4/2024 10:19:50 AM,"[{'sitting_number': 1, 'results': {'Buddhism':...","[{'sitting_number': 1, 'results': {'Chemistry'..."


# ***Transforming the dataframe to our trained model's format***

In [31]:
import pandas as pd

# Helper function to count passes and credits
def count_passes_and_credits(results):
    grade_to_points = {"A": 4, "B": 3, "C": 2, "S": 1, "F": 0}
    passes = sum(grade_to_points.get(grade, 0) > 0 for grade in results.values())
    credits = sum(grade_to_points.get(grade, 0) >= 2 for grade in results.values())  # Credit = C or higher
    return passes, credits

# Function to extract the required data
def transform_records(df):
    records = []

    for _, row in df.iterrows():
        applicant_id = row["_id"]
        dob = row["DOB"]
        age = 2024 - int(dob.split("/")[2].split(" ")[0])  # Calculate approximate age

        # Process O-Level sittings
        for ol_sitting in row["ol_sittings"]:
            ol_results = ol_sitting["results"]
            ol_passes, ol_credits = count_passes_and_credits(ol_results)

            ol_math = {"A": 4, "B": 3, "C": 2, "S": 1, "F": 0}.get(ol_results.get("Mathematics", "F"), 0)
            ol_english = {"A": 4, "B": 3, "C": 2, "S": 1, "F": 0}.get(ol_results.get("English", "F"), 0)
            ol_sinhala_tamil = {"A": 4, "B": 3, "C": 2, "S": 1, "F": 0}.get(ol_results.get("Sinhala/Tamil", "F"), 0)

            # Process A-Level sittings
            for al_sitting in row["al_sittings"]:
                al_results = al_sitting["results"]
                al_passes, _ = count_passes_and_credits(al_results)

                # Append transformed record
                records.append({
                    "ApplicantID": applicant_id,
                    "Age": age,
                    "OL_Passes": ol_passes,
                    "OL_Credits": ol_credits,
                    "OL_Math": ol_math,
                    "OL_English": ol_english,
                    "OL_Sinhala_Tamil": ol_sinhala_tamil,
                    "AL_Passes": al_passes
                })

    return pd.DataFrame(records)

# Transform the records
transformed_data = transform_records(df)

# Display the transformed records
print(transformed_data.head())

                ApplicantID  Age  OL_Passes  OL_Credits  OL_Math  OL_English  \
0  6724a28c1714fb5e223f0237   30          9           9        4           4   
1  6724a28c1714fb5e223f0238   26          9           9        4           2   
2  6724a28c1714fb5e223f0239   30          9           9        4           3   
3  6724a28c1714fb5e223f023a   25          9           8        4           1   
4  6724a28c1714fb5e223f023a   25          1           1        0           2   

   OL_Sinhala_Tamil  AL_Passes  
0                 4          3  
1                 4          4  
2                 3          3  
3                 3          4  
4                 0          4  


# ***Putting the labels for the real data***

In [32]:
# Function to calculate eligibility
def check_eligibility(row):
    is_eligible = (
        row["OL_Passes"] >= 6 and
        row["OL_Credits"] >= 3 and
        row["OL_Math"] >= 2 and  # C or better
        row["OL_English"] >= 2 and  # C or better
        row["OL_Sinhala_Tamil"] >= 2 and  # C or better
        row["AL_Passes"] >= 3
    )
    return 1 if is_eligible else 0

# Apply the eligibility check
transformed_data["fail/pass"] = transformed_data.apply(check_eligibility, axis=1)

# Display the resulting DataFrame
print(transformed_data)

                   ApplicantID  Age  OL_Passes  OL_Credits  OL_Math  \
0     6724a28c1714fb5e223f0237   30          9           9        4   
1     6724a28c1714fb5e223f0238   26          9           9        4   
2     6724a28c1714fb5e223f0239   30          9           9        4   
3     6724a28c1714fb5e223f023a   25          9           8        4   
4     6724a28c1714fb5e223f023a   25          1           1        0   
...                        ...  ...        ...         ...      ...   
1021  6724a28c1714fb5e223f0566   27          1           1        0   
1022  6724a28c1714fb5e223f0567   22          8           8        2   
1023  6724a28c1714fb5e223f0568   30          9           9        4   
1024  6724a28c1714fb5e223f0569   25          9           9        3   
1025  6724a28c1714fb5e223f056a   27          9           9        4   

      OL_English  OL_Sinhala_Tamil  AL_Passes  fail/pass  
0              4                 4          3          1  
1              2             

# ***Preparing data (droping unwanted columns)***

In [33]:
# Prepare X_test and y_test

# List of columns to drop
columns_to_drop = ["Age", "ApplicantID", "fail/pass"]  # Add any other columns not used during training

# Drop the columns
X_test = transformed_data.drop(columns=columns_to_drop, errors='ignore')

y_test = transformed_data["fail/pass"]

# ***Testing and getting accuracy***

In [34]:
# Make predictions
predictions = rf_model.predict(X_test)

# Add predictions to the manual test data
transformed_data["predicted_fail/pass"] = predictions

# Calculate and display accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Manual Testing Accuracy: {accuracy * 100:.2f}%")

# Display the manual test data for inspection
print(transformed_data.head())

Manual Testing Accuracy: 100.00%
                ApplicantID  Age  OL_Passes  OL_Credits  OL_Math  OL_English  \
0  6724a28c1714fb5e223f0237   30          9           9        4           4   
1  6724a28c1714fb5e223f0238   26          9           9        4           2   
2  6724a28c1714fb5e223f0239   30          9           9        4           3   
3  6724a28c1714fb5e223f023a   25          9           8        4           1   
4  6724a28c1714fb5e223f023a   25          1           1        0           2   

   OL_Sinhala_Tamil  AL_Passes  fail/pass  predicted_fail/pass  
0                 4          3          1                    1  
1                 4          4          1                    1  
2                 3          3          1                    1  
3                 3          4          0                    0  
4                 0          4          0                    0  
