In [52]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
import os
# Load data
# Both CSV files are read by bare file name, i.e. relative to the kernel's working directory.
# NOTE(review): test_data is not referenced by any later cell visible here; the
# evaluation further down uses a hold-out split carved from train_data instead.
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [53]:
# Name of the label column in train_data; the next cell drops it to build the
# feature matrix and selects it as the prediction target.
target_column = 'target'

In [54]:
# Separate the label series from the predictor columns: y is the target column,
# X is every remaining column of the training frame.
y = train_data[target_column]
X = train_data.drop(columns=[target_column])

In [55]:
# Hold out 20% of the labelled rows for evaluation. The fixed random_state keeps
# the partition identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [56]:
# Sanity-check the split: print the .shape of each feature matrix and target series.
# Note: "Test" here is the hold-out produced by train_test_split from train_data,
# not the separately loaded test_data frame.
print("Train shapes (X, y): ", X_train.shape, y_train.shape)
print("Test shapes (X, y): ", X_test.shape, y_test.shape)

Train shapes (X, y):  (3427, 14) (3427,)
Test shapes (X, y):  (857, 14) (857,)


In [57]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no hold-out statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [58]:
# Model selection (Random Forest)
# Seed the forest so its bootstrap sampling and feature sub-sampling are
# repeatable; without random_state every Restart & Run All trains a different
# model and the metrics below cannot be reproduced.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)


In [59]:
# Predict labels for the scaled hold-out features; y_pred is compared with
# y_test in the metrics cell that follows.
y_pred = model.predict(X_test_scaled)

In [60]:
# Calculate and print metrics
# The classification report and the confusion matrix render as multi-line text,
# so each is printed on the line below its label; printing them on the label's
# own line shifts the first row and breaks the column alignment.
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy:  0.808634772462077
Report:                precision    recall  f1-score   support

           0       0.79      0.85      0.82        59
           1       0.84      0.93      0.88        44
           2       0.93      0.89      0.91        57
           3       0.87      0.90      0.89        60
           4       0.63      0.37      0.46        52
           5       0.33      0.31      0.32        42
           6       0.86      0.94      0.90        53
           7       0.98      0.98      0.98        44
           8       0.85      0.98      0.91        53
           9       0.66      0.50      0.57        50
          10       0.88      0.93      0.90        45
          11       0.90      0.71      0.80        52
          12       0.92      0.83      0.87        54
          13       0.64      0.89      0.74        54
          14       0.85      0.98      0.91        53
          15       0.82      0.78      0.80        41
          16       0.87      0.89      0.88

In [61]:
# Single input prediction example
sample_idx = 10  # position within the hold-out split to spot-check
actual_label = y_test.iloc[sample_idx]
# predict() expects a 2-D array, so the single row is reshaped to (1, n_features);
# element 0 of the result is the scalar label. Predicting once and comparing
# scalars avoids a second model call and an array-valued `if` condition.
predicted_label = model.predict(X_test_scaled[sample_idx].reshape(1, -1))[0]

print("Actual Label :", actual_label)
print("Model Prediction :", predicted_label)
if actual_label == predicted_label:
    print("Wow! Model doing well.....")
else:
    print("Not sure......")

Actual Label : 10
Model Prediction : 10
Wow! Model doing well.....


In [64]:
# Recommendation system function
# Human-readable career names for the encoded target labels.
# NOTE(review): assumes class_names[i] is the name of encoded label i - confirm
# against the label encoding used to build train_data.csv.
class_names = ['Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown',
               'Software Engineer', 'Teacher', 'Business Owner', 'Scientist',
               'Banker', 'Writer', 'Accountant', 'Designer',
               'Construction Engineer', 'Game Developer', 'Stock Investor',
               'Real Estate Developer']

def Recommendations(gender, part_time_job, absence_days, extracurricular_activities,
                    weekly_self_study_hours, math_score, history_score, physics_score,
                    chemistry_score, biology_score, english_score, geography_score,
                    total_score, average_score, top_n=5):
    """Rank the most probable careers for a single student.

    Builds a one-row feature frame, scales it with the module-level ``scaler``
    and asks the module-level ``model`` for class probabilities.

    Args:
        gender: String; ``'female'`` (case-insensitive, surrounding whitespace
            ignored) is encoded as 1, any other value as 0.
            NOTE(review): presumably matches the encoding of the training
            data - confirm.
        part_time_job: Truthy values are encoded as 1, falsy values as 0.
        absence_days: Passed through to the feature frame unchanged.
        extracurricular_activities: Truthy values are encoded as 1, falsy as 0.
        weekly_self_study_hours: Passed through unchanged.
        math_score, history_score, physics_score, chemistry_score,
        biology_score, english_score, geography_score: Passed through unchanged.
        total_score: Passed through unchanged.
        average_score: Passed through unchanged.
        top_n: Maximum number of (name, probability) pairs to return.
            Defaults to 5, the previously hard-coded cutoff.

    Returns:
        A list of ``(career_name, probability)`` tuples ordered from most to
        least probable, with at most ``top_n`` entries.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Encode categorical variables
    gender_encoded = 1 if gender.strip().lower() == 'female' else 0
    part_time_job_encoded = 1 if part_time_job else 0
    extracurricular_activities_encoded = 1 if extracurricular_activities else 0

    # Create a one-row frame with named feature columns
    feature_array = pd.DataFrame({
        'gender': [gender_encoded],
        'part_time_job': [part_time_job_encoded],
        'absence_days': [absence_days],
        'extracurricular_activities': [extracurricular_activities_encoded],
        'weekly_self_study_hours': [weekly_self_study_hours],
        'math_score': [math_score],
        'history_score': [history_score],
        'physics_score': [physics_score],
        'chemistry_score': [chemistry_score],
        'biology_score': [biology_score],
        'english_score': [english_score],
        'geography_score': [geography_score],
        'total_score': [total_score],
        'average_score': [average_score]
    })

    # Scale features with the scaler fitted on the training split
    scaled_features = scaler.transform(feature_array)

    # Probabilities for the single input row
    probabilities = model.predict_proba(scaled_features)[0]

    # Column positions sorted by descending probability
    ranked_columns = np.argsort(-probabilities)[:top_n]

    # predict_proba columns follow model.classes_, which equals 0..n-1 only when
    # every label occurred in the training split; translate the column position
    # to the real label before looking up its name.
    recommendations = []
    for column in ranked_columns:
        label = model.classes_[column]
        if isinstance(label, (int, np.integer)):
            name = class_names[label]
        else:
            # Non-integer labels carry their own name.
            name = str(label)
        recommendations.append((name, probabilities[column]))

    return recommendations


In [65]:
# Example usage: score one hypothetical student and list the returned careers.
# NOTE(review): the seven subject scores below add up to 538, while total_score
# is 534 and average_score is 534 / 7 - confirm which figures are intended.
final_recommendations = Recommendations(gender='female',
                                        part_time_job=False,
                                        absence_days=2,
                                        extracurricular_activities=False,
                                        weekly_self_study_hours=7,
                                        math_score=50,
                                        history_score=60,
                                        physics_score=97,
                                        chemistry_score=94,
                                        biology_score=90,
                                        english_score=81,
                                        geography_score=66,
                                        total_score=534,
                                        average_score=76.29)

# Each entry is a (name, probability) pair; probabilities are shown to two decimals.
print("Top recommended career aspirations with probabilities:")
print("="*50)
for class_name, probability in final_recommendations:
    print(f"{class_name} with probability {probability:.2f}")

Top recommended career aspirations with probabilities:
Real Estate Developer with probability 0.38
Unknown with probability 0.26
Teacher with probability 0.16
Business Owner with probability 0.07
Government Officer with probability 0.06
