In [11]:
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
import os

In [12]:
# Load data from the actual CSV file (replace with your actual file path)
df = pd.read_csv("FY25_Candidates database.csv")

# Replace various forms of NA/empty values with numpy.nan
na_values = ['NA', 'Na', 'na', 'N/A', 'n/a', '', 'None', 'none', ' ', 'NaN', 'nan']
df.replace(na_values, np.nan, inplace=True)

# Convert specific columns to numeric, coercing errors to NaN
numeric_cols = ['Graduation CGPA', 'Total Experienced (In Years)', '10th Board%', '12th Board%']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Fill remaining NA values in key columns with appropriate defaults
df['Graduation CGPA'] = df['Graduation CGPA'].fillna(0)
df['Total Experienced (In Years)'] = df['Total Experienced (In Years)'].fillna(0)
df['Technonlogies/Skills known'] = df['Technonlogies/Skills known'].fillna('')

# Verify the replacements
print("NA values after cleaning:")
print(df.isna().sum())


NA values after cleaning:
S.No                                                                0
College Full Name                                                   0
College City                                                        1
Roll No / PRN                                                       2
Prefix                                                              1
Candidate Name                                                      0
Gender                                                              0
Candidate Mobile Number                                             0
Alternate Mobile Number                                             2
Primary Email ID (College)                                          0
Alternate Email ID                                                  1
10th Board%                                                         0
12th Board%                                                         3
Graduation Degree                                               

  df.replace(na_values, np.nan, inplace=True)


In [13]:
# Define the mindset logic with proper NaN handling
def generate_mindset(row):
    cgpa = row['Graduation CGPA']
    exp = row['Total Experienced (In Years)']
    
    if pd.isna(cgpa) or pd.isna(exp):
        return "Neutral"
    elif cgpa >= 8 and exp >= 1:
        return "Growth"
    elif cgpa < 6:
        return "Fixed"
    else:
        return "Neutral"

# Apply and create the column
df["Mindset"] = df.apply(generate_mindset, axis=1)

# Print mindset distribution
print(df["Mindset"].value_counts())

Mindset
Neutral    41
Fixed       4
Name: count, dtype: int64


In [14]:
print("🎯 CGPA >= 8 & Experience >= 1:", df[(df["Graduation CGPA"] >= 8) & (df["Total Experienced (In Years)"] >= 1)].shape[0])
print("🛑 CGPA < 6:", df[df["Graduation CGPA"] < 6].shape[0])
print("🟡 Else (Neutral):", df[((df["Graduation CGPA"] >= 6) & (df["Graduation CGPA"] < 8)) | (df["Total Experienced (In Years)"] < 1)].shape[0])


🎯 CGPA >= 8 & Experience >= 1: 0
🛑 CGPA < 6: 4
🟡 Else (Neutral): 45


In [15]:
# Generate Skill Count from comma-separated skills
df["Skill_Count"] = df["Technonlogies/Skills known"].apply(lambda x: len(str(x).split(',')) if pd.notnull(x) else 0)


In [16]:
# Map Language Proficiency to numeric
lang_map = {"Beginner": 0, "Advanced": 1, "Mastery": 2, None: 0, np.nan: 0}
df["Language_Proficiency"] = df["Proficiency in foreign language (Beginner / Advanced / Mastery)"].map(lang_map).fillna(0)

# Features (X) and Target (y)
features = ["Graduation CGPA", "Year of Graduation", "Total Experienced (In Years)", "Skill_Count", "Language_Proficiency"]
X = df[features].fillna(0)  # Fill remaining NA values with 0
y = df["Mindset"]


In [17]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [18]:
# Train model
mindset_model = RandomForestClassifier(random_state=42)
mindset_model.fit(X_train_scaled, y_train)

# Evaluate
y_pred = mindset_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       Fixed       1.00      1.00      1.00         1
     Neutral       1.00      1.00      1.00         8

    accuracy                           1.00         9
   macro avg       1.00      1.00      1.00         9
weighted avg       1.00      1.00      1.00         9



In [19]:
# Save model - create directory if it doesn't exist
model_path = "models"
os.makedirs(model_path, exist_ok=True)

with open(os.path.join(model_path, "mindset_model.pkl"), "wb") as f:
    pickle.dump(mindset_model, f)

with open(os.path.join(model_path, "scaler.pkl"), "wb") as f:
    pickle.dump(scaler, f)

print("✅ Model and scaler saved successfully.")


✅ Model and scaler saved successfully.
