In [2]:
# Step 1: Data Preparation
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel

# Load the raw records, replace every missing value with 0, and discard the
# 'id' and 'email' columns so they never reach the model.
data = pd.read_csv('data.csv').fillna(0).drop(columns=['id', 'email'])

# One-hot encode each listed categorical column: its indicator columns are
# appended after the existing columns and the original column is removed.
features_to_split = ['branch']
for feature in features_to_split:
    dummy = pd.get_dummies(data[feature])
    data = pd.concat([data, dummy], axis=1).drop(columns=feature)

# Rescale the numeric columns with a MinMaxScaler. The fitted scaler object is
# kept in `scaler` so the identical transform can be applied to other frames.
num_cols = [
    'ssc',
    'hsc',
    'quantitative_ability',
    'logical_reasoning',
    'english_proficiency',
    'automata_score',
    'computer_science_score',
    'internships',
    'backlogs',
    'projects',
    'cgpa',
]
scaler = MinMaxScaler()
scaler.fit(data[num_cols])
data[num_cols] = scaler.transform(data[num_cols])

# Step 2: Feature Selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Separate the predictors from the target column.
X = data.drop('placed_sector', axis=1)
y = data['placed_sector']

# Hold out the test rows BEFORE selecting features. Fitting the selector on
# the full dataset would let the test labels influence which features are
# kept, which makes the accuracy measured on the held-out rows optimistic.
# The row partition is the same as splitting after selection, because it
# depends only on the number of rows, test_size and random_state.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Use Random Forest importances, computed on the training rows only, to keep
# at most 5 features.
sel = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    max_features=5,
)
sel.fit(X_train_full, y_train)

# Print the selected features
selected_feat = X.columns[sel.get_support()]
print('Selected Features:', selected_feat)

# Restrict both splits to the selected columns for training and evaluation.
X_train = X_train_full[selected_feat]
X_test = X_test_full[selected_feat]

# Step 3: Model Training
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest with a fixed seed and fit it on the training
# split in one expression (fit() returns the estimator itself).
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Step 4: Model Evaluation
# Score the fitted classifier on the held-out split. Only accuracy is computed
# here; the per-class probabilities are printed afterwards for inspection.
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy Score: {acc_score}")
print(y_pred_proba)

# Score unseen records. They must go through the same preprocessing as the
# training data, otherwise the model sees differently-shaped or unfilled input.
new_data = pd.read_csv('newdata.csv').fillna(0)

# One-hot encode the same categorical columns that were encoded for training,
# so that any indicator column among the selected features exists here too.
for feature in features_to_split:
    if feature in new_data.columns:
        new_data = pd.concat(
            [new_data.drop(columns=feature), pd.get_dummies(new_data[feature])],
            axis=1,
        )

# Reuse the scaler fitted on the training data (transform only, never refit).
new_data[num_cols] = scaler.transform(new_data[num_cols])

# Keep exactly the selected features, in training order. A selected column
# that is absent here is filled with 0; this is intended for one-hot
# categories that simply do not occur in the new file.
new_data = new_data.reindex(columns=selected_feat, fill_value=0)
# print(new_data)

new_predictions = clf.predict(new_data)

print(new_predictions)

new_predictions_proba = clf.predict_proba(new_data)
print(new_predictions_proba)

# One dict per input row, mapping each class label to its predicted
# probability (columns of predict_proba follow the order of clf.classes_).
output = [dict(zip(clf.classes_, row)) for row in new_predictions_proba]
print(output)

Selected Features: Index(['hsc', 'quantitative_ability', 'logical_reasoning',
       'english_proficiency', 'computer_science_score'],
      dtype='object')
Accuracy Score: 0.275
[[0.22 0.35 0.16 0.27]
 [0.08 0.27 0.4  0.25]
 [0.13 0.11 0.25 0.51]
 [0.26 0.4  0.16 0.18]
 [0.06 0.23 0.52 0.19]
 [0.25 0.18 0.27 0.3 ]
 [0.3  0.22 0.23 0.25]
 [0.41 0.19 0.09 0.31]
 [0.27 0.28 0.34 0.11]
 [0.12 0.43 0.17 0.28]
 [0.21 0.33 0.32 0.14]
 [0.43 0.16 0.16 0.25]
 [0.19 0.41 0.17 0.23]
 [0.1  0.26 0.12 0.52]
 [0.42 0.15 0.25 0.18]
 [0.38 0.2  0.13 0.29]
 [0.27 0.23 0.38 0.12]
 [0.07 0.23 0.22 0.48]
 [0.13 0.35 0.26 0.26]
 [0.33 0.31 0.11 0.25]
 [0.38 0.04 0.32 0.26]
 [0.48 0.2  0.19 0.13]
 [0.21 0.11 0.38 0.3 ]
 [0.51 0.11 0.12 0.26]
 [0.05 0.41 0.23 0.31]
 [0.16 0.22 0.12 0.5 ]
 [0.22 0.29 0.17 0.32]
 [0.27 0.34 0.28 0.11]
 [0.32 0.02 0.34 0.32]
 [0.17 0.3  0.34 0.19]
 [0.13 0.33 0.16 0.38]
 [0.14 0.21 0.15 0.5 ]
 [0.22 0.32 0.26 0.2 ]
 [0.31 0.11 0.36 0.22]
 [0.12 0.29 0.4  0.19]
 [0.18 0.29 0.09