In [20]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler

In [21]:
# Load the raw records from disk.
data = pd.read_csv('data.csv')

# Replace every missing value with 0, then discard the 'id' and 'email' columns.
# NOTE(review): fillna(0) is applied to all columns, not only numeric ones —
# confirm that is intended for the categorical columns.
data = data.fillna(0).drop(columns=['id', 'email'])

# One-hot encode each listed categorical column: append its indicator columns
# (named after the category values) and remove the original column.
features_to_split = ['branch']
for feature in features_to_split:
    dummy = pd.get_dummies(data[feature])
    data = pd.concat([data, dummy], axis=1).drop(columns=feature)

In [22]:
# Preview the first rows of the frame to check the result of the previous steps.
data.head()

Unnamed: 0,gender,ssc,hsc,quantitative_ability,logical_reasoning,english_proficiency,automata_score,computer_science_score,internships,backlogs,projects,cgpa,placed_sector,CS,EnTC,IT
0,0,87.85,79.771,79.005,99.934,71.921,91.437,91.415,0.0,0.0,4,7.53,FinTech,0,0,1
1,0,53.307,57.952,90.76,68.509,85.333,90.539,94.626,5.0,0.0,1,8.13,Startup,1,0,0
2,1,79.973,88.228,87.872,85.311,99.404,71.203,74.121,2.0,0.0,7,9.97,Product,0,1,0
3,0,97.934,74.965,81.667,92.008,77.299,71.309,89.063,3.0,0.0,1,9.34,FinTech,0,0,1
4,0,82.761,57.869,72.017,80.981,94.424,94.019,78.379,3.0,0.0,10,6.36,Startup,1,0,0


Normalize the numerical features

In [23]:
# Numeric columns that get rescaled; every other column is left untouched.
num_cols = [
    'ssc',
    'hsc',
    'quantitative_ability',
    'logical_reasoning',
    'english_proficiency',
    'automata_score',
    'computer_science_score',
    'internships',
    'backlogs',
    'projects',
    'cgpa',
]

# Learn each column's min/max, then overwrite the columns with the rescaled values.
# NOTE(review): the scaler is fitted on the whole frame, before any train/test
# split, and `data` is overwritten in place — re-running this cell alone would
# re-fit the scaler on already-scaled values.
scaler = MinMaxScaler().fit(data[num_cols])
data[num_cols] = scaler.transform(data[num_cols])

data.head()

Unnamed: 0,gender,ssc,hsc,quantitative_ability,logical_reasoning,english_proficiency,automata_score,computer_science_score,internships,backlogs,projects,cgpa,placed_sector,CS,EnTC,IT
0,0,0.757112,0.595916,0.400561,0.999828,0.197941,0.755511,0.755903,0.0,0.0,0.333333,0.380952,FinTech,0,0,1
1,0,0.063658,0.159143,0.736937,0.100378,0.581524,0.729836,0.847812,1.0,0.0,0.0,0.531328,Startup,1,0,0
2,1,0.59898,0.765209,0.654295,0.581287,0.983955,0.176984,0.260898,0.4,0.0,0.666667,0.992481,Product,0,1,0
3,0,0.959549,0.49971,0.476736,0.772969,0.351752,0.180014,0.688582,0.6,0.0,0.0,0.834586,FinTech,0,0,1
4,0,0.65495,0.157482,0.200595,0.457353,0.841527,0.829335,0.382775,0.6,0.0,1.0,0.087719,Startup,1,0,0


Select the K best features

In [24]:
# Separate the target labels from the feature columns.
y = data['placed_sector']
X = data.drop(columns='placed_sector')

# Score every feature against the target with the chi-squared statistic and
# keep the 5 highest-scoring ones.
selector = SelectKBest(score_func=chi2, k=5)
selector.fit(X, y)

In [25]:
# Names of the columns the selector kept (boolean support mask over X's columns).
selected_features = X.columns[selector.get_support()]

# Feature matrix reduced to those columns.
X_new = selector.transform(X)

print("Selected features:", selected_features)

Selected features: Index(['ssc', 'logical_reasoning', 'backlogs', 'CS', 'IT'], dtype='object')


In [26]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs. (train_test_split is imported in the
# notebook's top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, random_state=42
)

In [27]:
# Gaussian Naive Bayes classifier, created with its default arguments.
clf = GaussianNB()

In [28]:
# Fit the classifier on the training split.
clf.fit(X_train,y_train)

In [29]:
# Score the fitted classifier on the held-out split: accuracy is the fraction
# of test rows whose predicted label matches the true label.
# (accuracy_score is imported in the notebook's top import cell.)
y_pred = clf.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {acc_score}")

Accuracy Score: 0.195


In [30]:
# Load the unseen records and give them the same preprocessing as the training
# frame: missing values become 0 (as was done for `data`), so a gap in the file
# does not reach the classifier as NaN.
new_data = pd.read_csv('newdata.csv').fillna(0)

# Rescale with the scaler that was already fitted on the training data —
# transform only, never re-fit on new data.
new_data[num_cols] = scaler.transform(new_data[num_cols])

new_data.head()

Unnamed: 0,gender,ssc,hsc,quantitative_ability,logical_reasoning,english_proficiency,automata_score,computer_science_score,internships,backlogs,projects,cgpa,CS,EnTC,IT
0,0,0.868528,0.700591,0.572111,0.858607,0.715001,0.943159,0.88714,0.4,0.0,0.333333,0.899749,0,0,1
1,1,0.378696,0.380302,0.114262,0.114431,0.1144,0.113996,0.11432,0.4,0.4,0.111111,0.498747,1,0,0


In [31]:
# Keep only the columns chosen by the feature selector.
new_data = new_data[selected_features]
print(new_data)

# clf was fitted on the plain array returned by selector.transform, so give it
# an array here as well; passing the DataFrame makes scikit-learn (>= 1.0) warn
# that X has feature names the estimator was not fitted with.
new_predictions = clf.predict(new_data.to_numpy(dtype=float))

print(new_predictions)

        ssc  logical_reasoning  backlogs  CS  IT
0  0.868528           0.858607       0.0   0   1
1  0.378696           0.114431       0.4   1   0
['Startup' 'Service']




In [32]:
# Per-class probabilities for each new row; column j corresponds to
# clf.classes_[j]. The frame is converted to a float array because clf was
# fitted on an array, which avoids scikit-learn's feature-name mismatch warning.
new_predictions_proba = clf.predict_proba(new_data.to_numpy(dtype=float))
print(new_predictions_proba)

[[0.254207   0.20926504 0.1722436  0.36428437]
 [0.23519521 0.29440328 0.30468371 0.1657178 ]]




In [37]:
# Build one {class label: predicted probability} mapping per new row, pairing
# each probability column with the matching entry of clf.classes_.
output = [
    dict(zip(clf.classes_, row_proba))
    for row_proba in new_predictions_proba
]
print(output)

[{'FinTech': 0.2542069965536405, 'Product': 0.20926503545243189, 'Service': 0.17224360063766458, 'Startup': 0.36428436735626313}, {'FinTech': 0.23519521447932648, 'Product': 0.29440327796064986, 'Service': 0.304683708940731, 'Startup': 0.16571779861929287}]
