# College Recruitment

### Importing the Dependencies

In [186]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

### Data Collection and Analysis

In [187]:
college_dataset = pd.read_csv('Placement_Data_Full_Class.csv')

In [188]:
college_dataset.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [189]:
college_dataset.shape

(215, 15)

In [190]:
college_dataset.describe() #by default describe() summarises only the numeric columns; pass include='all' to cover the text columns too

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
count,215.0,215.0,215.0,215.0,215.0,215.0,148.0
mean,108.0,67.303395,66.333163,66.370186,72.100558,62.278186,288655.405405
std,62.209324,10.827205,10.897509,7.358743,13.275956,5.833385,93457.45242
min,1.0,40.89,37.0,50.0,50.0,51.21,200000.0
25%,54.5,60.6,60.9,61.0,60.0,57.945,240000.0
50%,108.0,67.0,65.0,66.0,71.0,62.0,265000.0
75%,161.5,75.7,73.0,72.0,83.5,66.255,300000.0
max,215.0,89.4,97.7,91.0,98.0,77.89,940000.0


In [191]:
college_dataset['status'].value_counts()

Placed        148
Not Placed     67
Name: status, dtype: int64

In [192]:
college_dataset['status'].head()

0        Placed
1        Placed
2        Placed
3    Not Placed
4        Placed
Name: status, dtype: object

In [193]:
#encode the target as integers: 'Placed' -> 1, 'Not Placed' -> 0
college_dataset['status'] = college_dataset['status'].replace({'Placed': 1, 'Not Placed': 0})

In [194]:
college_dataset['status'].head()

0    1
1    1
2    1
3    0
4    1
Name: status, dtype: int64

### 1 = Placed
### 0 = Not Placed

In [195]:
#check for null values
college_dataset.isnull().sum()

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

In [196]:
#encode work experience as integers: 'Yes' -> 1, 'No' -> 0
college_dataset['workex'] = college_dataset['workex'].replace({'Yes': 1, 'No': 0})

### Separate features and labels, and drop unnecessary columns

In [197]:
#labels: the integer-encoded placement status
y = college_dataset['status']

#features: everything except the row id, the label itself, the salary
#(missing for the 67 rows that are not placed) and the categorical columns we do not use
temp_x = college_dataset.drop(
    columns=['sl_no', 'status', 'ssc_b', 'hsc_b', 'salary', 'gender', 'specialisation']
)

In [198]:
temp_x.head()

Unnamed: 0,ssc_p,hsc_p,hsc_s,degree_p,degree_t,workex,etest_p,mba_p
0,67.0,91.0,Commerce,58.0,Sci&Tech,0,55.0,58.8
1,79.33,78.33,Science,77.48,Sci&Tech,1,86.5,66.28
2,65.0,68.0,Arts,64.0,Comm&Mgmt,0,75.0,57.8
3,56.0,52.0,Science,52.0,Sci&Tech,0,66.0,59.43
4,85.8,73.6,Commerce,73.3,Comm&Mgmt,0,96.8,55.5


In [199]:
#a model cannot consume string/text columns directly,
#so the categorical columns (hsc_s, degree_t) would need one-hot encoding
#that step is deliberately left out for now: the code below is disabled by wrapping it
#in a string literal, which the notebook simply echoes back as the cell output

"""
dummies = pd.get_dummies(temp_x.hsc_s)
dummies.head()
dummies_2 = pd.get_dummies(temp_x.degree_t)
one_hot_encoded = pd.concat([dummies, dummies_2], axis=1)
temp_one_hot = pd.concat([temp_x, one_hot_encoded], axis=1)
temp_one_hot = temp_one_hot.drop(['hsc_s', 'degree_t'], axis=1)
temp_one_hot.head()

"""

"\ndummies = pd.get_dummies(temp_x.hsc_s)\ndummies.head()\ndummies_2 = pd.get_dummies(temp_x.degree_t)\none_hot_encoded = pd.concat([dummies, dummies_2], axis=1)\ntemp_one_hot = pd.concat([temp_x, one_hot_encoded], axis=1)\ntemp_one_hot = temp_one_hot.drop(['hsc_s', 'degree_t'], axis=1)\ntemp_one_hot.head()\n\n"

In [200]:
#keep only the numeric predictors; the text columns hsc_s and degree_t are dropped
#instead of being one-hot encoded
X = temp_x.drop(columns=['hsc_s', 'degree_t'])
X.head()

Unnamed: 0,ssc_p,hsc_p,degree_p,workex,etest_p,mba_p
0,67.0,91.0,58.0,0,55.0,58.8
1,79.33,78.33,77.48,1,86.5,66.28
2,65.0,68.0,64.0,0,75.0,57.8
3,56.0,52.0,52.0,0,66.0,59.43
4,85.8,73.6,73.3,0,96.8,55.5


### Data Standardization

In [201]:
#NOTE(review): the scaler is fitted on ALL rows of X, and the train/test split only happens
#afterwards, so the test rows contribute to the mean/std used for scaling (data leakage)
#consider splitting first and fitting the scaler on the training rows only
scalar = StandardScaler().fit(X)
standardized_data = scalar.transform(X)

In [203]:
#standardization rescales every column to mean 0 and unit variance,
#so the values are NOT confined to 0 - 1: they can be negative or larger than 1
#having all features on a comparable scale keeps any single one from dominating the model
print(standardized_data)

[[-0.02808697  2.2688123  -1.14010225 -0.72444647 -1.29109087 -0.59764672]
 [ 1.11336869  1.10344799  1.51326671  1.38036423  1.08715679  0.6876202 ]
 [-0.21323793  0.15331275 -0.32284282 -0.72444647  0.21890765 -0.76947385]
 ...
 [-0.02808697  0.06133451  0.90304633  1.38036423 -0.98909117  1.27870553]
 [ 0.61994138 -0.03064373 -1.14010225 -0.72444647 -0.15859198 -0.35193393]
 [-0.49096436 -0.76646966 -1.82115177 -0.72444647  1.27590661 -0.3536522 ]]


In [207]:
#X is now a numpy array because of the standardization step
#the name X previously held the DataFrame of numeric features; from here on it is the scaled array,
#so the column names are gone and the columns are addressed by position (0-5)
X = standardized_data
print(pd.DataFrame(X))

            0         1         2         3         4         5
0   -0.028087  2.268812 -1.140102 -0.724446 -1.291091 -0.597647
1    1.113369  1.103448  1.513267  1.380364  1.087157  0.687620
2   -0.213238  0.153313 -0.322843 -0.724446  0.218908 -0.769474
3   -1.046417 -1.318339 -1.957362 -0.724446 -0.460592 -0.489396
4    1.712332  0.668391  0.943909 -0.724446  1.864806 -1.164676
..        ...       ...       ...       ...       ...       ...
210  1.230940  1.441008  1.529612 -0.724446  1.426906  2.098321
211 -0.861266 -0.582513  0.766836 -0.724446  0.143408 -1.487711
212 -0.028087  0.061335  0.903046  1.380364 -0.989091  1.278706
213  0.619941 -0.030644 -1.140102 -0.724446 -0.158592 -0.351934
214 -0.490964 -0.766470 -1.821152 -0.724446  1.275907 -0.353652

[215 rows x 6 columns]


### Split dataset into testing and training sets

In [208]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

### What is stratification? In Simple terms: Grouping based on characteristics
<br/>
<blockquote>
The <code>stratify</code> parameter makes the split so that the proportion of each class in the resulting train and test sets is the same as the proportion in the array passed to <code>stratify</code>.
For example, if variable y is a binary categorical variable with values 0 and 1 and there are 25% of zeros and 75% of ones, stratify=y will make sure that your random split has 25% of 0's and 75% of 1's.
</blockquote>

### Train the Model

In [209]:
classifier = svm.SVC(kernel='linear')

In [210]:
#training the support vector machine classifier
classifier.fit(X_train, y_train)

SVC(kernel='linear')

### Model Evaluation

In [211]:
#accuracy score on the training data
#accuracy_score expects (y_true, y_pred): the true labels come first, the predictions second
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)

In [215]:
print('Accuracy score of the training data: ', training_data_accuracy)

Accuracy score of the training data:  0.8953488372093024


In [216]:
#accuracy score on the test data
#accuracy_score expects (y_true, y_pred): the true labels come first, the predictions second
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(y_test, X_test_prediction)

In [217]:
print('Accuracy score of the test data: ', test_data_accuracy)

Accuracy score of the test data:  0.8604651162790697


### Make the Predictive System

In [220]:
temp_x.head()

Unnamed: 0,ssc_p,hsc_p,hsc_s,degree_p,degree_t,workex,etest_p,mba_p
0,67.0,91.0,Commerce,58.0,Sci&Tech,0,55.0,58.8
1,79.33,78.33,Science,77.48,Sci&Tech,1,86.5,66.28
2,65.0,68.0,Arts,64.0,Comm&Mgmt,0,75.0,57.8
3,56.0,52.0,Science,52.0,Sci&Tech,0,66.0,59.43
4,85.8,73.6,Commerce,73.3,Comm&Mgmt,0,96.8,55.5


In [222]:
#one hand-written applicant, in the same column order the model was trained on:
#ssc_p, hsc_p, degree_p, workex, etest_p, mba_p
input_data = (58.28, 54.60, 50.20, 1, 76, 65.33)

#convert the tuple to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

#add a leading axis so the array is a single sample (one row, six columns)
input_data_reshaped = input_data_as_numpy_array[np.newaxis, :]

#apply the scaler that was fitted earlier in the notebook
std_data = scalar.transform(input_data_reshaped)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

#translate the numeric class back into its label (0 = Not Placed, anything else = Placed)
print('Placed' if prediction[0] != 0 else 'Not Placed')

[[-0.83534515 -1.07919568 -2.20253951  1.38036423  0.29440757  0.52438443]]
[0]
Not Placed


In [223]:
def get_prediction(TenthGradePercentage, TwelftGradePercentage, DegreePercentage, WorkExperience, EmployabilityTestPercentage, MBAPercentage):
    """Predict the placement outcome for a single student.

    The arguments follow the column order of the feature frame the scaler was
    fitted on: ssc_p, hsc_p, degree_p, workex, etest_p, mba_p.

    Relies on the notebook-level objects ``scalar`` (the fitted scaler) and
    ``classifier`` (the trained model). As a side effect it prints the
    standardized feature row and the raw prediction array.

    Args:
        TenthGradePercentage: secondary-school percentage (ssc_p).
        TwelftGradePercentage: higher-secondary percentage (hsc_p).
        DegreePercentage: degree percentage (degree_p).
        WorkExperience: work experience encoded as 1 (Yes) or 0 (No).
        EmployabilityTestPercentage: employability test percentage (etest_p).
        MBAPercentage: MBA percentage (mba_p).

    Returns:
        'Not Placed' when the predicted class is 0, otherwise 'Placed'.
    """
    #build a single-sample 2D array: one row, one column per feature
    features = np.asarray([
        TenthGradePercentage,
        TwelftGradePercentage,
        DegreePercentage,
        WorkExperience,
        EmployabilityTestPercentage,
        MBAPercentage,
    ]).reshape(1, -1)

    #scale with the already-fitted scaler before handing the row to the model
    scaled_features = scalar.transform(features)
    print(scaled_features)

    predicted_labels = classifier.predict(scaled_features)
    print(predicted_labels)

    return 'Not Placed' if predicted_labels[0] == 0 else 'Placed'

In [224]:
get_prediction(58.28, 54.60, 50.20, 1, 76, 65.33)

[[-0.83534515 -1.07919568 -2.20253951  1.38036423  0.29440757  0.52438443]]
[0]


'Not Placed'