# Campus Placement Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("Dataset/main.csv")

In [3]:
df.head()

In [4]:
df.shape

In [5]:
df.info()

In [6]:
df.isnull().sum()

In [7]:
df.drop(['salary','sl_no','ssc_b','hsc_b'], axis=1, inplace=True)

In [8]:
df.isnull().sum()

## Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [10]:
df["degree_t"] = df["degree_t"].astype('category')

In [11]:
df["workex"] = df["workex"].astype('category')

In [12]:
df["specialisation"] = df["specialisation"].astype('category')

In [13]:
df["status"] = df["status"].astype('category')

In [14]:
df["gender"] = df["gender"].astype('category')

In [15]:
df["hsc_s"] = df["hsc_s"].astype('category')

In [16]:
df["workex"] = df["workex"].cat.codes

In [17]:
df["gender"] = df["gender"].cat.codes

In [18]:
df["degree_t"] = df["degree_t"].cat.codes

In [19]:
df["specialisation"] = df["specialisation"].cat.codes

In [20]:
df["status"] = df["status"].cat.codes

In [21]:
df["hsc_s"] = df["hsc_s"].cat.codes

In [22]:
df.dtypes

In [23]:
df.head()

In [24]:
df.describe()

## Checking for Outliers

In [25]:
import seaborn as sns

# Draw one box plot per column on a 3 x 5 grid to eyeball outliers.
fig, axs = plt.subplots(ncols=5, nrows=3, figsize=(20, 10))
axs = axs.flatten()
plotted = 0
for ax, (_, values) in zip(axs, df.items()):
    sns.boxplot(y=values, ax=ax)
    plotted += 1

# Remove every axis that did not receive a plot so no empty frames are drawn
# (slicing is also safe when the grid is exactly full).
for unused_ax in axs[plotted:]:
    fig.delaxes(unused_ax)
plt.tight_layout(pad=0.3, w_pad=0.5, h_pad=4.5)

In [26]:
# Drop rows flagged as outliers in either column:
# degree_p of 90 or more, or hsc_p of 95 or more.
is_outlier = (df['degree_p'] >= 90) | (df['hsc_p'] >= 95)
df = df[~is_outlier]

## Correlation

In [27]:
cor = df.corr()
cor

In [28]:
import seaborn as sns
from matplotlib.pyplot import figure

In [29]:
fig = plt.figure(figsize=(10, 10))
sns.heatmap(cor.abs(), annot=True, cmap=plt.cm.CMRmap_r)

In [30]:
df.shape

In [31]:
# The following function identifies highly correlated features.
# For every pair of columns whose absolute correlation exceeds the threshold,
# it collects the later of the two columns as a candidate for removal.

def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Args:
        dataset: DataFrame whose pairwise correlations are computed with
            ``dataset.corr()``.
        threshold: A column is reported when the absolute correlation with
            any column positioned before it is strictly greater than this.

    Returns:
        A set of column names; empty when no pair exceeds the threshold.
    """
    corr_matrix = dataset.corr()
    flagged = set()
    for row, name in enumerate(corr_matrix.columns):
        # Only the lower triangle is inspected, so each pair is checked once
        # and the later column of a correlated pair is the one reported.
        if any(abs(corr_matrix.iloc[row, col]) > threshold for col in range(row)):
            flagged.add(name)
    return flagged

## Model Training

In [32]:
df.columns

In [33]:
# Work on a copy so the cleaned frame `df` is left untouched.
df_class = df.copy()
# Every column except the last is a feature; the last column is the target
# (presumably `status` — verify against the output of df.columns).
x = df_class.iloc[:, :-1].to_numpy()
y = df_class.iloc[:, -1].to_numpy()

In [34]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.18,random_state=0)

In [35]:
from sklearn.preprocessing import StandardScaler

In [36]:
import pickle

# Fit the scaler on the training split only, then apply that same
# transformation to the test split.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Persist the fitted scaler `sc`. No variable named `scaler` exists at this
# point, so dumping `scaler` here raised a NameError.
# The context manager guarantees the file is flushed and closed.
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)

### Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
lr = LogisticRegression()
lr.fit(x_train,y_train)

In [None]:
y1 = lr.predict(x_test)

In [None]:
y1

In [None]:
s1 = accuracy_score(y_test,y1)

In [None]:
s1

In [None]:
confusion_matrix(y_test,y1)

In [None]:
print(classification_report(y_test,y1))

### KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
knn.fit(x_train,y_train)
y2 = knn.predict(x_test)

In [None]:
y2

In [None]:
s2 = accuracy_score(y_test,y2)
s2

In [None]:
confusion_matrix(y_test,y2)

In [None]:
print(classification_report(y_test,y2))

### SVM Classifier

In [None]:
# SVM
# Import the class directly: `from sklearn import svm` followed by
# `svm = svm.SVC(...)` rebinds the module name to the classifier, so running
# the cell a second time fails.
from sklearn.svm import SVC
svm = SVC(kernel="linear")
svm.fit(x_train, y_train)
y3 = svm.predict(x_test)

In [None]:
y3

In [None]:
s3 = accuracy_score(y_test,y3)
s3

In [None]:
confusion_matrix(y_test,y3)

In [None]:
print(classification_report(y_test,y3))

### DecisionTree Classifier

In [None]:
# DecisionTree Classifier
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
y4 = dt.predict(x_test)

In [None]:
y4

In [None]:
s4 = accuracy_score(y_test,y4)
s4

In [None]:
confusion_matrix(y_test,y4)

In [None]:
print(classification_report(y_test,y4))

### RandomForest Classifier

In [None]:
# RandomForest Classifier
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
y5 = rf.predict(x_test)

In [None]:
y5

In [None]:
s5 = accuracy_score(y_test,y5)
s5

In [None]:
confusion_matrix(y_test,y5)

In [None]:
print(classification_report(y_test,y5))

### Gradient Boosting Classifier

In [None]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gb=GradientBoostingClassifier()
gb.fit(x_train,y_train)
y6 = gb.predict(x_test)

In [None]:
y6

In [None]:
s6 = accuracy_score(y_test,y6)
s6

In [None]:
# Evaluate the Gradient Boosting predictions (y6), not the RandomForest ones (y5).
confusion_matrix(y_test, y6)

In [None]:
# Report on the Gradient Boosting predictions (y6), not the RandomForest ones (y5).
print(classification_report(y_test, y6))

In [None]:
# Gather each classifier's test accuracy, as a percentage, into one table.
model_scores = [
    ('LogisticRegression', s1),
    ('KNN', s2),
    ('SVM', s3),
    ('DecisionTreeClassifier', s4),
    ('RandomForestClassifier', s5),
    ('GradientBoostingClassifier', s6),
]
final_data = pd.DataFrame({
    'Models': [name for name, _ in model_scores],
    'ACC': [score * 100 for _, score in model_scores],
})

In [None]:
final_data

In [None]:
final_data['Models']

In [None]:
import seaborn as sns

In [None]:
sns.barplot(x=final_data['Models'], y = final_data['ACC'], alpha=0.8)
plt.xticks(rotation = 75, fontsize = 13)

In [None]:
df.info()

##  Model Saving

In [None]:
## Standardize the dataset
# NOTE(review): this creates a new, unfitted StandardScaler. No visible cell
# calls fit/fit_transform on `scaler` (the scaler fitted earlier is `sc`),
# so using `scaler.transform` as-is would fail — confirm the intent.
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [None]:
new_data = pd.DataFrame({
    'gender':0,
    'ssc_p':67.0,
    'hsc_p':91.0,
    'hsc_s':1,
    'degree_p':58.0,
    'degree_t':2,
    'workex':0,
    'etest_p':55.0,
     'specialisation':1,
    'mba_p':58.8,   
},index=[0])

In [None]:
# Final model: refit logistic regression on every row of x rather than only
# the training split.
# NOTE(review): x is the unscaled feature matrix, whereas the models evaluated
# above were fitted on the scaled x_train — confirm this is intended.
lr = LogisticRegression()
lr.fit(x, y)

In [None]:
# The model was fitted on a plain array without feature names, so pass the
# sample as an array too; handing it a DataFrame triggers scikit-learn's
# feature-name mismatch warning.
sample = new_data.to_numpy()
p = lr.predict(sample)
prob = lr.predict_proba(sample)
# `p` holds one prediction per row; inspect the single row explicitly rather
# than relying on the truthiness of a one-element array.
if p[0] == 1:
    print('Placed')
    print(f"You will be placed with probability of {prob[0][1]:.2f}")
else:
    print("Not-placed")

In [None]:
import pickle

In [None]:
# Serialize the fitted model; the context manager closes the file handle so
# the pickle is fully flushed to disk.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)

In [None]:
# Reload the model to check that the pickle round-trips.
# NOTE: only unpickle files you trust; pickle can execute arbitrary code.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Sanity-check the reloaded model on the single sample row. The pickled model
# was fitted on unscaled features, so the sample is passed through unscaled.
# The `scaler` object created in this section is never fitted, and reshaping
# the whole matrix `x` into one row would not match the model's feature count.
model.predict(new_data.to_numpy())