### 1. Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, accuracy_score, confusion_matrix
from sklearn.feature_selection import f_regression
from statsmodels.stats.weightstats import ttest_ind, ztest
from statsmodels.stats import weightstats as stests
from statsmodels.stats.anova import anova_lm


In [None]:
2. Loading the Dataset

In [2]:
college_placement = pd.read_csv('collegePlace.csv')

In [3]:
college_placement

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,22,Male,Electronics And Communication,1,8,1,1,1
1,21,Female,Computer Science,0,7,1,1,1
2,22,Female,Information Technology,1,6,0,0,1
3,21,Male,Information Technology,0,8,0,1,1
4,22,Male,Mechanical,0,8,1,0,1
...,...,...,...,...,...,...,...,...
2961,23,Male,Information Technology,0,7,0,0,0
2962,23,Male,Mechanical,1,7,1,0,0
2963,22,Male,Information Technology,1,7,0,0,0
2964,22,Male,Computer Science,1,7,0,0,0


In [None]:
3. Variance

In [4]:
variance = college_placement.var()

  variance = college_placement.var()


In [5]:
variance

Age                  1.755449
Internships          0.547892
CGPA                 0.936536
Hostel               0.196728
HistoryOfBacklogs    0.155298
PlacedOrNot          0.247317
dtype: float64

In [8]:
# Checking for missing values
college_placement.isnull().sum()

Age                  0
Gender               0
Stream               0
Internships          0
CGPA                 0
Hostel               0
HistoryOfBacklogs    0
PlacedOrNot          0
dtype: int64

In [9]:
# Summary statistics
college_placement.describe()

Unnamed: 0,Age,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
count,2966.0,2966.0,2966.0,2966.0,2966.0,2966.0
mean,21.48584,0.703641,7.073837,0.269049,0.192178,0.552596
std,1.324933,0.740197,0.967748,0.44354,0.394079,0.49731
min,19.0,0.0,5.0,0.0,0.0,0.0
25%,21.0,0.0,6.0,0.0,0.0,0.0
50%,21.0,1.0,7.0,0.0,0.0,1.0
75%,22.0,1.0,8.0,1.0,0.0,1.0
max,30.0,3.0,9.0,1.0,1.0,1.0


In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
#Instantiate LabelEncoder and Fit-Transform the "Gender" column
label_encoder = LabelEncoder()
#encoding Gender column
college_placement['Gender'] = label_encoder.fit_transform(college_placement['Gender'])

In [12]:
#Verify the Encoding
print("Encoded Gender values:", college_placement['Gender'].unique())

Encoded Gender values: [1 0]


In [13]:
college_placement

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,22,1,Electronics And Communication,1,8,1,1,1
1,21,0,Computer Science,0,7,1,1,1
2,22,0,Information Technology,1,6,0,0,1
3,21,1,Information Technology,0,8,0,1,1
4,22,1,Mechanical,0,8,1,0,1
...,...,...,...,...,...,...,...,...
2961,23,1,Information Technology,0,7,0,0,0
2962,23,1,Mechanical,1,7,1,0,0
2963,22,1,Information Technology,1,7,0,0,0
2964,22,1,Computer Science,1,7,0,0,0


In [14]:
college_placement['Stream'].unique()

array(['Electronics And Communication', 'Computer Science',
       'Information Technology', 'Mechanical', 'Electrical', 'Civil'],
      dtype=object)

In [15]:
#encoding Stream column
college_placement['Stream'] = label_encoder.fit_transform(college_placement['Stream'])

In [16]:
college_placement['Stream'].unique()

array([3, 1, 4, 5, 2, 0])

In [17]:
college_placement

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,22,1,3,1,8,1,1,1
1,21,0,1,0,7,1,1,1
2,22,0,4,1,6,0,0,1
3,21,1,4,0,8,0,1,1
4,22,1,5,0,8,1,0,1
...,...,...,...,...,...,...,...,...
2961,23,1,4,0,7,0,0,0
2962,23,1,5,1,7,1,0,0
2963,22,1,4,1,7,0,0,0
2964,22,1,1,1,7,0,0,0


In [None]:
4. Z-Score Method, Z test

In [18]:
z_scores = stats.zscore(college_placement)

In [19]:
z_scores

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,0.388131,0.445403,0.264655,0.400445,0.957191,1.648269,2.050246,0.899800
1,-0.366752,-2.245158,-0.944846,-0.950773,-0.076310,1.648269,2.050246,0.899800
2,0.388131,-2.245158,0.869405,0.400445,-1.109812,-0.606697,-0.487746,0.899800
3,-0.366752,0.445403,0.869405,-0.950773,0.957191,-0.606697,2.050246,0.899800
4,0.388131,0.445403,1.474156,-0.950773,0.957191,1.648269,-0.487746,0.899800
...,...,...,...,...,...,...,...,...
2961,1.143013,0.445403,0.869405,-0.950773,-0.076310,-0.606697,-0.487746,-1.111358
2962,1.143013,0.445403,1.474156,0.400445,-0.076310,1.648269,-0.487746,-1.111358
2963,0.388131,0.445403,0.869405,0.400445,-0.076310,-0.606697,-0.487746,-1.111358
2964,0.388131,0.445403,-0.944846,0.400445,-0.076310,-0.606697,-0.487746,-1.111358


In [None]:
5. T test

In [20]:
t_test = stats.ttest_1samp(college_placement['CGPA'], 7)

In [21]:
t_test

TtestResult(statistic=4.15524145025345, pvalue=3.3413287076854306e-05, df=2965)

In [None]:
6. Hypothesis Test

In [22]:
hyp_test = stats.ttest_ind(college_placement['CGPA'], college_placement['Internships'])


In [23]:
hyp_test

Ttest_indResult(statistic=284.74670038219386, pvalue=0.0)

In [None]:
7. T-Test

In [24]:
t_test = stats.ttest_rel(college_placement['CGPA'], college_placement['Internships'])


In [25]:
t_test

TtestResult(statistic=288.03114204741604, pvalue=0.0, df=2965)

In [None]:
8. 2 Sample Hypo(Mean) Test

In [26]:
t_test_2sample = stats.ttest_ind(college_placement['CGPA'], college_placement['Internships'])

In [27]:
t_test_2sample

Ttest_indResult(statistic=284.74670038219386, pvalue=0.0)

In [None]:
9. Paired T-Test

In [28]:
paired_t_test = stats.ttest_rel(college_placement['CGPA'], college_placement['Internships'])

In [29]:
paired_t_test

TtestResult(statistic=288.03114204741604, pvalue=0.0, df=2965)

In [None]:
10. Chi Square Test

In [31]:
#chi_square = stats.chisquare(college_placement['Stream'], college_placement['Gender'])

In [None]:
11. ANOVA

In [32]:
anova = stats.f_oneway(college_placement['CGPA'], college_placement['Internships'])

In [33]:
anova

F_onewayResult(statistic=81080.68337854675, pvalue=0.0)

In [None]:
12. Preprocessing

In [34]:
X = college_placement.drop('PlacedOrNot', axis=1)
y = college_placement['PlacedOrNot']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
13. Linear Regression Model

In [35]:
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train_scaled, y_train)

In [None]:
14. Find Adjusted R2

In [36]:
n = len(X_train_scaled)
p = X_train_scaled.shape[1]
r_squared = lin_reg_model.score(X_train_scaled, y_train)
adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)

In [37]:
r_squared

0.40180128796481795

In [38]:
adjusted_r_squared

0.40002997198163426

In [None]:
15. Logistic Regression Model

In [39]:
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train_scaled, y_train)

In [None]:
16. Decision Tree Model

In [40]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_scaled, y_train)

In [None]:
17. Model Evaluation

In [41]:
# Linear Regression
lin_reg_pred = lin_reg_model.predict(X_test_scaled)
lin_reg_r2 = r2_score(y_test, lin_reg_pred)

# Logistic Regression
log_reg_pred = log_reg_model.predict(X_test_scaled)
log_reg_accuracy = accuracy_score(y_test, log_reg_pred)

# Decision Tree
dt_pred = dt_model.predict(X_test_scaled)
dt_accuracy = accuracy_score(y_test, dt_pred)

best_model = max(lin_reg_r2, log_reg_accuracy, dt_accuracy)

In [None]:
Conclusion

In [42]:
if best_model == lin_reg_r2:
    print("Linear Regression model is the best.")
elif best_model == log_reg_accuracy:
    print("Logistic Regression model is the best.")
else:
    print("Decision Tree model is the best.")


Decision Tree model is the best.
