In [17]:
# Internship Project - Task 2: Student Success Prediction
# Description:
# This project builds a machine learning classification model to predict student success in internship programs
# based on academic performance, participation, and skill metrics.

# Step 1: Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Step 2: Load Dataset
df = pd.read_csv('student_success_data.csv')
print("Dataset Preview:")
print(df.head())

Dataset Preview:
   StudentID  CGPA  ProjectsDone  ParticipationHours  Domain  \
0       1000  8.24             0                  22  WebDev   
1       1001  6.49             0                  39      AI   
2       1002  7.48             0                  10  WebDev   
3       1003  7.77             0                  40  WebDev   
4       1004  8.27             4                  31  WebDev   

  CommunicationSkills  Success  
0                 Low        0  
1                High        0  
2                 Low        0  
3                High        1  
4              Medium        1  


In [5]:
# Step 3: Preprocessing
label_encoders = {}
for col in ['Domain', 'CommunicationSkills']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [7]:
# Features and Target
X = df.drop(['StudentID', 'Success'], axis=1)
y = df['Success']


In [9]:
# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [13]:
# Step 5: Train Models and Evaluate
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\nModel: {name}")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.88      1.00      0.93        14
           1       1.00      0.67      0.80         6

    accuracy                           0.90        20
   macro avg       0.94      0.83      0.87        20
weighted avg       0.91      0.90      0.89        20

Confusion Matrix:
[[14  0]
 [ 2  4]]

Model: Random Forest
              precision    recall  f1-score   support

           0       0.88      1.00      0.93        14
           1       1.00      0.67      0.80         6

    accuracy                           0.90        20
   macro avg       0.94      0.83      0.87        20
weighted avg       0.91      0.90      0.89        20

Confusion Matrix:
[[14  0]
 [ 2  4]]

Model: SVM
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        14
           1       1.00      0.17      0.29         6

    accuracy                           0.75   

In [15]:
# Step 6: Conclusion
print("""
Conclusion:
- We trained three classifiers to predict student success in internships.
- Based on evaluation metrics, the most accurate model can be recommended for use.
- This model helps MITS understand what factors contribute to internship success and can guide future student preparation.
""")


Conclusion:
- We trained three classifiers to predict student success in internships.
- Based on evaluation metrics, the most accurate model can be recommended for use.
- This model helps MITS understand what factors contribute to internship success and can guide future student preparation.

