In [1]:
# TA Performance Classification - Capstone Project
## Teaching Assistant Evaluation Data Analysis & Modeling

In [2]:
# 1. IMPORTS & SETUP
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Configure visualization style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
print('All imports successful!')

All imports successful!


In [3]:

df = pd.read_csv('tae.csv', header=None)
df.columns = ['native_speaker', 'course_instructor', 'course', 'semester', 'class_size', 'class_attribute']

print('Dataset Shape:', df.shape)
print('\nFirst 5 rows:')
print(df.head())

print('Data Info:')
print(df.info())

print('Class Distribution:')
print(df['class_attribute'].value_counts().sort_index())

Dataset Shape: (151, 6)

First 5 rows:
   native_speaker  course_instructor  course  semester  class_size  \
0               1                 23       3         1          19   
1               2                 15       3         1          17   
2               1                 23       3         2          49   
3               1                  5       2         2          33   
4               2                  7      11         2          55   

   class_attribute  
0                3  
1                3  
2                3  
3                3  
4                3  
Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   native_speaker     151 non-null    int64
 1   course_instructor  151 non-null    int64
 2   course             151 non-null    int64
 3   semester           151 non-null    int64
 4   class_size        

In [4]:

# Separate features and target
X = df.drop('class_attribute', axis=1)
y = df['class_attribute']

# Identify numeric and categorical features
numeric_features = ['class_size']
categorical_features = ['native_speaker', 'course_instructor', 'course', 'semester']

# Create preprocessing pipeline
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
        transformers=[
                    ('num', numeric_transformer, numeric_features),
                    ('cat', categorical_transformer, categorical_features),
                ]
    )

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
print(f'Training set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')

Training set size: 120
Test set size: 31


In [5]:
# 4. MODEL TRAINING & COMPARISON
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

results = []
model_pipelines = {}

for name, model in models.items():
    clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    clf.fit(X_train, y_train)
    model_pipelines[name] = clf
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cv_scores = cross_val_score(clf, X, y, cv=5)
    
    results.append({
        'Model': name,
        'Test_Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1_Score': f1,
        'CV_Mean': cv_scores.mean(),
        'CV_Std': cv_scores.std()
    })

    # Create results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Test_Accuracy', ascending=False)
print('\n=== MODEL COMPARISON REPORT ===')
print(results_df.to_string(index=False))


=== MODEL COMPARISON REPORT ===
              Model  Test_Accuracy  Precision   Recall  F1_Score  CV_Mean   CV_Std
      Random Forest       0.677419   0.682698 0.677419  0.678955 0.686667 0.268825
  Gradient Boosting       0.548387   0.609901 0.548387  0.554351 0.680215 0.234642
Logistic Regression       0.516129   0.515803 0.516129  0.515240 0.587957 0.136388
        Naive Bayes       0.451613   0.589785 0.451613  0.415430 0.429677 0.109396
                KNN       0.290323   0.299731 0.290323  0.262737 0.409677 0.082186
