<a href="https://colab.research.google.com/github/Chiyam-Chandar/Teaching-Assistant-Performance-Prediction/blob/main/Teaching_Assistant_Performance_Prediction_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Teaching-Assistant-Performance-Prediction

Step 1: Data Collection & Understanding


In [2]:
#Imports the Pandas library, which is used for data analysis and data manipulation in Python.
import pandas as pd


df = pd.read_csv('teaching_assistant_data.csv')
df.head(8)


Unnamed: 0,1,23,3,1.1,19,3.1
0,2,15,3,1,17,3
1,1,23,3,2,49,3
2,1,5,2,2,33,3
3,2,7,11,2,55,3
4,2,23,3,1,20,3
5,2,9,5,2,19,3
6,2,10,3,2,27,3
7,1,22,3,1,58,3


In [3]:
#This code is used to check the structure of the dataset after loading it.
print("Rows, Columns:", df.shape)
print("Columns:", df.columns.tolist())


Rows, Columns: (150, 6)
Columns: ['1', '23', '3', '1.1', '19', '3.1']


In [5]:
#This command provides a summary of the datasetâ€™s information.
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   1       150 non-null    int64
 1   23      150 non-null    int64
 2   3       150 non-null    int64
 3   1.1     150 non-null    int64
 4   19      150 non-null    int64
 5   3.1     150 non-null    int64
dtypes: int64(6)
memory usage: 7.2 KB


In [6]:
#This command shows important information about the dataset such as columns, data types, and missing values.
df.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1.0,150.0,1.813333,0.390949,1.0,2.0,2.0,2.0,2.0
23.0,150.0,13.58,6.805318,1.0,8.0,13.0,20.0,25.0
3.0,150.0,8.14,7.034937,1.0,3.0,4.5,15.0,26.0
1.1,150.0,1.853333,0.354958,1.0,2.0,2.0,2.0,2.0
19.0,150.0,27.926667,12.916405,3.0,19.0,27.0,37.0,66.0
3.1,150.0,2.013333,0.819123,1.0,1.0,2.0,3.0,3.0


In [9]:
#This code is used to analyze the distribution of values in selected columns of the dataset.
(df['3.1'].value_counts())
(df['3.1'].value_counts(normalize=True).round(3))

(df['1'].value_counts())
(df['1.1'].value_counts())

Unnamed: 0_level_0,count
1.1,Unnamed: 1_level_1
2,128
1,22


In [11]:
#This code identifies, counts, and sorts missing values across columns and displays only those with incomplete data for targeted cleaning
missing = df.isnull().sum().sort_values(ascending=False)
missing[missing>0]


Unnamed: 0,0


In [24]:
#This code iterates through all columns, identifies the number of unique values per column,
# and displays frequency distributions for columns with fewer than 30 unique values to analyze categorical features
for col in df.columns:
    (col, "->", df[col].nunique(), "unique values")
    if df[col].nunique() < 30:
        (df[col].value_counts())
        ()


In [28]:
#This code generates a textual summary of the dataset (shape, columns, missing values,
#and target class distribution) and saves it to a file named data_summary.txt
summary_text = []
summary_text.append(f"Shape: {df.shape}")
summary_text.append("Columns: " + ", ".join(df.columns))
summary_text.append("Missing values:\n" + str(missing[missing>0].to_dict()))
summary_text.append("Target class counts:\n" + str(df['3.1'].value_counts().to_dict()))

with open('data_summary.txt','w') as f:
    f.write("\\n\\n".join(summary_text))


## Step 2: Data Cleaning & Preparation

In [30]:
#This code imports necessary machine-learning libraries, loads the Teaching Assistant dataset into a DataFrame,
# and displays the first few rows to examine the structure of the data.

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("teaching_assistant_data.csv")

df.head()


Unnamed: 0,1,23,3,1.1,19,3.1
0,2,15,3,1,17,3
1,1,23,3,2,49,3
2,1,5,2,2,33,3
3,2,7,11,2,55,3
4,2,23,3,1,20,3


In [32]:
#This code renames dataset columns with meaningful labels
#and displays the first five records to verify the updated structure.

df.columns = [
    "Native_teacher",    # column 0
    "Instructor",        # column 1
    "Course",            # column 2
    "Semester",          # column 3
    "Class size",        # column 4
    "Class attribute"    # column 5
]

df.head()


Unnamed: 0,Native_teacher,Instructor,Course,Semester,Class size,Class attribute
0,2,15,3,1,17,3
1,1,23,3,2,49,3
2,1,5,2,2,33,3
3,2,7,11,2,55,3
4,2,23,3,1,20,3


In [33]:
#This code fills missing numerical values using the median and
#categorical values using the mode, then verifies that all missing data has been successfully handled

df['Class size'] = df['Class size'].fillna(df['Class size'].median())

df['Instructor'] = df['Instructor'].fillna(df['Instructor'].mode()[0])
df['Course'] = df['Course'].fillna(df['Course'].mode()[0])
df['Semester'] = df['Semester'].fillna(df['Semester'].mode()[0])
df['Native_teacher'] = df['Native_teacher'].fillna(df['Native_teacher'].mode()[0])

df.isnull().sum()


Unnamed: 0,0
Native_teacher,0
Instructor,0
Course,0
Semester,0
Class size,0
Class attribute,0


In [34]:
#This code applies Label Encoding to convert categorical columns (Instructor and Course)
#into numerical values and displays the updated dataset for verification.

le = LabelEncoder()

df['Instructor'] = le.fit_transform(df['Instructor'])
df['Course'] = le.fit_transform(df['Course'])


df.head()


Unnamed: 0,Native_teacher,Instructor,Course,Semester,Class size,Class attribute
0,2,14,2,1,17,3
1,1,22,2,2,49,3
2,1,4,1,2,33,3
3,2,6,10,2,55,3
4,2,22,2,1,20,3


In [35]:
#The code uses StandardScaler to standardize the Class size column, giving it a mean of 0 and standard deviation of 1.
#This ensures the feature is scaled properly for analysis or machine learning models.

scaler = StandardScaler()
df['Class size'] = scaler.fit_transform(df[['Class size']])

df.head()


Unnamed: 0,Native_teacher,Instructor,Course,Semester,Class size,Class attribute
0,2,14,2,1,-0.848787,3
1,1,22,2,2,1.636983,3
2,1,4,1,2,0.394098,3
3,2,6,10,2,2.103064,3
4,2,22,2,1,-0.615746,3


In [36]:
#The code splits the dataset into features (X) and target (y) and then divides it into training (80%) and testing (20%) sets.
#This prepares the data for building and evaluating a machine learning model

X = df.drop('Class attribute', axis=1)
y = df['Class attribute']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

("Training Data Shape:", X_train.shape)
("Testing Data Shape:", X_test.shape)


('Testing Data Shape:', (30, 5))