In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [65]:
df = pd.read_csv('student-scores.csv')
df.head()

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


In [66]:
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=[np.number]).columns.drop('math_score')

In [67]:
X = df.drop('math_score', axis=1)
y = df['math_score']

In [68]:
numerical_transformer = StandardScaler()

In [69]:
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

X = preprocessor.fit_transform(X)

# Spliting data into training and testing sets

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [71]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'Model: {model.__class__.__name__}')
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')
    print('-'*30)

# Linear Regression

In [72]:
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
evaluate_model(linear_reg, X_train, X_test, y_train, y_test)

Model: LinearRegression
Mean Squared Error: 166.0882212955052
R^2 Score: -0.0008618962819777476
------------------------------


# Ridge Regression

In [73]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=1.0)
evaluate_model(ridge_reg, X_train, X_test, y_train, y_test)


Model: Ridge
Mean Squared Error: 151.14396489001803
R^2 Score: 0.0891934772288755
------------------------------


# Lasso Regression

In [74]:
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=0.1)
evaluate_model(lasso_reg, X_train, X_test, y_train, y_test)


Model: Lasso
Mean Squared Error: 125.59509226679944
R^2 Score: 0.243153179500871
------------------------------


# ElasticNet Regression

In [75]:
from sklearn.linear_model import ElasticNet

elastic_net_reg = ElasticNet(alpha=0.1, l1_ratio=0.5)
evaluate_model(elastic_net_reg, X_train, X_test, y_train, y_test)


Model: ElasticNet
Mean Squared Error: 125.747757517126
R^2 Score: 0.24223320558130956
------------------------------


# Decision Tree Regression

In [76]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
evaluate_model(tree_reg, X_train, X_test, y_train, y_test)


Model: DecisionTreeRegressor
Mean Squared Error: 239.04
R^2 Score: -0.4404755847290094
------------------------------


# Random Forest Regression

In [77]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
evaluate_model(forest_reg, X_train, X_test, y_train, y_test)


Model: RandomForestRegressor
Mean Squared Error: 141.584788
R^2 Score: 0.14679789874902605
------------------------------


#  Support Vector Regression (SVR)

In [78]:
from sklearn.svm import SVR

svr_reg = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
evaluate_model(svr_reg, X_train, X_test, y_train, y_test)


Model: SVR
Mean Squared Error: 144.09719332488498
R^2 Score: 0.1316579283280086
------------------------------


# K-Nearest Neighbors Regression

In [79]:
from sklearn.neighbors import KNeighborsRegressor

knn_reg = KNeighborsRegressor(n_neighbors=5)
evaluate_model(knn_reg, X_train, X_test, y_train, y_test)


Model: KNeighborsRegressor
Mean Squared Error: 164.8129
R^2 Score: 0.006823299454552512
------------------------------


# clustering

In [80]:
df.columns

Index(['id', 'first_name', 'last_name', 'email', 'gender', 'part_time_job',
       'absence_days', 'extracurricular_activities', 'weekly_self_study_hours',
       'career_aspiration', 'math_score', 'history_score', 'physics_score',
       'chemistry_score', 'biology_score', 'english_score', 'geography_score'],
      dtype='object')

In [81]:
numeric_columns = ['absence_days', 'weekly_self_study_hours', 
                    'math_score', 'history_score', 'physics_score', 
                    'chemistry_score', 'biology_score', 'english_score', 'geography_score']

In [82]:
X_clustering = df[numeric_columns]

In [83]:
scaler = StandardScaler()
X_clustering_transformed = scaler.fit_transform(X_clustering)

# Hierarchical Clustering

In [84]:
from sklearn.cluster import AgglomerativeClustering

agg_clustering = AgglomerativeClustering(n_clusters=3)
agg_clustering.fit(X_clustering_transformed)
print(f'Labels: {agg_clustering.labels_}')


Labels: [0 2 0 ... 0 1 2]


# K-Means Clustering

In [85]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_clustering_transformed)
print(f'Cluster Centers: {kmeans.cluster_centers_}')
print(f'Labels: {kmeans.labels_}')


Cluster Centers: [[ 0.94575144 -1.04336749 -0.98274146 -0.54933653 -0.33511805 -0.38736997
  -0.20363512 -0.46797718  0.02386433]
 [-0.24878994  0.69234716  0.34376462  0.37068439  0.44360669  0.32366183
   0.53876027  0.32684354  0.7608758 ]
 [-0.36434864  0.04436249  0.30565275  0.01808461 -0.17506733 -0.04053413
  -0.33674506  0.0059424  -0.66567281]]
Labels: [1 1 1 ... 2 0 1]


# DBSCAN Clustering

In [86]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(X_clustering_transformed)
print(f'Labels: {dbscan.labels_}')


Labels: [-1 -1 -1 ... -1 -1 -1]


# Dimensionality Reduction

 ## PCA

In [87]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_clustering_transformed)
print(f'PCA Explained Variance Ratio: {pca.explained_variance_ratio_}')
print(f'PCA Results: {X_pca[:5]}')


PCA Explained Variance Ratio: [0.2398923  0.12012718]
PCA Results: [[-0.5623188  -0.50218792]
 [-3.09130846 -0.8171628 ]
 [ 0.07186369 -1.45479508]
 [ 1.37674374 -1.64928491]
 [ 1.5065839   0.62711889]]


## t-SNE

In [88]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_clustering_transformed)
print(f't-SNE Results: {X_tsne[:5]}')


t-SNE Results: [[-26.970043  -17.966206 ]
 [-61.66499    -3.337677 ]
 [ 40.525494  -14.22424  ]
 [ 29.499487   14.863927 ]
 [ 40.270607    1.1094366]]


# classification

In [89]:
features = ['absence_days', 'weekly_self_study_hours', 'math_score', 
            'history_score', 'physics_score', 'chemistry_score', 
            'biology_score', 'english_score', 'geography_score']
target = 'part_time_job'

In [90]:
label_encoder = LabelEncoder()
df[target] = label_encoder.fit_transform(df[target])

In [91]:
X = df[features]
y = df[target]

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [93]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression

In [94]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Logistic Regression - Accuracy: {accuracy:.2f}')


Logistic Regression - Accuracy: 0.85


# Random Forest Classifier

In [95]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Random Forest - Accuracy: {accuracy:.2f}')


Random Forest - Accuracy: 0.84


# K-Nearest Neighbors Classifier

In [96]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'K-Nearest Neighbors - Accuracy: {accuracy:.2f}')


K-Nearest Neighbors - Accuracy: 0.83
