In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [15]:
# Load the dataset
url = 'https://raw.githubusercontent.com/dsrscientist/dataset4/main/Grades.csv'
df = pd.read_csv(url)

In [18]:
# Print the first few rows of the dataset
print(df.head())

   Seat No. PH-121 HS-101 CY-105 HS-105/12 MT-111 CS-105 CS-106 EL-102 EE-119  \
0  CS-97001     B-     D+     C-         C     C-     D+      D     C-     B-   
1  CS-97002      A      D     D+         D     B-      C      D      A     D+   
2  CS-97003      A      B      A        B-     B+      A     B-     B+     A-   
3  CS-97004      D     C+     D+         D      D     A-     D+     C-      D   
4  CS-97005     A-     A-     A-        B+      A      A     A-     B+      A   

   ... CS-312 CS-317 CS-403 CS-421 CS-406 CS-414 CS-419 CS-423 CS-412   CGPA  
0  ...     C-     C-     C-     C-     A-      A     C-      B     A-  2.205  
1  ...     D+      D      C      D     A-     B-      C      C      B  2.008  
2  ...      B      B      A      C      A      A      A     A-      A  3.608  
3  ...     D+      C     D+     C-     B-      B     C+     C+     C+  1.906  
4  ...     B-     B+     B+     B-     A-      A     A-     A-      A  3.448  

[5 rows x 43 columns]


In [20]:
# Check the shape of the dataset
print('Shape:', df.shape)

Shape: (571, 43)


In [21]:
# Check the data types of the columns
print(df.dtypes)

Seat No.      object
PH-121        object
HS-101        object
CY-105        object
HS-105/12     object
MT-111        object
CS-105        object
CS-106        object
EL-102        object
EE-119        object
ME-107        object
CS-107        object
HS-205/20     object
MT-222        object
EE-222        object
MT-224        object
CS-210        object
CS-211        object
CS-203        object
CS-214        object
EE-217        object
CS-212        object
CS-215        object
MT-331        object
EF-303        object
HS-304        object
CS-301        object
CS-302        object
TC-383        object
MT-442        object
EL-332        object
CS-318        object
CS-306        object
CS-312        object
CS-317        object
CS-403        object
CS-421        object
CS-406        object
CS-414        object
CS-419        object
CS-423        object
CS-412        object
CGPA         float64
dtype: object


In [22]:

print('Missing values:', df.isna().sum().sum())

Missing values: 425


In [25]:
X = df.iloc[:, 1:-1] 
y = df['CGPA']

In [26]:
X = df.iloc[:, 1:]
y = df['CGPA']

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:
if 'A' in X_train.columns:
    X_train_encoded = pd.get_dummies(X_train, columns=['A'])
else:
    # handle the case where column 'A' is not present in X_train
    X_train_encoded = X_train.copy()



In [55]:
X_test_encoded = pd.get_dummies(X_test)


In [56]:
from sklearn.preprocessing import LabelEncoder

In [57]:
# Assuming X_train and y_train are the training data
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)



In [84]:

from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Fit the encoder on the training data
encoder.fit(y_train)

# Transform the test data using the fitted encoder
y_test_encoded = encoder.transform(y_test)

# Inverse transform the encoded labels to original labels
y_test_decoded = encoder.inverse_transform(y_test_encoded)

# Check for unseen labels in y_test
unseen_labels = set(y_test) - set(y_test_decoded)
if unseen_labels:
    y_test_decoded = [label for label in y_test_decoded if label not in unseen_labels]


In [87]:
# Check that the column name exists in the dataframe
if 'category' in X_train.columns:
    # Fit the encoder on the training data
    encoder.fit(X_train['category'])

    # Transform the test data using the fitted encoder
    X_test_encoded = encoder.transform(X_test['category'])

    # Get a list of all the classes used for encoding
    classes = list(encoder.classes_)

    # Check for unseen labels in the test set
    unseen_labels = set(y_test) - set(classes)

    # Remove the unseen labels from y_test
    if unseen_labels:
        y_test = [label for label in y_test if label not in unseen_labels]
else:
    print("Column 'category' does not exist in X_train.")




Column 'category' does not exist in X_train.


In [66]:
mask = np.isin(y_test_encoded, np.unique(y_train_encoded))
idx = np.where(mask)[0]
y_test_encoded = y_test_encoded[idx]
X_test = X_test.iloc[idx, :]


In [69]:
# create the LabelEncoder object
encoder = LabelEncoder()




In [74]:
## check if the 'category' column exists in X_test
if 'category' in X_test.columns:
    X_test['category'] = X_test['category'].astype('category')
    X_test['category'] = X_test['category'].cat.set_categories(encoder.classes_)
else:
    # handle the case when the 'category' column is not present in X_test
    print("The 'category' column does not exist in X_test")


The 'category' column does not exist in X_test


In [90]:
print(X_train.columns)


Index(['PH-121', 'HS-101', 'CY-105', 'HS-105/12', 'MT-111', 'CS-105', 'CS-106',
       'EL-102', 'EE-119', 'ME-107', 'CS-107', 'HS-205/20', 'MT-222', 'EE-222',
       'MT-224', 'CS-210', 'CS-211', 'CS-203', 'CS-214', 'EE-217', 'CS-212',
       'CS-215', 'MT-331', 'EF-303', 'HS-304', 'CS-301', 'CS-302', 'TC-383',
       'MT-442', 'EL-332', 'CS-318', 'CS-306', 'CS-312', 'CS-317', 'CS-403',
       'CS-421', 'CS-406', 'CS-414', 'CS-419', 'CS-423', 'CS-412', 'CGPA'],
      dtype='object')
