In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor  # Since 'Assessment score' is numerical
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [2]:
# Load the dataset
data = pd.read_csv('k12.csv')  # Replace 'filename.csv' with your file

# Display the first few rows
print(data.head())

# Display dataset information
print(data.info())

           Name  Gender    Phone Number  Age    Indian City    Indian State  \
0  VivaanSharma    Male  +91 6238846143    5           Agra   Uttar Pradesh   
1   KiaraSharma  Female  +91 8385681260   11      Hyderabad       Telangana   
2      MyraKhan  Female  +91 8372960268   10  Visakhapatnam  Andhra Pradesh   
3    RohanVerma    Male  +91 8403856921   14    Bhubaneswar          Odisha   
4     SiyaSingh  Female  +91 9904933545    4      Hyderabad       Telangana   

                    Track  Time Used Per Day (hrs)  Speed of Learning (1-10)  \
0                 History                     6.14                      2.80   
1                Politics                     7.26                      4.35   
2             Mathematics                     6.15                      2.49   
3  Basic Coding Languages                     3.60                      3.33   
4                 Science                     1.06                      1.59   

   Assessment Score  Historical Assessment A

In [3]:
# Drop duplicate rows (if any)
data.drop_duplicates(inplace=True)

# Fill missing values
for column in data.columns:
    if data[column].dtype == 'object':  # For categorical data
        data[column].fillna('Unknown', inplace=True)
    else:  # For numerical data
        data[column].fillna(data[column].mean(), inplace=True)

In [4]:
label_encoders = {}
for column in ['Gender', 'Indian City', 'Indian State', 'Track']:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le  # Store encoders for later use

In [5]:
scaler = StandardScaler()
numerical_columns = ['Age', 'Time Used Per Day (hrs)', 'Speed of Learning (1-10)', 'Historical Assessment Average']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [6]:
# Define features (X) and target variable (y)
X = data.drop(['Name', 'Phone Number', 'Assessment Score'], axis=1)  # Drop non-relevant columns
y = data['Assessment Score']

# Split into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
print(data.dtypes)

Name                              object
Gender                             int32
Phone Number                      object
Age                              float64
Indian City                        int32
Indian State                       int32
Track                              int32
Time Used Per Day (hrs)          float64
Speed of Learning (1-10)         float64
Assessment Score                   int64
Historical Assessment Average    float64
dtype: object


In [8]:
# Identify all object (string) columns
categorical_columns = data.select_dtypes(include='object').columns

# Encode them
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column].astype(str))  # Ensure all are strings before encoding
    label_encoders[column] = le


In [9]:
print(data.dtypes)

Name                               int32
Gender                             int32
Phone Number                       int32
Age                              float64
Indian City                        int32
Indian State                       int32
Track                              int32
Time Used Per Day (hrs)          float64
Speed of Learning (1-10)         float64
Assessment Score                   int64
Historical Assessment Average    float64
dtype: object


In [10]:
# Identify non-numeric columns
non_numeric_columns = data.select_dtypes(include='object').columns

# Display unique values in each non-numeric column
for col in non_numeric_columns:
    print(f"Column: {col}")
    print(data[col].unique())
    print("-" * 40)


In [11]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns properly
label_encoders = {}
for col in non_numeric_columns:
    data[col].fillna('Unknown', inplace=True)  # Handle NaN values
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))  # Ensure proper encoding
    label_encoders[col] = le


In [12]:
print(data.dtypes)

Name                               int32
Gender                             int32
Phone Number                       int32
Age                              float64
Indian City                        int32
Indian State                       int32
Track                              int32
Time Used Per Day (hrs)          float64
Speed of Learning (1-10)         float64
Assessment Score                   int64
Historical Assessment Average    float64
dtype: object


In [13]:
# Identify rows that are causing the issue
problem_rows = X_train[~X_train.map(lambda x: pd.to_numeric(x, errors='coerce')).notna().all(axis=1)]

print("Problematic Rows Found:")
print(problem_rows)


Problematic Rows Found:
Empty DataFrame
Columns: [Gender, Age, Indian City, Indian State, Track, Time Used Per Day (hrs), Speed of Learning (1-10), Historical Assessment Average]
Index: []


In [14]:
# Encoding categorical columns again (if needed)
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in data.select_dtypes(include='object').columns:
    data[col] = data[col].astype(str).fillna('Unknown')
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [15]:
# Identify all object (string) columns
categorical_columns = data.select_dtypes(include='object').columns
print("Categorical columns:", categorical_columns)

Categorical columns: Index([], dtype='object')


In [16]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
label_encoders = {}
for col in categorical_columns:
    data[col] = data[col].astype(str).fillna('Unknown')  # Ensure no NaN values
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])  # Encode the values
    label_encoders[col] = le  # Save encoder for inverse transformation if needed

In [17]:
print(data.head())
print(data.dtypes)

   Name  Gender  Phone Number       Age  Indian City  Indian State  Track  \
0   178       1            31 -1.392027            0            15      3   
1   107       0           308 -0.354234            8            14      7   
2   113       0           307 -0.527199           30             0      5   
3   154       1           311  0.164663            3            10      0   
4   170       0           490 -1.564993            8            14      8   

   Time Used Per Day (hrs)  Speed of Learning (1-10)  Assessment Score  \
0                 0.800455                 -0.924008                83   
1                 1.347420                 -0.342946                65   
2                 0.805338                 -1.040221                96   
3                -0.439985                 -0.725323                86   
4                -1.680424                 -1.377612                87   

   Historical Assessment Average  
0                      -0.776040  
1                     

In [30]:
# Find non-numeric values in X_train
non_numeric_values = X_train.applymap(lambda x: isinstance(x, str))

# Show rows with string values
problem_rows = X_train[non_numeric_values.any(axis=1)]
print("Problematic Rows Found:")
print(problem_rows)


Problematic Rows Found:
Empty DataFrame
Columns: [Gender, Age, Indian City, Indian State, Track, Time Used Per Day (hrs), Speed of Learning (1-10), Historical Assessment Average]
Index: []


  non_numeric_values = X_train.applymap(lambda x: isinstance(x, str))


In [18]:
# Find non-numeric values in X_train using map()
non_numeric_values = X_train.map(lambda x: isinstance(x, str))

# Show rows with string values
problem_rows = X_train[non_numeric_values.any(axis=1)]
print("Problematic Rows Found:")
print(problem_rows)


Problematic Rows Found:
Empty DataFrame
Columns: [Gender, Age, Indian City, Indian State, Track, Time Used Per Day (hrs), Speed of Learning (1-10), Historical Assessment Average]
Index: []


In [19]:
# Identify columns with non-numeric values
categorical_columns = problem_rows.columns[non_numeric_values.any()]
print("Categorical Columns Found:", categorical_columns)

Categorical Columns Found: Index([], dtype='object')


In [20]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
label_encoders = {}
for col in categorical_columns:
    X_train[col] = X_train[col].astype(str).fillna('Unknown')  # Fill missing text data
    X_test[col] = X_test[col].astype(str).fillna('Unknown')    # Same for test data
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])  # Ensure both sets are encoded the same way
    label_encoders[col] = le  # Save encoders for future reference

In [21]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_columns = ['Age', 'Time Used Per Day (hrs)', 'Speed of Learning (1-10)', 'Historical Assessment Average']

X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])


In [22]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

In [23]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluation metrics
print("Training Data Performance:")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_train, y_train_pred):.2f}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_train, y_train_pred):.2f}")
print(f"R-squared (R²): {r2_score(y_train, y_train_pred):.2f}")

print("\nTest Data Performance:")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_test_pred):.2f}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_test_pred):.2f}")
print(f"R-squared (R²): {r2_score(y_test, y_test_pred):.2f}")


Training Data Performance:
Mean Squared Error (MSE): 0.00
Mean Absolute Error (MAE): 0.00
R-squared (R²): 1.00

Test Data Performance:
Mean Squared Error (MSE): 259.64
Mean Absolute Error (MAE): 12.62
R-squared (R²): -0.20


In [24]:
print("X_train Columns:", X_train.columns)
print("new_data Columns:", new_data.columns)

X_train Columns: Index(['Gender', 'Age', 'Indian City', 'Indian State', 'Track',
       'Time Used Per Day (hrs)', 'Speed of Learning (1-10)',
       'Historical Assessment Average'],
      dtype='object')


NameError: name 'new_data' is not defined

In [38]:
new_data = new_data[X_train.columns]

NameError: name 'new_data' is not defined

In [66]:
# Encode categorical data
for col in label_encoders:
    if col in new_data.columns:
        new_data[col] = new_data[col].map(lambda x: label_encoders[col].classes_.tolist().index(x) 
                                          if x in label_encoders[col].classes_ 
                                          else -1)  # Assign -1 for unseen values

In [74]:
print(label_encoders.keys())


dict_keys(['Indian City', 'Indian State', 'Track'])


In [77]:
print(X_train.columns)

Index(['Gender', 'Age', 'Indian City', 'Indian State', 'Track',
       'Time Used Per Day (hrs)', 'Speed of Learning (1-10)',
       'Historical Assessment Average'],
      dtype='object')


In [79]:
# Ensure new_data has the same columns in the same order as X_train
new_data = new_data[X_train.columns]

In [81]:
print(new_data.dtypes)


Gender                            object
Age                              float64
Indian City                       object
Indian State                      object
Track                             object
Time Used Per Day (hrs)          float64
Speed of Learning (1-10)         float64
Historical Assessment Average    float64
dtype: object


In [83]:
# Encode categorical columns
for col in ['Gender', 'Indian City', 'Indian State', 'Track']:
    if col in new_data.columns:
        new_data[col] = new_data[col].map(lambda x: label_encoders[col].classes_.tolist().index(x)
                                          if x in label_encoders[col].classes_ 
                                          else -1)

In [84]:
print(new_data.dtypes)

Gender                             int64
Age                              float64
Indian City                        int64
Indian State                       int64
Track                              int64
Time Used Per Day (hrs)          float64
Speed of Learning (1-10)         float64
Historical Assessment Average    float64
dtype: object


In [85]:
new_data = new_data[X_train.columns]


In [87]:
new_data[numerical_columns] = scaler.transform(new_data[numerical_columns])


In [88]:
prediction = model.predict(new_data)
print("Predicted Value:", prediction[0])

Predicted Value: 97.0


In [90]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluation metrics
print("Training Data Performance:")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_train, y_train_pred):.2f}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_train, y_train_pred):.2f}")
print(f"R-squared (R²): {r2_score(y_train, y_train_pred):.2f}")

print("\nTest Data Performance:")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_test_pred):.2f}")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_test_pred):.2f}")
print(f"R-squared (R²): {r2_score(y_test, y_test_pred):.2f}")


Training Data Performance:
Mean Squared Error (MSE): 0.00
Mean Absolute Error (MAE): 0.00
R-squared (R²): 1.00

Test Data Performance:
Mean Squared Error (MSE): 280.74
Mean Absolute Error (MAE): 13.10
R-squared (R²): -0.30


In [92]:
print(f"y_train length: {len(y_train)}")
print(f"y_train_pred length: {len(y_train_pred)}")
print(f"y_test length: {len(y_test)}")
print(f"y_test_pred length: {len(y_test_pred)}")

y_train length: 400
y_train_pred length: 400
y_test length: 100
y_test_pred length: 100


In [93]:
# Creating two separate DataFrames
train_results = pd.DataFrame({
    'Actual (Train)': y_train,
    'Predicted (Train)': y_train_pred
})

test_results = pd.DataFrame({
    'Actual (Test)': y_test,
    'Predicted (Test)': y_test_pred
})

# Combining the two with NaN padding where needed
results = pd.concat([train_results, test_results], axis=1)

# Saving to CSV
results.to_csv('model_results.csv', index=False)

print("Results saved successfully!")


Results saved successfully!
