In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Set seaborn's global plotting style to "whitegrid" for the figures drawn in this notebook
sns.set(style="whitegrid")


In [None]:
# Read the student marks table from the CSV file
data = pd.read_csv(filepath_or_buffer='r20.csv')

In [None]:
# Keep only the identifying columns, CGPA/RANK and the four semester columns
columns_to_keep = [
    'ID', 'NAME', 'BRANCH', 'CGPA', 'RANK',
    'E1SEM1', 'E1SEM2', 'E2SEM1', 'E2SEM2',
]
data = data[columns_to_keep]

In [None]:
# Restrict the table to rows whose BRANCH value is "CSE"
is_cse = data['BRANCH'] == "CSE"
data = data[is_cse]

In [None]:
# Discard every row that has at least one missing value
data = data.dropna(axis=0, how='any')

In [None]:
# Replace the BRANCH labels with the integer codes produced by pd.factorize
branch_codes = pd.factorize(data['BRANCH'])[0]
data = data.assign(BRANCH=branch_codes)

In [None]:
# Preview the first five rows of the prepared table
data.head(n=5)

In [None]:
# Derived columns:
#   Average_Past_Marks    - row-wise mean of E1SEM1, E1SEM2 and E2SEM1
#   Change_E1SEM1_E1SEM2  - E1SEM2 minus E1SEM1
#   Change_E2SEM1_E2SEM2  - E2SEM2 minus E2SEM1
# NOTE(review): Change_E2SEM1_E2SEM2 is computed from E2SEM2, which a later cell
# uses as the prediction target, so it must not be added to the model inputs.
past_semesters = ['E1SEM1', 'E1SEM2', 'E2SEM1']
data = data.assign(
    Average_Past_Marks=data[past_semesters].mean(axis=1),
    Change_E1SEM1_E1SEM2=data['E1SEM2'] - data['E1SEM1'],
    Change_E2SEM1_E2SEM2=data['E2SEM2'] - data['E2SEM1'],
)

data.head()


In [None]:
# Model inputs (features) and the column to predict (target)
feature_columns = [
    'E1SEM1',
    'E1SEM2',
    'E2SEM1',
    'Average_Past_Marks',
    'Change_E1SEM1_E1SEM2',
]
features = data[feature_columns]
target = data['E2SEM2']


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# GradientBoostingRegressor was not imported anywhere before this cell, so it is
# imported here to keep the cell runnable on a fresh kernel.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try for the gradient boosting model
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# 5-fold grid search over the grid above, scored by negated mean absolute error
grid_search = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)
grid_search.fit(X_train, y_train)

# best_score_ holds the negated MAE, so the sign is flipped for reporting
print("Best Parameters:", grid_search.best_params_)
print("Best MAE:", -grid_search.best_score_)


In [None]:
# Make predictions on the test data with the estimator selected by the grid search.
# (No variable named `model` exists at this point in the notebook; the tuned
# estimator lives in grid_search.best_estimator_.)
predictions = grid_search.best_estimator_.predict(X_test)

# Evaluate the model with Mean Absolute Error
mae = mean_absolute_error(y_test, predictions)
print(f"Mean Absolute Error: {mae}")


In [None]:
# Pair each held-out actual mark with the corresponding prediction
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

# Scatter of predicted against actual marks on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=comparison, x='Actual', y='Predicted', ax=ax)

# Red diagonal spanning the range of actual marks: points on it are perfect predictions
actual_low = min(comparison['Actual'])
actual_high = max(comparison['Actual'])
ax.plot([actual_low, actual_high], [actual_low, actual_high], color='red', lw=2)

ax.set_title('Actual vs. Predicted Marks')
ax.set_xlabel('Actual Marks')
ax.set_ylabel('Predicted Marks')
plt.show()


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np

# Step 1: Load the Data
data = pd.read_csv('r20.csv')

# Step 2: Data Selection
target_column = 'E2SEM2'  # Column the model is trained to predict
# The inputs must not contain the target column: with E2SEM2 among the features
# the model can read the answer straight from its input and every reported
# error becomes meaningless.
# NOTE(review): CGPA may itself be computed from E2SEM2 results — confirm it is
# safe to keep as an input.
selected_features = ['E1SEM1', 'E1SEM2', 'E2SEM1', 'CGPA']
id_column = 'ID'  # Column used to look a student up by ID
name_column = 'NAME'  # Column used to look a student up by name

# Step 3: Handle Missing Data
# Drop incomplete rows from the features and the target in one pass so both
# stay index-aligned. Building a new frame also avoids calling
# dropna(inplace=True) on a slice of `data`.
model_data = data[selected_features + [target_column]].dropna()
features = model_data[selected_features]
target = model_data[target_column]

# Step 4: Polynomial Features (degree-2 terms, no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(features)

# Step 5: Split the Data (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_poly, target, test_size=0.2, random_state=42)

# Step 6: Feature Scaling (scaler is fitted on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 7: Model Selection - Gradient Boosting Regressor with Grid Search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train_scaled, y_train)

# Print best parameters and score (best_score_ is the negated MAE)
print("Best Parameters:", grid_search.best_params_)
print("Best MAE from Grid Search:", -grid_search.best_score_)

# Best model after tuning
best_model = grid_search.best_estimator_

# Step 8: Define a Prediction Function
def predict_next_semester_marks(student_id=None, student_name=None):
    """Predict the target marks for one student found in the global ``data`` frame.

    The student is looked up by ``student_id`` (column ``id_column``) when it is
    given, otherwise by ``student_name`` (column ``name_column``). The student's
    ``selected_features`` values are passed through the fitted ``poly`` and
    ``scaler`` transformers and then to ``best_model``.

    Args:
        student_id: Value matched against the ID column. Takes precedence over
            ``student_name`` when both are given.
        student_name: Value matched against the NAME column.

    Returns:
        The model's prediction for the first matching row, or one of the
        message strings "Student not found." / "Student data contains missing
        values." when no prediction can be made.

    Raises:
        ValueError: If neither ``student_id`` nor ``student_name`` is provided.
    """
    # Compare against None rather than relying on truthiness, so a provided but
    # falsy key (for example the integer 0) is still treated as provided.
    if student_id is not None:
        student_row = data[data[id_column] == student_id]
    elif student_name is not None:
        student_row = data[data[name_column] == student_name]
    else:
        raise ValueError("Please provide either a student_id or student_name.")

    if student_row.empty:
        return "Student not found."

    # Extract the model inputs for the matched row(s)
    student_features = student_row[selected_features]

    # The fitted transformers and model cannot handle missing inputs
    if student_features.isnull().values.any():
        return "Student data contains missing values."

    # Apply the same polynomial expansion and scaling used during training
    student_poly = poly.transform(student_features)
    student_scaled = scaler.transform(student_poly)

    # Several rows may match (e.g. a shared name); only the first prediction is returned
    prediction = best_model.predict(student_scaled)
    return prediction[0]

# Example Usage
student_id = 'R200589'  # Replace with an actual ID
predicted_marks = predict_next_semester_marks(student_id=student_id)
print(f"Predicted Next Semester Marks for Student ID {student_id}: {predicted_marks}")

# Step 9: Evaluate the Best Model on Test Set
y_pred = best_model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on Test Data: {mae}")

# Step 10: Cross-Validation for Robust Evaluation
# 5-fold scores are negated MAE values, so the mean is sign-flipped before printing
cv_scores = cross_val_score(
    best_model,
    X_poly,
    target,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print("Cross-validated MAE:", -cv_scores.mean())

# Step 11: Visualize Results
# Predicted vs. actual on the held-out rows; the red diagonal marks perfect predictions
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, color='blue', alpha=0.5)
diagonal = [min(y_test), max(y_test)]
ax.plot(diagonal, diagonal, color='red', linewidth=2)
ax.set_xlabel('Actual Next Semester Marks')
ax.set_ylabel('Predicted Next Semester Marks')
ax.set_title('Actual vs Predicted Next Semester Marks')
plt.show()


In [None]:
student_id = 'R200256'  # Replace with an actual ID

# Look up the recorded E2SEM2 marks first; indexing [0] on an empty selection
# would raise IndexError when the ID is not present in `data`.
actual_marks = data.loc[data["ID"] == student_id, "E2SEM2"]
if actual_marks.empty:
    print(f"No record found for Student ID {student_id}")
else:
    print("Real results are - >", actual_marks.values[0])
predicted_marks = predict_next_semester_marks(student_id=student_id)
print(f"Predicted Next Semester Marks for Student ID {student_id}: {predicted_marks}")

# Step 9: Evaluate the Best Model on Test Set
y_pred = best_model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on Test Data: {mae}")

# Step 10: Cross-Validation for Robust Evaluation
# cv_scores = cross_val_score(best_model, X_poly, target, cv=5, scoring='neg_mean_absolute_error')
# print("Cross-validated MAE:", -cv_scores.mean())

# # Step 11: Visualize Results
# plt.figure(figsize=(10, 6))
# plt.scatter(y_test, y_pred, color='blue', alpha=0.5)
# plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linewidth=2)
# plt.xlabel('Actual Next Semester Marks')
# plt.ylabel('Predicted Next Semester Marks')
# plt.title('Actual vs Predicted Next Semester Marks')
# plt.show()

In [None]:
# Import required libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [None]:
# Separate read of the same CSV, kept under its own name
temp_data = pd.read_csv(filepath_or_buffer="r20.csv")

In [None]:
# Sample dataset - Replace with your actual data
data = pd.read_csv("r20.csv")
data.dropna(inplace=True)

In [None]:
# Input columns: the three earliest semester marks
features = ['E1SEM1', 'E1SEM2', 'E2SEM1']

# Stand-in target for demonstration: E3SEM1 is synthesised as E2SEM1 + 0.2.
# Replace this with actual E3SEM1 values if available.
# NOTE(review): the target is a fixed offset of one of the input columns, so the
# error measured on it says little about real predictive quality.
data = data.assign(E3SEM1=data['E2SEM1'] + 0.2)

# Feature matrix (X) and target vector (y)
X = data[features]
y = data['E3SEM1']

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest with a fixed seed, then fit it on the training rows
forest_settings = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_settings)
model.fit(X_train, y_train)


In [None]:
# Score the fitted forest on the held-out rows using mean absolute error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Absolute Error: {mae}")

In [None]:
# Predict E3SEM1 for one student and show that student's most recent recorded marks
student_id = 'R200259'
student_data = data.loc[data['ID'] == student_id, features]

if student_data.empty:
    # The ID is not present in `data` (unknown ID, or its row was removed by
    # dropna); calling predict on an empty selection would raise.
    print(f"No usable record found for student ID {student_id}")
else:
    predicted_e3sem1 = model.predict(student_data)[0]
    print(f"Predicted E3SEM1 for student ID {student_id}: {predicted_e3sem1}")
    # E3SEM1 in this notebook is simulated, so the latest real marks to show are
    # the student's own E2SEM2 value. Only that student's value is printed, not
    # the whole column.
    actual_e2sem2 = temp_data.loc[temp_data['ID'] == student_id, 'E2SEM2']
    print("Actual E2SEM2 from data:", actual_e2sem2.iloc[0])