In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch both Kaggle datasets through kagglehub. Each call's return value is
# stored in a module-level variable so other cells can refer to it.
joebeachcapital_gpa_and_iq_path = kagglehub.dataset_download(
    'joebeachcapital/gpa-and-iq'
)
joebeachcapital_gpa_study_hours_path = kagglehub.dataset_download(
    'joebeachcapital/gpa-study-hours'
)

# Confirmation message printed once both downloads have returned.
print('Data source import complete.')


# **WELCOME -- UPVOTE IF YOU ENJOY!**

# **ABSTRACT**

In this study, we performed an extensive Exploratory Data Analysis (EDA) to understand the factors affecting student GPAs, specifically focusing on IQ and study hours. We started by merging two distinct datasets—one containing information about student GPAs and IQ, and the other detailing GPAs and study hours. Our EDA included data cleaning to handle missing values, duplicate rows, and constant features. We also explored the distributions of the variables and their inter-correlations through visualizations. A linear regression model was then trained to predict GPA based on IQ and study hours, which was evaluated using Root Mean Square Error (RMSE) and cross-validation. However, we encountered an issue with an unusually low RMSE, indicating a potential problem that warrants further investigation. Despite this, the model was implemented in a user-friendly application that allows individuals to input their IQ and study hours to predict their GPA. This serves as a foundation for future work to improve the predictive model and understand student performance better.

# **WHY WAS THIS MADE?**

This study was developed in light of rising academic competition and its impact on mental well-being. As GPA becomes a focal point for future opportunities, understanding its predictors like IQ and study hours is crucial. This predictive model aims to offer insights that can guide students and educators, thereby alleviating academic stress and fostering a balanced approach to success.

# **IMPORTING LIBRARIES**

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


# **LOAD DATA**

In [None]:
# Prefer the Kaggle-mounted copy of each CSV. When that mount does not exist
# (e.g. the notebook runs outside Kaggle), fall back to the directory that
# kagglehub.dataset_download returned in the first cell, so the download made
# there is actually used instead of being ignored.
# NOTE(review): assumes the kagglehub download directory holds the CSV under
# the same file name as the Kaggle mount -- confirm against the dataset.
gpa_iq_csv = Path('/kaggle/input/gpa-and-iq/gpa_iq.csv')
if not gpa_iq_csv.exists():
    gpa_iq_csv = Path(joebeachcapital_gpa_and_iq_path) / gpa_iq_csv.name

gpa_study_hours_csv = Path('/kaggle/input/gpa-study-hours/gpa_study_hours.csv')
if not gpa_study_hours_csv.exists():
    gpa_study_hours_csv = Path(joebeachcapital_gpa_study_hours_path) / gpa_study_hours_csv.name

df1 = pd.read_csv(gpa_iq_csv)  # dataset containing GPA and IQ
df2 = pd.read_csv(gpa_study_hours_csv)  # dataset containing GPA and study hours

# **DATA PREPROCESSING**

# MERGING THE DATA

# Here we need to identify a variable on which to merge the two datasets; in this case, GPA.

In [None]:
# Show the column index of each raw frame so the shared merge key can be
# identified by eye.
for heading, frame in (("Columns in df1: ", df1), ("Columns in df2: ", df2)):
    print(heading, frame.columns)

In [None]:
# Inner-join the two frames on their shared 'gpa' column (the default join
# type), keeping only GPA values that appear in both datasets.
# NOTE(review): 'gpa' is also the prediction target later on, and the join
# pairs every df1 row with every df2 row that has the same GPA; whether those
# pairings describe the same students cannot be told from this notebook --
# confirm before trusting downstream metrics.
merged_df = df1.merge(df2, on='gpa')

# **Preprocess Merged Data**

# This code checks the merged DataFrame for null values, duplicate rows, and constant features. Null values are only reported (with a warning and per-column counts); duplicate rows and constant columns are reported and then dropped.

In [None]:
# --- Missing values: report only; nothing is imputed or dropped here ---
null_counts = merged_df.isnull().sum()
if null_counts.any():
    print("Warning: Null values found")
    print(null_counts)

# --- Exact duplicate rows: report the count, then drop them in place ---
duplicate_count = merged_df.duplicated().sum()
if duplicate_count > 0:
    print(f"Warning: {duplicate_count} duplicate rows found")
    merged_df.drop_duplicates(inplace=True)

# --- Constant features: a column with a single distinct value carries no
# information for the model, so report such columns and drop them in place ---
distinct_per_column = merged_df.nunique()
constant_columns = distinct_per_column[distinct_per_column == 1].index.tolist()
if constant_columns:
    print(f"Warning: Constant columns found: {constant_columns}")
    merged_df.drop(columns=constant_columns, inplace=True)


# **📈EDA OF THE DATA📈**

# This code generates and displays a correlation matrix and heatmap for GPA, IQ, and study hours. It also creates scatter plots to visualize individual correlations between these variables.

In [None]:
# Columns examined throughout the EDA section.
eda_columns = ['gpa', 'iq', 'study_hours']

# Pairwise correlations between GPA, IQ and study hours.
correlation_matrix = merged_df[eda_columns].corr()

print("Correlation Matrix:")
print(correlation_matrix)

# Heatmap of the same matrix, with the coefficients annotated in each cell.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


def _show_scatter(x_col, y_col, x_label, y_label):
    """Draw and show one scatter plot of merged_df[y_col] against merged_df[x_col].

    Args:
        x_col: column of merged_df plotted on the x axis.
        y_col: column of merged_df plotted on the y axis.
        x_label: axis label for x, also used in the title.
        y_label: axis label for y, also used in the title.
    """
    sns.scatterplot(data=merged_df, x=x_col, y=y_col)
    plt.title(f'Scatter plot of {y_label} and {x_label}')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()


# One scatter plot per variable pair: (x column, y column, x label, y label).
for x_col, y_col, x_label, y_label in [
    ('iq', 'gpa', 'IQ', 'GPA'),
    ('study_hours', 'gpa', 'Study Hours', 'GPA'),
    ('study_hours', 'iq', 'Study Hours', 'IQ'),
]:
    _show_scatter(x_col, y_col, x_label, y_label)

# Pairplot of all three variables. pairplot builds a grid of subplots, so
# plt.title would only label the last subplot; set a figure-level title
# instead. y > 1 lifts the title just above the top row of panels.
pair_grid = sns.pairplot(merged_df[eda_columns])
pair_grid.figure.suptitle('Pairplot of GPA, IQ, and Study Hours', y=1.02)
plt.show()


# **DATA SPLITTING + MODEL TRAINING**

# This code splits the data into training and test sets, trains a linear regression model, and evaluates it using cross-validation and Root Mean Square Error (RMSE) for both training and test data.

In [None]:
# cross_val_score is imported in the notebook's import cell alongside
# train_test_split, so this cell no longer carries its own import.

# Fixed seed so the train/test split is reproducible from run to run.
split_random_state = 100

# Features (IQ, study hours) and target (GPA).
X = merged_df[['iq', 'study_hours']]
y = merged_df['gpa']

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=split_random_state
)

# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# 5-fold cross-validation over the full dataset. Scores come back as negative
# MSE, so negate before taking the square root to obtain per-fold RMSE.
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print("Cross-validation RMSE scores:", cv_rmse)
print("Mean CV RMSE:", np.mean(cv_rmse))

# RMSE of the fitted model on the training split and on the held-out split.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("Train RMSE:", np.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))


# **PREDICT YOUR OWN GPA BASED ON IQ AND STUDY HOURS**

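# This code defines a function that predicts GPA from an IQ and a weekly study-hours value using the trained linear regression model. The example values are set directly in the cell, and a `ValueError` raised for invalid input is caught and reported instead of stopping the notebook.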

In [None]:
# The LinearRegression `model` fitted on (X_train, y_train) in the training
# cell is reused here rather than being re-created and re-fitted.

# Column names, in order, of the feature frame the model was fitted on.
feature_columns = ['iq', 'study_hours']


def predict_gpa(iq, study_hours):
    """Predict a GPA from an IQ and a study-hours value with the fitted model.

    Args:
        iq: IQ value; anything float() can convert.
        study_hours: study-hours value; anything float() can convert.

    Returns:
        The single predicted GPA (first element of model.predict's output).

    Raises:
        ValueError: if either input cannot be converted to a number.
    """
    try:
        iq_value = float(iq)
        study_hours_value = float(study_hours)
    except (TypeError, ValueError) as exc:
        # Normalise every bad-input failure to ValueError so the caller's
        # `except ValueError` below handles None and non-numeric text alike.
        raise ValueError("IQ and study hours must be numerical values.") from exc

    # Build the row as a DataFrame with the training column names: the model
    # was fitted on a DataFrame, and predicting from a bare array makes
    # scikit-learn warn that X has no valid feature names.
    features = pd.DataFrame([[iq_value, study_hours_value]], columns=feature_columns)

    predicted_gpa = model.predict(features)

    return predicted_gpa[0]


# Define user IQ and study hours directly in the code
user_iq = 110  # Replace with the desired IQ
user_study_hours = 20  # Replace with the desired number of study hours

# Make the prediction, reporting invalid input instead of raising.
try:
    predicted_gpa = predict_gpa(user_iq, user_study_hours)

    print(f"Based on the provided IQ of {user_iq} and weekly study hours of {user_study_hours}, the predicted GPA is {predicted_gpa:.2f}.")

except ValueError:
    print("Invalid input. Please enter numerical values for IQ and study hours.")
