# **with Mutaul information**

In [9]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
import math
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler  # Z-Score Normalization

# Load dataset
df = pd.read_csv('students_responses_main.csv')

# Data cleaning and preprocessing
df.drop(columns=df.columns[88], inplace=True)
df.drop(columns=df.columns[74], inplace=True)
df.drop(columns=df.columns[58], inplace=True)
df.drop(columns=df.columns[12:50], inplace=True)
df.drop(columns=df.columns[10], inplace=True)
df.drop(columns=df.columns[8], inplace=True)
df.drop(columns=df.columns[7], inplace=True)
df.drop(columns=df.columns[0], inplace=True)

# Modify the semester column
df['Kindly choose your current semester.'] = (
    df['Kindly choose your current semester.']
    .str.replace('2 Semester', '1 Semester', regex=True)
    .str.replace('3 Semester', '2 Semester', regex=True)
    .str.replace('4 Semester', '3 Semester', regex=True)
    .str.replace('5 Semester', '4 Semester', regex=True)
    .str.replace('6 Semester', '5 Semester', regex=True)
    .str.replace('7 Semester', '6 Semester', regex=True)
    .str.replace('8 Semester', '7 Semester', regex=True)
)

# Fix the matric and Fsc/Ics marks
df['Your matric marks percentage?'] = np.where(
    df['Your matric marks percentage?'] > 100,
    df['Your matric marks percentage?'] * 100 / 1100,
    df['Your matric marks percentage?'],
)

df["Your Fsc/Ics marks percentage?"] = np.where(
    df["Your Fsc/Ics marks percentage?"] > 100,
    df["Your Fsc/Ics marks percentage?"] * 100 / 1100,
    df["Your Fsc/Ics marks percentage?"],
)

df.round(decimals=2)

# Handle outliers using IQR method
df = df.select_dtypes(include=['number'])
Q1 = df.quantile(0.15)
Q3 = df.quantile(0.85)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Factorize categorical columns
def factorize_fun(data):
    obj_cols = data.loc[:, data.dtypes == object].columns
    for col in obj_cols:
        data[col] = pd.factorize(data[col])[0] + 1
    return data

df = factorize_fun(df)

# Impute missing values using KNN
from sklearn.impute import KNNImputer

def knn_null(df):
    imputer = KNNImputer(n_neighbors=2)
    df1 = imputer.fit_transform(df)
    df2 = pd.DataFrame(df1, columns=df.columns)
    return df2

df = knn_null(df)

# Split data into features (X) and target (y)
X = df.drop(columns=['Please mention your Previous Semester GPA?'])
y = df['Please mention your Previous Semester GPA?']

def mean_absolute_percentage_error(y_true, y_pred, clip_threshold=1.0):
    # Clip the values in y_true to avoid division by very small numbers (close to 0)
    y_true_clipped = np.clip(y_true, clip_threshold, np.inf)  # Clip values below `clip_threshold`
    return np.mean(np.abs((y_true_clipped - y_pred) / y_true_clipped)) * 100

# Z-Score Normalization
def zscore_normalize(df):
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(df)
    return pd.DataFrame(data_scaled, columns=df.columns)

X = zscore_normalize(X)

# Mutual Information feature selection
def mutual_info(X, y, top_k=10):
    mi = mutual_info_regression(X, y)
    mi_series = pd.Series(mi, index=X.columns)
    top_features = mi_series.nlargest(top_k).index
    print(f"Top {top_k} features based on mutual information: {list(top_features)}")
    return X[top_features]

X_new = mutual_info(X, y, top_k=10)

# Train-test split with selected features
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.1, random_state=4)

# Remaining regression and evaluation steps remain unchanged


# Evaluation Metrics
def evaluate_model(y_test, y_pred):
    print("MAE Score: ", mean_absolute_error(y_test, y_pred))
    print("MSE Score: ", mean_squared_error(y_test, y_pred))
    print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
    print("R2 Score: %.2f" % r2_score(y_test, y_pred))
    print("MAPE Score: %.2f" % (np.mean(np.abs((y_test - y_pred) / np.clip(y_test, 1e-8, np.inf))) * 100))

# Linear Regression
print("\nOLS Linear Regression")
model = sm.OLS(endog=y_train, exog=X_train).fit()
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)

print("\nMultivariable Linear Regression")
regression = LinearRegression()
model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)

# Decision Tree Regression
print("\nDecision Tree Regression")
regressor = DecisionTreeRegressor(random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)

# Polynomial Regression
print("\nPolynomial Regression")
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
model = regressor.fit(X_poly, y_train)
y_pred = model.predict(poly_reg.transform(X_test))
evaluate_model(y_test, y_pred)

# Random Forest Regression
print("\nRandom Forest Regression")
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)


Top 10 features based on mutual information: ['Please provide your current CGPA?', 'Kindly mention your age?', 'Your Fsc/Ics marks percentage?', 'Your matric marks percentage?', 'How many members are in your family?', 'Please mention your NTS score?', 'Do you love your subjects?', 'Do you find your CS/SE subjects difficult?', 'How is your health', 'How much you have interest in this domain']

OLS Linear Regression
MAE Score:  2.9291999210718305
MSE Score:  8.66821960098358
RMSE Score:  2.944184029741276
R2 Score: -21.98
MAPE Score: 110.83

Multivariable Linear Regression
MAE Score:  0.1327793213332057
MSE Score:  0.033546699638289386
RMSE Score:  0.18315758143819597
R2 Score: 0.91
MAPE Score: 5.28

Decision Tree Regression
MAE Score:  0.1983103448275862
MSE Score:  0.10061810344827585
RMSE Score:  0.3172035678366116
R2 Score: 0.73
MAPE Score: 8.42

Polynomial Regression
MAE Score:  1.7031923190542837
MSE Score:  11.729977348085061
RMSE Score:  3.424905450970152
R2 Score: -30.10
MAPE Sc

# **with CHI-Square**

In [10]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
import math
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler  # Z-Score Normalization
from sklearn.preprocessing import KBinsDiscretizer


# Load dataset
df = pd.read_csv('students_responses_main.csv')
df_org = df.copy()

# Data cleaning steps
df.drop(columns=df.columns[88], inplace=True)
df.drop(columns=df.columns[74], inplace=True)
df.drop(columns=df.columns[58], inplace=True)
df.drop(columns=df.columns[12:50], inplace=True)
df.drop(columns=df.columns[10], inplace=True)
df.drop(columns=df.columns[8], inplace=True)
df.drop(columns=df.columns[7], inplace=True)
df.drop(columns=df.columns[0], inplace=True)

# Modify the semester column
df['Kindly choose your current semester.'] = (
    df['Kindly choose your current semester.']
    .str.replace('2 Semester', '1 Semester', regex=True)
    .str.replace('3 Semester', '2 Semester', regex=True)
    .str.replace('4 Semester', '3 Semester', regex=True)
    .str.replace('5 Semester', '4 Semester', regex=True)
    .str.replace('6 Semester', '5 Semester', regex=True)
    .str.replace('7 Semester', '6 Semester', regex=True)
    .str.replace('8 Semester', '7 Semester', regex=True)
)

# Fix the matric and Fsc/Ics marks
df['Your matric marks percentage?'] = np.where(
    df['Your matric marks percentage?'] > 100,
    df['Your matric marks percentage?'] * 100 / 1100,
    df['Your matric marks percentage?'],
)

df["Your Fsc/Ics marks percentage?"] = np.where(
    df["Your Fsc/Ics marks percentage?"] > 100,
    df["Your Fsc/Ics marks percentage?"] * 100 / 1100,
    df["Your Fsc/Ics marks percentage?"],
)

df.round(decimals=2)

# Handle outliers using IQR method
df = df.select_dtypes(include=['number'])
Q1 = df.quantile(0.15)
Q3 = df.quantile(0.85)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Factorize categorical columns
def factorize_fun(data):
    obj_cols = data.loc[:, data.dtypes == object].columns
    for col in obj_cols:
        data[col] = pd.factorize(data[col])[0] + 1
    return data

df = factorize_fun(df)

# Impute missing values using KNN
def knn_null(df):
    imputer = KNNImputer(n_neighbors=2)
    df1 = imputer.fit_transform(df)
    df2 = pd.DataFrame(df1, columns=df.columns)
    return df2

df = knn_null(df)

# Split data into features (X) and target (y)
X = df.drop(columns=['Please mention your Previous Semester GPA?'])
y = df['Please mention your Previous Semester GPA?']


def mean_absolute_percentage_error(y_true, y_pred, clip_threshold=1.0):
    # Clip the values in y_true to avoid division by very small numbers (close to 0)
    y_true_clipped = np.clip(y_true, clip_threshold, np.inf)  # Clip values below `clip_threshold`
    return np.mean(np.abs((y_true_clipped - y_pred) / y_true_clipped)) * 100
# Z-Score Normalization
def zscore_normalize(df):
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(df)
    return pd.DataFrame(data_scaled, columns=df.columns)

X = zscore_normalize(X)

# Ensure non-negative values after Z-score normalization
def ensure_non_negative(df):
    min_val = df.min().min()
    if min_val < 0:
        df = df - min_val  # Shift all values to make them non-negative
    return df

# Apply Z-Score normalization and then ensure non-negative values
X = ensure_non_negative(X)

# Discretize the target variable to make it categorical
def discretize_target(y, n_bins=5):
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    y_discretized = discretizer.fit_transform(y.values.reshape(-1, 1))
    return y_discretized.flatten()  # Flatten the array to avoid shape issues

y_discretized = discretize_target(y)

# Chi-Square feature selection
def chi_square_selection(X, y, k=10):
    chi2_selector = SelectKBest(chi2, k=k)
    X_new = chi2_selector.fit_transform(X, y)
    selected_columns = X.columns[chi2_selector.get_support()]
    print(f"Selected features after Chi-Square test: {selected_columns}")
    return X_new, selected_columns

X_new, selected_columns = chi_square_selection(X, y_discretized, k=10)

# Train-test split with selected features
X_train, X_test, y_train, y_test = train_test_split(X_new, y_discretized, test_size=0.1, random_state=4)

def evaluate_model(y_test, y_pred):
    print("MAE Score: ", mean_absolute_error(y_test, y_pred))
    print("MSE Score: ", mean_squared_error(y_test, y_pred))
    print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
    print("R2 Score: %.2f" % r2_score(y_test, y_pred))
    print("MAPE Score: %.2f" % (np.mean(np.abs((y_test - y_pred) / np.clip(y_test, 1e-8, np.inf))) * 100))

# Linear Regression
print("\nOLS Linear Regression")
model = sm.OLS(endog=y_train, exog=X_train).fit()
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)

print("\nMultivariable Linear Regression")
regression = LinearRegression()
model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)

# Decision Tree Regression
print("\nDecision Tree Regression")
regressor = DecisionTreeRegressor(random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)

# Polynomial Regression
print("\nPolynomial Regression")
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
model = regressor.fit(X_poly, y_train)
y_pred = model.predict(poly_reg.transform(X_test))
evaluate_model(y_test, y_pred)

# Random Forest Regression
print("\nRandom Forest Regression")
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)


Selected features after Chi-Square test: Index(['Kindly mention your age?', 'Your matric marks percentage?',
       'Your Fsc/Ics marks percentage?', 'Please mention your NTS score?',
       'How much you have interest in this domain',
       'Are you satisfied with your program selection?',
       'Please provide your current CGPA?',
       'Do you find your CS/SE subjects difficult?',
       'Do you love your subjects?', 'Kindly specify do you love travelling?'],
      dtype='object')

OLS Linear Regression
MAE Score:  0.30755634101911494
MSE Score:  0.14801551206184052
RMSE Score:  0.3847278415475549
R2 Score: 0.80
MAPE Score: 47984512.98

Multivariable Linear Regression
MAE Score:  0.3181687270241447
MSE Score:  0.15821095748419747
RMSE Score:  0.3977574103447948
R2 Score: 0.79
MAPE Score: 36753764.21

Decision Tree Regression
MAE Score:  0.27586206896551724
MSE Score:  0.3103448275862069
RMSE Score:  0.5570860145311556
R2 Score: 0.58
MAPE Score: 172413801.72

Polynomial Regression

# **With VIF**

In [11]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import math
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler  # Z-score Normalization
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor

# Load dataset
df = pd.read_csv('students_responses_main.csv')
df_org = df.copy()
df.head()

# Data cleaning steps
df.drop(columns=df.columns[88], inplace=True)
df.drop(columns=df.columns[74], inplace=True)
df.drop(columns=df.columns[58], inplace=True)
df.drop(columns=df.columns[12:50], inplace=True)
df.drop(columns=df.columns[10], inplace=True)
df.drop(columns=df.columns[8], inplace=True)
df.drop(columns=df.columns[7], inplace=True)
df.drop(columns=df.columns[0], inplace=True)

# Modify the semester column
df['Kindly choose your current semester.'] = (
    df['Kindly choose your current semester.']
    .str.replace('2 Semester', '1 Semester', regex=True)
    .str.replace('3 Semester', '2 Semester', regex=True)
    .str.replace('4 Semester', '3 Semester', regex=True)
    .str.replace('5 Semester', '4 Semester', regex=True)
    .str.replace('6 Semester', '5 Semester', regex=True)
    .str.replace('7 Semester', '6 Semester', regex=True)
    .str.replace('8 Semester', '7 Semester', regex=True)
)

# Fix the matric and Fsc/Ics marks
df['Your matric marks percentage?'] = np.where(
    df['Your matric marks percentage?'] > 100,
    df['Your matric marks percentage?'] * 100 / 1100,
    df['Your matric marks percentage?']
)

df["Your Fsc/Ics marks percentage?"] = np.where(
    df["Your Fsc/Ics marks percentage?"] > 100,
    df["Your Fsc/Ics marks percentage?"] * 100 / 1100,
    df["Your Fsc/Ics marks percentage?"]
)

df.round(decimals=2)

# Handle outliers using IQR method
df = df.select_dtypes(include=['number'])
Q1 = df.quantile(0.15)
Q3 = df.quantile(0.85)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Factorize categorical columns
def factorize_fun(data):
    obj_cols = data.loc[:, data.dtypes == object].columns
    for col in obj_cols:
        data[col] = pd.factorize(data[col])[0] + 1
    return data

df = factorize_fun(df)

# Impute missing values using KNN
def knn_null(df):
    imputer = KNNImputer(n_neighbors=2)
    df1 = imputer.fit_transform(df)
    df2 = pd.DataFrame(df1, columns=df.columns)
    return df2

df = knn_null(df)

# Split data into features (X) and target (y)
X = df.drop(columns=['Please mention your Previous Semester GPA?'])
y = df['Please mention your Previous Semester GPA?']

def mean_absolute_percentage_error(y_true, y_pred, clip_threshold=1.0):
    # Clip the values in y_true to avoid division by very small numbers (close to 0)
    y_true_clipped = np.clip(y_true, clip_threshold, np.inf)  # Clip values below `clip_threshold`
    return np.mean(np.abs((y_true_clipped - y_pred) / y_true_clipped)) * 100

# Z-Score Normalization
def zscore_normalize(df):
    scaler = StandardScaler()  # Using StandardScaler for Z-score
    data_scaled = scaler.fit_transform(df)
    return pd.DataFrame(data_scaled, columns=df.columns)  # Return DataFrame to retain column names

X = zscore_normalize(X)

# Calculate VIF function
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculate VIF before training
vif_data = calculate_vif(X)
print("VIF values:\n", vif_data)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)

print("OLS Linear Regression")
model = sm.OLS(endog=y_train, exog=X_train).fit()
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test, y_pred))

print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

print("Multivariable Linear Regression")
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

regression = LinearRegression()
model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

print("Decision Tree")
regressor = DecisionTreeRegressor(random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

print("Polynomial Regression")
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
model = regressor.fit(X_poly, y_train)
y_pred = model.predict(poly_reg.transform(X_test))

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

print("RandomForestRegressor")
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))


VIF values:
                                            Feature       VIF
0                         Kindly mention your age?  1.226576
1             How many members are in your family?  1.048057
2                    Your matric marks percentage?  1.319144
3                   Your Fsc/Ics marks percentage?  1.444562
4                   Please mention your NTS score?  1.054354
5        How much you have interest in this domain  2.220499
6   Are you satisfied with your program selection?  2.395394
7                Please provide your current CGPA?  1.294690
8       Do you find your CS/SE subjects difficult?  1.277991
9                       Do you love your subjects?  1.764862
10                              How is your health  1.140766
11            Do you like your teacher methodology  1.123784
12          Kindly specify do you love travelling?  1.039357
13                      Do you love reading books?  1.061514
OLS Linear Regression
MAE Score:  2.938693946115207
MSE Score:  8.727283

# **With PCC**

In [13]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import math
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

# Load dataset
df = pd.read_csv('/content/students_responses_main.csv')
df_org = df.copy()

# Drop unnecessary columns
df.drop(columns= df.columns[88], inplace=True)
df.drop(columns= df.columns[74], inplace=True)
df.drop(columns= df.columns[58], inplace=True)
df.drop(columns = df.columns[12:50], inplace = True)
df.drop(columns= df.columns[10], inplace=True)
df.drop(columns= df.columns[8], inplace=True)
df.drop(columns= df.columns[7], inplace=True)
df.drop(columns= df.columns[0], inplace=True)

# Data cleaning
df['Kindly choose your current semester.'] = df['Kindly choose your current semester.'].str.replace('2 Semester','1 Semester',regex=True).str.replace('3 Semester','2 Semester',regex=True).str.replace('4 Semester','3 Semester',regex=True).str.replace('5 Semester','4 Semester',regex=True).str.replace('6 Semester','5 Semester',regex=True).str.replace('7 Semester','6 Semester',regex=True).str.replace('8 Semester','7 Semester',regex=True)

df['Your matric marks percentage?'] = np.where(df['Your matric marks percentage?'] > 100,
                                               df['Your matric marks percentage?'] * 100 / 1100,
                                               df['Your matric marks percentage?'])

df["Your Fsc/Ics marks percentage?"] = np.where(df["Your Fsc/Ics marks percentage?"] > 100,
                                                df["Your Fsc/Ics marks percentage?"]* 100/1100,
                                                df["Your Fsc/Ics marks percentage?"])

df.round(decimals=2)

# Remove outliers based on IQR
df = df.select_dtypes(include=['number'])
Q1 = df.quantile(0.15)
Q3 = df.quantile(0.85)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.describe()

# Function to factorize object columns
def factorize_fun(data):
    obj_cols = data.loc[:, data.dtypes == object].columns
    for col in obj_cols:
        data[col] = pd.factorize(data[col])[0] + 1
    return data

df = factorize_fun(df)

# Handle missing data using KNN Imputation
def knn_null(df):
    imputer = KNNImputer(n_neighbors=2)
    df1 = imputer.fit_transform(df)
    df2 = pd.DataFrame(df1, columns = df.columns)
    return df2

df = knn_null(df)

# Normalize the data
def normalize(df):
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(df)
    return data_scaled

# Apply Z-score Transformation to the features
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[df_scaled.select_dtypes(include=['number']).columns] = scaler.fit_transform(df_scaled.select_dtypes(include=['number']))

# Correlation matrix
correlation_matrix = df_scaled.corr()

# Split the data
X = df_scaled.drop(columns=['Please mention your Previous Semester GPA?'])
y = df_scaled['Please mention your Previous Semester GPA?']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)

# Define function to calculate PCC
def pearson_correlation_coefficient(y_true, y_pred):
    return np.corrcoef(y_true, y_pred)[0, 1]

# Define function to calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred, clip_threshold=1.0):
    y_true_clipped = np.clip(y_true, clip_threshold, np.inf)
    return np.mean(np.abs((y_true_clipped - y_pred) / y_true_clipped)) * 100

# Linear Regression using statsmodels (OLS)
print("OLS Linear Regression")
model = sm.OLS(endog=y_train, exog=X_train).fit()
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Linear Regression using sklearn
print("Multivariable Linear Regression")
regression = LinearRegression()
model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Decision Tree Regression
print("Decision Tree")
regressor = DecisionTreeRegressor(random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Polynomial Regression
print("Polynomial Regression")
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
model = regressor.fit(X_poly, y_train)
y_pred = model.predict(poly_reg.transform(X_test))

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Random Forest Regressor
print("Random Forest Regressor")
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))


OLS Linear Regression
MAE Score:  0.23937717188296959
MSE Score:  0.11545728939159992
RMSE Score:  0.33979006664645145
R2 score : 0.91
MAPE Score: 115.36
Multivariable Linear Regression
MAE Score:  0.23945235288153546
MSE Score:  0.11548212826645306
RMSE Score:  0.3398266150060249
R2 score : 0.91
MAPE Score: 115.44
Decision Tree
MAE Score:  0.4348854154605512
MSE Score:  0.4620155237937227
RMSE Score:  0.6797172381172355
R2 score : 0.64
MAPE Score: 122.92
Polynomial Regression
MAE Score:  1.2141004074256523
MSE Score:  7.095298276031791
RMSE Score:  2.663700110003337
R2 score : -4.52
MAPE Score: 200.97
Random Forest Regressor
MAE Score:  0.24868198735455052
MSE Score:  0.1113195769524924
RMSE Score:  0.3336458855620617
R2 score : 0.91
MAPE Score: 115.72
