# **With Variance infaltion factor**

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import math
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor

# Load dataset
df  = pd.read_csv('students_responses_main.csv')
df_org = df.copy()
df.head()

# Data cleaning steps (same as original)
df.drop(columns= df.columns[88], inplace=True)
df.drop(columns= df.columns[74], inplace=True)
df.drop(columns= df.columns[58], inplace=True)
df.drop(columns = df.columns[12:50], inplace=True)
df.drop(columns= df.columns[10], inplace=True)
df.drop(columns= df.columns[8], inplace=True)
df.drop(columns= df.columns[7], inplace=True)
df.drop(columns= df.columns[0], inplace=True)

# Modify the semester column
df['Kindly choose your current semester.'] = df['Kindly choose your current semester.'].str.replace('2 Semester','1 Semester',regex=True).str.replace('3 Semester','2 Semester',regex=True).str.replace ('4 Semester','3 Semester',regex=True).str.replace ('5 Semester','4 Semester',regex=True).str.replace ('6 Semester','5 Semester',regex=True).str.replace('7 Semester','6 Semester',regex=True).str.replace('8 Semester','7 Semester',regex=True)

# Fix the matric and Fsc/Ics marks (same as original)
df['Your matric marks percentage?'] = np.where(df['Your matric marks percentage?'] > 100,
                                               df['Your matric marks percentage?'] * 100 / 1100,
                                               df['Your matric marks percentage?'])

df["Your Fsc/Ics marks percentage?"] = np.where(df["Your Fsc/Ics marks percentage?"] > 100,
                                                df["Your Fsc/Ics marks percentage?"]* 100/1100,
                                                df["Your Fsc/Ics marks percentage?"])

df.round(decimals=2)

# Handle outliers using IQR method (same as original)
df = df.select_dtypes(include=['number'])
Q1 = df.quantile(0.15)
Q3 = df.quantile(0.85)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Factorize categorical columns (same as original)
def factorize_fun(data):
    obj_cols = data.loc[:, data.dtypes == object].columns
    for col in obj_cols:
        data[col] = pd.factorize(data[col])[0] +1
    return data

df = factorize_fun(df)

# Impute missing values using KNN (same as original)
def knn_null(df):
    imputer = KNNImputer(n_neighbors=2)
    df1 = imputer.fit_transform(df)
    df2 = pd.DataFrame(df1, columns = df.columns)
    return df2

df = knn_null(df)

# Log-transform the features and the target
df_log = df.copy()
df_log = df_log.applymap(lambda x: np.log(x) if x > 0 else 0)  # Apply log transform only to positive values

# Split data into features (X) and target (y)
X = df_log.drop(columns=['Please mention your Previous Semester GPA?'])
y = df_log['Please mention your Previous Semester GPA?']

# Normalize the features
def normalize(df):
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(df)
    return pd.DataFrame(data_scaled, columns=df.columns)  # Return DataFrame to retain column names

X = normalize(X)

# Calculate VIF function (same as original)
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Calculate VIF before training
vif_data = calculate_vif(X)
print("VIF values:\n", vif_data)

# Train-test split (same as original)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)

# OLS Linear Regression
print("ols linear Regression")
model = sm.OLS(endog=y_train, exog=X_train).fit()
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))

print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Multivariable Linear Regression
print("multivariable linear Regression")
regression = LinearRegression()
model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Decision Tree
print("Decission Tree")
regressor = DecisionTreeRegressor(random_state = 0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Polynomial Regression
print("Polynomial Regression")
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
model = regressor.fit(X_poly, y_train)
y_pred = model.predict(poly_reg.transform(X_test))

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# RandomForestRegressor
print("RandomForestRegressor")
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))


  df_log = df_log.applymap(lambda x: np.log(x) if x > 0 else 0)  # Apply log transform only to positive values


VIF values:
                                            Feature        VIF
0                         Kindly mention your age?   7.550438
1             How many members are in your family?  17.566814
2                    Your matric marks percentage?  29.770506
3                   Your Fsc/Ics marks percentage?  26.783510
4                   Please mention your NTS score?  25.717172
5        How much you have interest in this domain  33.557429
6   Are you satisfied with your program selection?  26.264332
7                Please provide your current CGPA?  33.243398
8       Do you find your CS/SE subjects difficult?  11.234943
9                       Do you love your subjects?  15.661183
10                              How is your health  15.219516
11            Do you like your teacher methodology   8.187363
12          Kindly specify do you love travelling?  11.065933
13                      Do you love reading books?   4.472995
ols linear Regression
MAE Score:  0.04964231008953195
MSE

# **with Mutual Information**

In [2]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import math
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv('students_responses_main.csv')
df_org = df.copy()
df.head()

# Data cleaning steps
df.drop(columns= df.columns[88], inplace=True)
df.drop(columns= df.columns[74], inplace=True)
df.drop(columns= df.columns[58], inplace=True)
df.drop(columns = df.columns[12:50], inplace=True)
df.drop(columns= df.columns[10], inplace=True)
df.drop(columns= df.columns[8], inplace=True)
df.drop(columns= df.columns[7], inplace=True)
df.drop(columns= df.columns[0], inplace=True)

# Modify the semester column
df['Kindly choose your current semester.'] = df['Kindly choose your current semester.'].str.replace('2 Semester','1 Semester',regex=True).str.replace('3 Semester','2 Semester',regex=True).str.replace ('4 Semester','3 Semester',regex=True).str.replace ('5 Semester','4 Semester',regex=True).str.replace ('6 Semester','5 Semester',regex=True).str.replace('7 Semester','6 Semester',regex=True).str.replace('8 Semester','7 Semester',regex=True)

# Fix the matric and Fsc/Ics marks with log transformation
df['Your matric marks percentage?'] = np.where(df['Your matric marks percentage?'] > 100,
                                               df['Your matric marks percentage?'] * 100 / 1100,
                                               df['Your matric marks percentage?'])
df["Your Fsc/Ics marks percentage?"] = np.where(df["Your Fsc/Ics marks percentage?"] > 100,
                                                df["Your Fsc/Ics marks percentage?"] * 100 / 1100,
                                                df["Your Fsc/Ics marks percentage?"])

# Apply log transformation to these columns
df['Your matric marks percentage?'] = df['Your matric marks percentage?'].apply(lambda x: np.log(x + 1) if x > 0 else 0)
df["Your Fsc/Ics marks percentage?"] = df["Your Fsc/Ics marks percentage?"].apply(lambda x: np.log(x + 1) if x > 0 else 0)

df.round(decimals=2)

# Handle outliers using IQR method
df = df.select_dtypes(include=['number'])
Q1 = df.quantile(0.15)
Q3 = df.quantile(0.85)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Factorize categorical columns
def factorize_fun(data):
    obj_cols = data.loc[:, data.dtypes == object].columns
    for col in obj_cols:
        data[col] = pd.factorize(data[col])[0] + 1
    return data

df = factorize_fun(df)

# Impute missing values using KNN
def knn_null(df):
    imputer = KNNImputer(n_neighbors=2)
    df1 = imputer.fit_transform(df)
    df2 = pd.DataFrame(df1, columns = df.columns)
    return df2

df = knn_null(df)

# Split data into features (X) and target (y)
X = df.drop(columns=['Please mention your Previous Semester GPA?'])
y = df['Please mention your Previous Semester GPA?']

# Normalize the features
def normalize(df):
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(df)
    return pd.DataFrame(data_scaled, columns=df.columns)  # Return DataFrame to retain column names

X = normalize(X)

# Mutual Information for Feature Selection (regression)
def mutual_info_selection(X, y, k=10):
    # Select top k features based on mutual information for regression
    mi_selector = SelectKBest(mutual_info_regression, k=k)
    X_new = mi_selector.fit_transform(X, y)
    selected_columns = X.columns[mi_selector.get_support()]
    print(f"Selected features after Mutual Information test: {selected_columns}")
    return X_new, selected_columns

X_new, selected_columns = mutual_info_selection(X, y, k=10)

# Train-test split with selected features
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.1, random_state=4)

# Linear Regression Model
print("OLS Linear Regression")
model = sm.OLS(endog=y_train, exog=X_train).fit()
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Multivariable Linear Regression using Sklearn
print("Multivariable Linear Regression")
regression = LinearRegression()
model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Decision Tree Regressor
print("Decision Tree")
regressor = DecisionTreeRegressor(random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Polynomial Regression
print("Polynomial Regression")
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
model = regressor.fit(X_poly, y_train)
y_pred = model.predict(poly_reg.transform(X_test))

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# RandomForestRegressor
print("Random Forest Regressor")
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test, y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))


Selected features after Mutual Information test: Index(['Kindly mention your age?', 'How many members are in your family?',
       'Your matric marks percentage?', 'Your Fsc/Ics marks percentage?',
       'Please mention your NTS score?',
       'Are you satisfied with your program selection?',
       'Please provide your current CGPA?',
       'Do you find your CS/SE subjects difficult?',
       'Do you love your subjects?', 'Do you love reading books?'],
      dtype='object')
OLS Linear Regression
MAE Score:  0.25450171730785004
MSE Score:  0.19191155364123239
RMSE Score:  0.4380771092413211
R2 score : 0.57
MAPE Score: 0.13
Multivariable Linear Regression
MAE Score:  0.2166478101628042
MSE Score:  0.17534270200162239
RMSE Score:  0.41873942016679344
R2 score : 0.61
MAPE Score: 0.11
Decision Tree
MAE Score:  0.2619824561403508
MSE Score:  0.20864577192982456
RMSE Score:  0.4567775956960067
R2 score : 0.53
MAPE Score: 0.13
Polynomial Regression
MAE Score:  1.70571654976469
MSE Score:  

# **CHI-square**

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm

# Load dataset
df = pd.read_csv('students_responses_main.csv')
df_org = df.copy()

# Data cleaning steps
df.drop(columns= df.columns[88], inplace=True)
df.drop(columns= df.columns[74], inplace=True)
df.drop(columns= df.columns[58], inplace=True)
df.drop(columns = df.columns[12:50], inplace=True)
df.drop(columns= df.columns[10], inplace=True)
df.drop(columns= df.columns[8], inplace=True)
df.drop(columns= df.columns[7], inplace=True)
df.drop(columns= df.columns[0], inplace=True)

# Modify the semester column
df['Kindly choose your current semester.'] = df['Kindly choose your current semester.'].str.replace('2 Semester','1 Semester',regex=True).str.replace('3 Semester','2 Semester',regex=True).str.replace ('4 Semester','3 Semester',regex=True).str.replace ('5 Semester','4 Semester',regex=True).str.replace ('6 Semester','5 Semester',regex=True).str.replace('7 Semester','6 Semester',regex=True).str.replace('8 Semester','7 Semester',regex=True)

# Fix the matric and Fsc/Ics marks
df['Your matric marks percentage?'] = np.where(df['Your matric marks percentage?'] > 100,
                                               df['Your matric marks percentage?'] * 100 / 1100,
                                               df['Your matric marks percentage?'])

df["Your Fsc/Ics marks percentage?"] = np.where(df["Your Fsc/Ics marks percentage?"] > 100,
                                                df["Your Fsc/Ics marks percentage?"]* 100/1100,
                                                df["Your Fsc/Ics marks percentage?"])

df.round(decimals=2)

# Handle outliers using IQR method
df = df.select_dtypes(include=['number'])
Q1 = df.quantile(0.15)
Q3 = df.quantile(0.85)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Factorize categorical columns
def factorize_fun(data):
    obj_cols = data.loc[:, data.dtypes == object].columns
    for col in obj_cols:
        data[col] = pd.factorize(data[col])[0] +1
    return data

df = factorize_fun(df)

# Impute missing values using KNN
def knn_null(df):
    imputer = KNNImputer(n_neighbors=2)
    df1 = imputer.fit_transform(df)
    df2 = pd.DataFrame(df1, columns = df.columns)
    return df2

df = knn_null(df)

# Split data into features (X) and target (y)
X = df.drop(columns=['Please mention your Previous Semester GPA?'])
y = df['Please mention your Previous Semester GPA?']

# Apply log transformation to features
def log_transform(df):
    return np.log1p(df)

X_log = log_transform(X)

# Modified MAPE calculation to handle small values in the denominator
def mean_absolute_percentage_error(y_true, y_pred):
    # Adding a small epsilon value to the denominator to prevent division by zero
    epsilon = 1e-10
    return np.mean(np.abs((y_true - y_pred) / (y_true + epsilon))) * 100

# Use this modified version of MAPE in your code where needed
# print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Normalize the features
def normalize(df):
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(df)
    return pd.DataFrame(data_scaled, columns=df.columns)  # Return DataFrame to retain column names

X_log = normalize(X_log)

# Discretize the target variable to make it categorical
def discretize_target(y, n_bins=5):
    discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    y_discretized = discretizer.fit_transform(y.values.reshape(-1, 1))
    return y_discretized.flatten()  # Flatten the array to avoid shape issues

y_discretized = discretize_target(y)

# Chi-Square feature selection
def chi_square_selection(X, y, k=10):
    chi2_selector = SelectKBest(chi2, k=k)
    X_new = chi2_selector.fit_transform(X, y)
    selected_columns = X.columns[chi2_selector.get_support()]
    print(f"Selected features after Chi-Square test: {selected_columns}")
    return X_new, selected_columns

X_new, selected_columns = chi_square_selection(X_log, y_discretized, k=10)

# Train-test split with selected features
X_train, X_test, y_train, y_test = train_test_split(X_new, y_discretized, test_size=0.1, random_state=4)

# Linear Regression using statsmodels (OLS)
print("OLS Linear Regression")
model = sm.OLS(endog=y_train, exog=X_train).fit()
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Linear Regression using sklearn
print("Multivariable Linear Regression")
regression = LinearRegression()
model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Decision Tree Regression
print("Decision Tree")
regressor = DecisionTreeRegressor(random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Polynomial Regression
print("Polynomial Regression")
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
model = regressor.fit(X_poly, y_train)
y_pred = model.predict(poly_reg.transform(X_test))

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Random Forest Regressor
print("Random Forest Regressor")
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))


Selected features after Chi-Square test: Index(['Kindly mention your age?', 'Your Fsc/Ics marks percentage?',
       'Please mention your NTS score?',
       'How much you have interest in this domain',
       'Are you satisfied with your program selection?',
       'Please provide your current CGPA?',
       'Do you find your CS/SE subjects difficult?',
       'Do you love your subjects?', 'Kindly specify do you love travelling?',
       'Do you love reading books?'],
      dtype='object')
OLS Linear Regression
MAE Score:  0.2935206848407386
MSE Score:  0.1418303995426236
RMSE Score:  0.3766037699527497
R2 score : 0.81
MAPE Score: 3346783332.77
Multivariable Linear Regression
MAE Score:  0.31316518040673935
MSE Score:  0.15906548517979346
RMSE Score:  0.39883014577611037
R2 score : 0.79
MAPE Score: 6633517760.52
Decision Tree
MAE Score:  0.39655172413793105
MSE Score:  0.43103448275862066
RMSE Score:  0.6565321642986127
R2 score : 0.42
MAPE Score: 15.37
Polynomial Regression
MAE Score

# **with Pearson Correlation Coefficient **

In [7]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import math
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

# Load dataset
df = pd.read_csv('/content/students_responses_main.csv')
df_org = df.copy()

# Drop unnecessary columns
df.drop(columns= df.columns[88], inplace=True)
df.drop(columns= df.columns[74], inplace=True)
df.drop(columns= df.columns[58], inplace=True)
df.drop(columns = df.columns[12:50], inplace = True)
df.drop(columns= df.columns[10], inplace=True)
df.drop(columns= df.columns[8], inplace=True)
df.drop(columns= df.columns[7], inplace=True)
df.drop(columns= df.columns[0], inplace=True)

# Data cleaning
df['Kindly choose your current semester.'] = df['Kindly choose your current semester.'].str.replace('2 Semester','1 Semester',regex=True).str.replace('3 Semester','2 Semester',regex=True).str.replace('4 Semester','3 Semester',regex=True).str.replace('5 Semester','4 Semester',regex=True).str.replace('6 Semester','5 Semester',regex=True).str.replace('7 Semester','6 Semester',regex=True).str.replace('8 Semester','7 Semester',regex=True)

df['Your matric marks percentage?'] = np.where(df['Your matric marks percentage?'] > 100,
                                               df['Your matric marks percentage?'] * 100 / 1100,
                                               df['Your matric marks percentage?'])

df["Your Fsc/Ics marks percentage?"] = np.where(df["Your Fsc/Ics marks percentage?"] > 100,
                                                df["Your Fsc/Ics marks percentage?"]* 100/1100,
                                                df["Your Fsc/Ics marks percentage?"])

df.round(decimals=2)

# Remove outliers based on IQR
df = df.select_dtypes(include=['number'])
Q1 = df.quantile(0.15)
Q3 = df.quantile(0.85)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.describe()

# Function to factorize object columns
def factorize_fun(data):
    obj_cols = data.loc[:, data.dtypes == object].columns
    for col in obj_cols:
        data[col] = pd.factorize(data[col])[0] + 1
    return data

df = factorize_fun(df)

# Handle missing data using KNN Imputation
def knn_null(df):
    imputer = KNNImputer(n_neighbors=2)
    df1 = imputer.fit_transform(df)
    df2 = pd.DataFrame(df1, columns = df.columns)
    return df2

df = knn_null(df)

# Normalize the data
def normalize(df):
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(df)
    return data_scaled

# Apply Log Transformation to the features and target
df = df.applymap(lambda x: np.log(x + 1) if isinstance(x, (int, float)) and x > 0 else x)  # Log transform for all numeric values
df['Please mention your Previous Semester GPA?'] = np.log(df['Please mention your Previous Semester GPA?'] + 1)

# Correlation matrix
correlation_matrix = df.corr()

# Split the data
X = df.drop(columns=['Please mention your Previous Semester GPA?'])
X = normalize(X)

y = df['Please mention your Previous Semester GPA?']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)

# Define function to calculate PCC
def pearson_correlation_coefficient(y_true, y_pred):
    return np.corrcoef(y_true, y_pred)[0, 1]

# Define function to calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred, clip_threshold=1.0):
    # Clip the values in y_true to avoid division by very small numbers (close to 0)
    y_true_clipped = np.clip(y_true, clip_threshold, np.inf)  # Clip values below `clip_threshold`
    return np.mean(np.abs((y_true_clipped - y_pred) / y_true_clipped)) * 100

# Linear Regression using statsmodels (OLS)
print("OLS Linear Regression")
model = sm.OLS(endog=y_train, exog=X_train).fit()
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Linear Regression using sklearn
print("Multivariable Linear Regression")
regression = LinearRegression()
model = regression.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Decision Tree Regression
print("Decision Tree")
regressor = DecisionTreeRegressor(random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Polynomial Regression
print("Polynomial Regression")
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
model = regressor.fit(X_poly, y_train)
y_pred = model.predict(poly_reg.transform(X_test))

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))

# Random Forest Regressor
print("Random Forest Regressor")
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
model = regressor.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("MAE Score: ", mean_absolute_error(y_test, y_pred))
print("MSE Score: ", mean_squared_error(y_test, y_pred))
print("RMSE Score: ", math.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 score : %.2f" % r2_score(y_test,y_pred))
print("MAPE Score: %.2f" % mean_absolute_percentage_error(y_test, y_pred))


  df = df.applymap(lambda x: np.log(x + 1) if isinstance(x, (int, float)) and x > 0 else x)  # Log transform for all numeric values


OLS Linear Regression
MAE Score:  0.05245316808061698
MSE Score:  0.004379135498727865
RMSE Score:  0.06617503682452971
R2 score : 0.30
MAPE Score: 17.36
Multivariable Linear Regression
MAE Score:  0.01606182948776756
MSE Score:  0.000603202897669323
RMSE Score:  0.024560189284069514
R2 score : 0.90
MAPE Score: 16.01
Decision Tree
MAE Score:  0.03378107342033408
MSE Score:  0.004126990259173644
RMSE Score:  0.06424165517149791
R2 score : 0.34
MAPE Score: 16.34
Polynomial Regression
MAE Score:  0.08670118706671123
MSE Score:  0.01763474782124966
RMSE Score:  0.13279588781754373
R2 score : -1.81
MAPE Score: 18.45
Random Forest Regressor
MAE Score:  0.01850103711377303
MSE Score:  0.0007251488516715618
RMSE Score:  0.026928587999959484
R2 score : 0.88
MAPE Score: 16.18
