### Testing different Models to find the best model

In [19]:
import pandas as pd
df = pd.read_csv('function_quality_scores.csv')
print(df.head())
print(df.isnull().sum())
print(df.dtypes)
print(df.describe())

                 Function_Name  Cyclomatic_Complexity  Function_Length  \
0  find_most_frequent_elements                      2                4   
1                     do_thing                      4                6   
2               reverse_string                      1                3   
3               calculate_area                      2                5   
4                   sum_values                      1                2   

   Number_of_Loops  Modularity  Comment_Quality  Naming_Quality  Final_Score  
0                1           9                7               8          8.0  
1                2           3                0               3          2.0  
2                1          10                9              10          9.0  
3                0           9                6               7          7.0  
4                0          10                8               9          8.0  
Function_Name            0
Cyclomatic_Complexity    0
Function_Length          0


In [20]:
from sklearn.preprocessing import MinMaxScaler
columns_to_scale = ['Cyclomatic_Complexity', 'Function_Length', 'Modularity',  'Number_of_Loops','Comment_Quality', 'Naming_Quality']
# Min-Max scaling
scaler = MinMaxScaler()
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])


In [21]:
# Correlation
numeric_columns = df.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()
# Print the correlation matrix
print(correlation_matrix)


                       Cyclomatic_Complexity  Function_Length  \
Cyclomatic_Complexity               1.000000         0.641808   
Function_Length                     0.641808         1.000000   
Number_of_Loops                     0.574696         0.798431   
Modularity                         -0.511331        -0.425850   
Comment_Quality                    -0.300791        -0.487168   
Naming_Quality                     -0.057087        -0.517978   
Final_Score                        -0.236080        -0.528249   

                       Number_of_Loops  Modularity  Comment_Quality  \
Cyclomatic_Complexity         0.574696   -0.511331        -0.300791   
Function_Length               0.798431   -0.425850        -0.487168   
Number_of_Loops               1.000000   -0.371367        -0.503658   
Modularity                   -0.371367    1.000000         0.352138   
Comment_Quality              -0.503658    0.352138         1.000000   
Naming_Quality               -0.506513    0.462918   

In [22]:
X = numeric_columns.drop(columns=['Final_Score'])  # Features
y = df['Final_Score']  # Target (Final Score)


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
#RandomForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

modelRF = RandomForestRegressor(random_state=42)
modelRF.fit(X_train, y_train)

# Predictions
y_pred = modelRF.predict(X_test)

# Evaluate the model
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))


Mean Absolute Error: 0.4659583333333333
R-squared: 0.9673339572318572


In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

modelLR = LinearRegression()
modelLR.fit(X_train, y_train)

# Predictions
y_pred = modelLR.predict(X_test)

# Evaluate the model
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))


Mean Absolute Error: 0.37780716957105015
R-squared: 0.9809062104631733


In [27]:
from sklearn.tree import DecisionTreeRegressor

modelDT = DecisionTreeRegressor(random_state=42)
modelDT.fit(X_train, y_train)

# Predictions
y_pred = modelDT.predict(X_test)

# Evaluate the model
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))


Mean Absolute Error: 0.875
R-squared: 0.8686647583768311


In [28]:
from sklearn.svm import SVR

modelSVR = SVR()
modelSVR.fit(X_train, y_train)

# Predictions
y_pred = modelSVR.predict(X_test)

# Evaluate the model
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))


Mean Absolute Error: 0.6534615806354951
R-squared: 0.9324477089435264


In [29]:
from sklearn.ensemble import GradientBoostingRegressor

modelGB = GradientBoostingRegressor(random_state=42)
modelGB.fit(X_train, y_train)

# Predictions
y_pred = modelGB.predict(X_test)

# Evaluate the model
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))


Mean Absolute Error: 0.40590710267047087
R-squared: 0.9761764845812602


In [30]:
from sklearn.neighbors import KNeighborsRegressor

modelKNN = KNeighborsRegressor(n_neighbors=5)
modelKNN.fit(X_train, y_train)

# Predictions
y_pred = modelKNN.predict(X_test)

# Evaluate the model
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))


Mean Absolute Error: 0.8416666666666667
R-squared: 0.8925879777740361


In [31]:
from sklearn.linear_model import Lasso

modelLasso = Lasso(alpha=0.1)
modelLasso.fit(X_train, y_train)

# Predictions
y_pred = modelLasso.predict(X_test)

# Evaluate the model
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))


Mean Absolute Error: 0.5815794067929102
R-squared: 0.962429828041204


In [32]:
from sklearn.linear_model import Ridge

modelRidge = Ridge(alpha=1.0)
modelRidge.fit(X_train, y_train)

# Predictions
y_pred = modelRidge.predict(X_test)

# Evaluate the model
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))


Mean Absolute Error: 0.5235931647611062
R-squared: 0.9679289513258491


### Best Performance:  Random Forest, Linear, Gradient, Lasso, Ridge Regressions