### Random Forest Training, Testing & Pickle Dumping

In [1]:
import pandas as pd

# Load the function-quality dataset from the working directory.
df = pd.read_csv('function_quality_scores.csv')

# Quick sanity report, printed as four newline-separated sections:
# first rows, per-column missing-value counts, column dtypes, and
# summary statistics of the numeric columns.
print(df.head(), df.isna().sum(), df.dtypes, df.describe(), sep="\n")

                 Function_Name  Cyclomatic_Complexity  Function_Length  \
0  find_most_frequent_elements                      2                4   
1                     do_thing                      4                6   
2               reverse_string                      1                3   
3               calculate_area                      2                5   
4                   sum_values                      1                2   

   Number_of_Loops  Modularity  Comment_Quality  Naming_Quality  Final_Score  
0                1           9                7               8          8.0  
1                2           3                0               3          2.0  
2                1          10                9              10          9.0  
3                0           9                6               7          7.0  
4                0          10                8               9          8.0  
Function_Name            0
Cyclomatic_Complexity    0
Function_Length          0


### Data Pre-Processing

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Feature columns that get min-max scaled; 'Final_Score' (the target)
# and 'Function_Name' are left untouched.
columns_to_scale = [
    'Cyclomatic_Complexity',
    'Function_Length',
    'Modularity',
    'Number_of_Loops',
    'Comment_Quality',
    'Naming_Quality',
]

# Learn per-column min/max from the current contents of `df`, then
# overwrite those columns in place with their scaled values.
# NOTE(review): because `df` is overwritten, re-running this cell refits
# the scaler on already-scaled values -- run it once per fresh load.
# NOTE(review): the scaler is fitted on all rows, before the train/test
# split performed later in the notebook -- confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(df[columns_to_scale])
df[columns_to_scale] = scaler.transform(df[columns_to_scale])


In [5]:
# Preview only the first rows instead of rendering the entire DataFrame,
# so the saved notebook stays small and readable however large the CSV is.
df.head(10)

Unnamed: 0,Function_Name,Cyclomatic_Complexity,Function_Length,Number_of_Loops,Modularity,Comment_Quality,Naming_Quality,Final_Score
0,find_most_frequent_elements,0.111111,0.25,0.2,0.888889,0.7,0.8,8.0
1,do_thing,0.333333,0.5,0.4,0.222222,0.0,0.3,2.0
2,reverse_string,0.0,0.125,0.2,1.0,0.9,1.0,9.0
3,calculate_area,0.111111,0.375,0.0,0.888889,0.6,0.7,7.0
4,sum_values,0.0,0.0,0.0,1.0,0.8,0.9,8.0
5,find_duplicates,0.222222,0.625,0.4,0.333333,0.3,0.5,4.0
6,generate_random_numbers,0.0,0.375,0.2,0.777778,0.7,0.9,8.0
7,filter_odd_numbers,0.0,0.25,0.2,1.0,0.8,0.9,9.0
8,calculate_average,0.111111,0.375,0.2,0.888889,0.7,0.8,8.0
9,extract_unique_elements,0.0,0.25,0.2,1.0,0.6,0.9,8.0


In [6]:
# Correlation Matrix
# Restrict to the float64/int64 columns (this drops the text column
# 'Function_Name'); `numeric_columns` is reused later to build the features.
numeric_columns = df.select_dtypes(include=['float64', 'int64'])

# Pairwise Pearson correlation between every pair of numeric columns,
# target included, printed as plain text.
correlation_matrix = numeric_columns.corr(method='pearson')
print(correlation_matrix)


                       Cyclomatic_Complexity  Function_Length  \
Cyclomatic_Complexity               1.000000         0.641808   
Function_Length                     0.641808         1.000000   
Number_of_Loops                     0.574696         0.798431   
Modularity                         -0.511331        -0.425850   
Comment_Quality                    -0.300791        -0.487168   
Naming_Quality                     -0.057087        -0.517978   
Final_Score                        -0.236080        -0.528249   

                       Number_of_Loops  Modularity  Comment_Quality  \
Cyclomatic_Complexity         0.574696   -0.511331        -0.300791   
Function_Length               0.798431   -0.425850        -0.487168   
Number_of_Loops               1.000000   -0.371367        -0.503658   
Modularity                   -0.371367    1.000000         0.352138   
Comment_Quality              -0.503658    0.352138         1.000000   
Naming_Quality               -0.506513    0.462918   

In [9]:
# Target (Final Score): the column the regressor learns to predict.
y = df['Final_Score']

# Features: every numeric column except the target, in the DataFrame's
# own column order.
# NOTE(review): that order puts 'Number_of_Loops' before 'Modularity',
# whereas `columns_to_scale` (used to fit `scaler`) lists them the other
# way round -- confirm inference code feeds each object its own order.
X = numeric_columns.drop(columns='Final_Score')

### Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Training, Testing and Evaluation

In [12]:
# Random forest regressor: fit on the training split, score on the hold-out.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Default hyperparameters; the seed makes the fitted forest reproducible.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows only.
y_pred = model.predict(X_test)

# Report absolute error (in Final_Score units) and explained variance.
print("Mean Absolute Error:", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R-squared:", r2_score(y_true=y_test, y_pred=y_pred))


Mean Absolute Error: 0.4659583333333333
R-squared: 0.9673339572318572


### Pickle Dumping

In [55]:
import pickle

In [45]:
import os
import pickle  # imported here so the cell does not rely on a separate import cell having been run first

# Destination folder for the serialized artifacts. The MODEL_DIR
# environment variable overrides the location; when it is unset the
# original hard-coded folder is used, so existing setups keep working.
model_directory = os.environ.get('MODEL_DIR', r'D:\Google-Girl-Hackathon-2025\Model')

# Create the folder (and any missing parents) up front so that open()
# below does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(model_directory, exist_ok=True)

# Save the trained model to the full path.
model_path = os.path.join(model_directory, 'ModelRF.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(model, f)

# Save the fitted scaler next to the model so inference code can apply
# the same feature scaling that was used during training.
scaler_path = os.path.join(model_directory, 'scaler.pkl')
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)

print("Model and scaler saved successfully.")

Model and scaler saved successfully.


In [59]:
import os
import pickle  # imported here so the cell does not rely on a separate import cell having been run first

# Destination folder for the serialized model. The MODEL_DIR environment
# variable overrides the location; when it is unset the original
# hard-coded folder is used, so existing setups keep working.
model_directory = os.environ.get('MODEL_DIR', r'D:\Google-Girl-Hackathon-2025\Model')

# Create the folder (and any missing parents) up front so that open()
# below does not fail with FileNotFoundError on a fresh checkout.
os.makedirs(model_directory, exist_ok=True)

# Save whatever `model` currently holds under the file name 'Model.pkl'.
model_path = os.path.join(model_directory, 'Model.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(model, f)