In [None]:
import pandas as pd

# Read the glioma grading table from the CSV in the working directory
df = pd.read_csv('glioma_grading.csv')

# Report how many null entries each column holds
null_counts = df.isnull().sum()
print(null_counts)

# Discard rows that contain any null, then discard exact repeats of earlier rows
df = df.dropna().drop_duplicates()


Grade               0
Gender              0
Age_at_diagnosis    0
Race                0
IDH1                0
TP53                0
ATRX                0
PTEN                0
EGFR                0
CIC                 0
MUC16               0
PIK3CA              0
NF1                 0
PIK3R1              0
FUBP1               0
RB1                 0
NOTCH1              0
BCOR                0
CSMD3               0
SMARCA4             0
GRIN2A              0
IDH2                0
FAT4                0
PDGFRA              0
dtype: int64


In [None]:
# Display the first rows of df as a quick visual check (head() defaults to five rows)
df.head()

Unnamed: 0,Grade,Gender,Age_at_diagnosis,Race,IDH1,TP53,ATRX,PTEN,EGFR,CIC,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,0,0,51.3,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,38.72,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,35.17,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,32.78,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,31.51,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column (y) from the predictor columns (X)
y = df['Grade']
X = df.drop(columns='Grade')

# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of every piece as a sanity check
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((670, 23), (168, 23), (670,), (168,))

In [None]:
# Impute any missing numeric values with the per-column mean.
#
# This cell previously applied a mean fill immediately followed by a median
# fill. The second call could never change anything: every NaN the median
# could fill had already been replaced by the mean. Only one strategy is
# kept so the cell says what it actually does. To use the median instead,
# swap df.mean(...) for df.median(...) below.
#
# numeric_only=True restricts the statistic to numeric columns, so the cell
# does not raise if the frame ever contains a non-numeric column.
df = df.fillna(df.mean(numeric_only=True))


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the age column using Min-Max scaling.
#
# Two problems with scaling only df['Age_at_diagnosis'] on the full frame:
#   1. In notebook order, X was already copied out of df (df.drop returns a
#      new frame), so rescaling df alone never reaches the data the models
#      are trained on.
#   2. Fitting the scaler on all rows lets the test rows influence the
#      min/max used for the training rows.
# The scaler is therefore fitted on the training rows only and then applied
# to every frame that carries the column. Later cells re-split X with the
# same seed, so they pick up the scaled values. Test rows may fall slightly
# outside [0, 1]; that is expected.
AGE_COL = 'Age_at_diagnosis'

scaler = MinMaxScaler()
scaler.fit(X_train[[AGE_COL]])


def _with_scaled_age(frame):
    """Return a copy of `frame` whose age column is rescaled by the fitted scaler."""
    scaled_age = scaler.transform(frame[[AGE_COL]]).ravel()
    return frame.assign(**{AGE_COL: scaled_age})


df = _with_scaled_age(df)
X = _with_scaled_age(X)
X_train = _with_scaled_age(X_train)
X_test = _with_scaled_age(X_test)


Applying Three Regression/Classification Models

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the model. The fixed seed makes the forest's random choices
# repeatable, so the accuracy printed below is the same on every run.
rf = RandomForestClassifier(random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Predict the test set
y_pred = rf.predict(X_test)

# Evaluate the model: fraction of test rows whose predicted grade matches the label
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))


Accuracy: 82.14%


Support Vector Machine (SVM) Classifier

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Build a support vector classifier with default settings and fit it
# on the training split (fit returns the estimator itself)
svm = SVC().fit(X_train, y_train)

# Label the held-out rows
y_pred = svm.predict(X_test)

# Summarise per-class precision, recall and F1 on the held-out rows
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.76      0.79      0.78        97
           1       0.70      0.66      0.68        71

    accuracy                           0.74       168
   macro avg       0.73      0.73      0.73       168
weighted avg       0.74      0.74      0.74       168



Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Build a gradient-boosted regression ensemble with default settings and
# fit it on the training split (fit returns the estimator itself)
gbr = GradientBoostingRegressor().fit(X_train, y_train)

# Produce continuous predictions for the held-out rows
y_pred = gbr.predict(X_test)

# Score the predictions by their mean squared distance from the true labels
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: {:.2f}".format(mse))


Mean Squared Error: 0.13


Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each random-forest hyperparameter; every combination is tried
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Exhaustive search over the grid with 5-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1, verbose=2)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the winning combination
print("Best Hyperparameters: ", grid_search.best_params_)

# Score the held-out rows with the search object, which predicts using the best model found
y_pred = grid_search.predict(X_test)

# Fraction of held-out rows labelled correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Hyperparameters:  {'bootstrap': True, 'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy: 83.93%


Comparing The Results

In [None]:

# Rebuild the same 80/20 partition used by the earlier cells
# (X and y are the feature matrix and labels defined above)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Classifier (baseline, default hyperparameters).
# Seeded with the same value as the tuned model below, so the before/after
# difference reflects the hyperparameters rather than run-to-run randomness.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# SVM Classifier
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)

# After hyperparameter tuning for Random Forest: use the combination that
# the grid search above actually selected, not hand-typed example values,
# so that "after tuning" really reports the tuned configuration.
rf_tuned = RandomForestClassifier(random_state=42, **grid_search.best_params_)
rf_tuned.fit(X_train, y_train)
y_pred_rf_tuned = rf_tuned.predict(X_test)

# Compare the accuracy and performance
print("Random Forest Classifier Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_rf)*100))
print("SVM Classifier Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_svm)*100))
print("Gradient Boosting Regressor MSE: {:.2f}".format(mean_squared_error(y_test, y_pred_gbr)))

# Compare the accuracy of the Random Forest Classifier before and after hyperparameter tuning
print("Random Forest Classifier Accuracy before tuning: {:.2f}%".format(accuracy_score(y_test, y_pred_rf)*100))
print("Random Forest Classifier Accuracy after tuning: {:.2f}%".format(accuracy_score(y_test, y_pred_rf_tuned)*100))

