In [1]:

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the preprocessed data and one-hot encode its categorical columns.
# drop_first=True keeps one indicator per category level after the first; the
# code below relies on the resulting 'diagnosis_M' column being present.
dataset = pd.get_dummies(pd.read_csv('preprocessdata.csv'), drop_first=True)

# Target is the 'diagnosis_M' indicator; every remaining column is a feature.
y = dataset['diagnosis_M']
X = dataset.drop(columns=['diagnosis_M'])

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Standardise the features. The scaler is fitted on the training rows only, and the
# fitted transform is then applied to both splits.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Recursive feature elimination driven by a small, seeded random forest: keep the
# five highest-ranked features, then reduce both splits to those columns.
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rfe = RFE(estimator=rf, n_features_to_select=5).fit(X_train, y_train)
X_train, X_test = rfe.transform(X_train), rfe.transform(X_test)

# Tune a random forest with an exhaustive grid search (scored by weighted F1), then
# evaluate the refit best model on the held-out test set.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score

param_grid = {
    'n_estimators': [100],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced', 'balanced_subsample'],
}

# random_state pins the forest's internal sampling so the CV ranking and the test
# metrics below are the same on every Restart & Run All.
grid = GridSearchCV(
    RandomForestClassifier(random_state=0),
    param_grid,
    refit=True,        # refit the best parameter set on all of X_train so grid.predict works
    verbose=3,
    n_jobs=-1,
    scoring='f1_weighted',
)
grid.fit(X_train, y_train)

# NOTE: `re` shadows the name of the standard-library regex module; the name is kept
# because the results-table cell further down reads it.
re = grid.cv_results_

# Test-set evaluation of the refit best estimator.
y_predict = grid.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
clf_report = classification_report(y_test, y_predict)

# Weighted-average F1, i.e. the same averaging as the 'f1_weighted' scorer used for
# model selection. The old variable name and message called this "macro", which it is not.
f1_weighted = f1_score(y_test, y_predict, average='weighted')
f1_macro = f1_weighted  # legacy name kept so existing references keep working
print("The weighted F1 score for the best parameters {}:".format(grid.best_params_), f1_weighted)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Fitting 5 folds for each of 12 candidates, totalling 60 fits
The f1 macro value the best parameter{'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 100}: 0.9736401936741494


In [2]:
# Confusion matrix of the test-set predictions
# (scikit-learn convention: rows = true class, columns = predicted class).
print(cm)

[[66  1]
 [ 2 45]]


In [4]:
# Build the per-class precision / recall / F1 text report for the test-set predictions.
from sklearn.metrics import classification_report
clf_report = classification_report(y_true=y_test, y_pred=y_predict)

In [5]:
# The report is a multi-line string, so print() is used to render its line breaks.
print(clf_report)

              precision    recall  f1-score   support

       False       0.97      0.99      0.98        67
        True       0.98      0.96      0.97        47

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [6]:
# ROC AUC on the test set, scored with column 1 of predict_proba
# (the probability assigned to the second class).
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true=y_test, y_score=grid.predict_proba(X_test)[:, 1])


0.9944426802159416

In [7]:
# Tabulate the grid-search cross-validation results held in `re`
# (one row per parameter combination).
table = pd.DataFrame(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.306317,0.011492,0.0137,0.006059,balanced,gini,sqrt,100,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922215,0.95573,0.922814,0.95573,0.956305,0.942559,0.016368,1
1,0.292839,0.01149,0.014385,0.00873,balanced,gini,log2,100,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922215,0.933594,0.933594,0.944439,0.956305,0.93803,0.011528,5
2,0.309922,0.014099,0.013407,0.004448,balanced,entropy,sqrt,100,"{'class_weight': 'balanced', 'criterion': 'ent...",0.922215,0.922814,0.922814,0.944439,0.956305,0.933717,0.014108,9
3,0.303646,0.004801,0.015541,0.003373,balanced,entropy,log2,100,"{'class_weight': 'balanced', 'criterion': 'ent...",0.922215,0.933594,0.899991,0.944439,0.94552,0.929152,0.016851,12
4,0.308751,0.009226,0.012117,0.006731,balanced,log_loss,sqrt,100,"{'class_weight': 'balanced', 'criterion': 'log...",0.922215,0.933594,0.922814,0.944439,0.956305,0.935874,0.013062,6
5,0.302376,0.004836,0.013583,0.003946,balanced,log_loss,log2,100,"{'class_weight': 'balanced', 'criterion': 'log...",0.910717,0.944867,0.899991,0.95573,0.956305,0.933522,0.023602,10
6,0.363462,0.010682,0.016245,0.004032,balanced_subsample,gini,sqrt,100,"{'class_weight': 'balanced_subsample', 'criter...",0.922215,0.956044,0.922814,0.95573,0.94552,0.940465,0.015139,3
7,0.365798,0.010932,0.015412,0.005729,balanced_subsample,gini,log2,100,"{'class_weight': 'balanced_subsample', 'criter...",0.922215,0.956044,0.933594,0.944439,0.956305,0.94252,0.01318,2
8,0.381903,0.012652,0.010835,0.008302,balanced_subsample,entropy,sqrt,100,"{'class_weight': 'balanced_subsample', 'criter...",0.922215,0.956044,0.922814,0.944439,0.956305,0.940363,0.015192,4
9,0.420487,0.022396,0.013917,0.004404,balanced_subsample,entropy,log2,100,"{'class_weight': 'balanced_subsample', 'criter...",0.922215,0.944439,0.911459,0.944439,0.956305,0.935772,0.016412,7


In [8]:
# Persist the fitted grid search to disk and read it back to confirm it round-trips.
# NOTE: only `grid` is written; the scaler `sc` and the selector `rfe` that prepared its
# training features are not part of this file.
import pickle
filename = "RANDOM_final.sav"

# Context managers close the handles deterministically; the write handle must be
# flushed and closed before the file is read back.
with open(filename, 'wb') as model_file:
    pickle.dump(grid, model_file)

# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)


In [9]:
# Display the feature frame as loaded (before splitting, scaling and feature selection).
X

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.80,1001.0,0.118400,0.22862,0.28241,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,1937.05,0.16220,0.62695,0.7119,0.2654,0.41915,0.11890
1,20.57,17.77,132.90,1326.0,0.084740,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1937.05,0.12380,0.18660,0.2416,0.1860,0.27500,0.08902
2,19.69,21.25,130.00,1203.0,0.109600,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.00,0.14440,0.42450,0.4504,0.2430,0.36130,0.08758
3,11.42,20.38,77.58,386.1,0.133695,0.22862,0.24140,0.10520,0.2464,0.07875,...,14.910,26.50,98.87,567.70,0.19010,0.62695,0.6869,0.2575,0.41915,0.12301
4,20.29,14.34,135.10,1297.0,0.100300,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.00,0.13740,0.20500,0.4000,0.1625,0.23640,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1326.3,0.111000,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,1937.05,0.14100,0.21130,0.4107,0.2216,0.20600,0.07115
565,20.13,28.25,131.20,1261.0,0.097800,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.00,0.11660,0.19220,0.3215,0.1628,0.25720,0.06637
566,16.60,28.08,108.30,858.1,0.084550,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.00,0.11390,0.30940,0.3403,0.1418,0.22180,0.07820
567,20.60,29.33,140.10,1265.0,0.117800,0.22862,0.28241,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.00,0.16500,0.62695,0.7855,0.2650,0.40870,0.12301


In [None]:
def _read_float(prompt):
    """Show `prompt` on stdin and parse the reply as a float."""
    return float(input(prompt))


# Collect the five measurements for one sample, in the order the prediction cell uses them.
radius_mean_input = _read_float("radius_mean: ")
concavity_mean_input = _read_float("concavity_mean: ")
radius_worst_input = _read_float("radius_worst: ")
compactness_worst_input = _read_float("compactness_worst: ")
concavity_worst_input = _read_float("concavity_worst: ")


In [13]:
# Predict the class for one manually entered sample.
#
# `loaded_model` was fitted on features that had been standardised by `sc` and then
# reduced to five columns by `rfe`. The raw measurements typed in above therefore have
# to be standardised with the same training statistics before they reach the model;
# passing them through unscaled (the previous behaviour) compares raw magnitudes against
# split thresholds that were learned on standardised values.
import warnings

selected_mask = rfe.support_  # boolean mask over the original feature columns
entered_names = ['radius_mean', 'concavity_mean', 'radius_worst',
                 'compactness_worst', 'concavity_worst']
selected_names = list(X.columns[selected_mask])
if selected_names != entered_names:
    # The five inputs are hard-coded; flag it if they no longer line up with what RFE kept.
    warnings.warn(
        "RFE selected {} but the sample was entered as {}".format(selected_names, entered_names)
    )

raw_sample = np.array([[radius_mean_input, concavity_mean_input, radius_worst_input,
                        compactness_worst_input, concavity_worst_input]])
# Same arithmetic StandardScaler.transform applies, restricted to the selected columns.
scaled_sample = (raw_sample - sc.mean_[selected_mask]) / sc.scale_[selected_mask]

future_Prediction = loaded_model.predict(scaled_sample)
print("Future_Prediction={}".format(future_Prediction))

Future_Prediction=[ True]
