In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Lasso
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import matplotlib 
import seaborn as sns
# Default to a large canvas for every figure drawn in this notebook.
plt.rcParams["figure.figsize"] = (20, 10)

# Import and process the data

In [None]:
# Kaggle input location of the anemia dataset.
ANEMIA_CSV = '../input/anemia-dataset/anemia.csv'
df = pd.read_csv(ANEMIA_CSV)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(5)

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Print the column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count the distinct values held by each column.
features = df.columns
for feature in features:
    distinct_values = df[feature].nunique()
    print(f'{feature}--->{distinct_values}')

In [None]:
# Summary statistics, one row per column, ordered by descending mean.
summary_stats = df.describe().T.sort_values(by='mean', ascending=False)
# Shade every cell by magnitude and draw in-cell bars for `std` and `mean`.
(
    summary_stats.style
    .background_gradient(cmap='BuGn')
    .bar(subset=['std'], color='red')
    .bar(subset=['mean'], color='blue')
)

In [None]:
df.isnull().sum()

In [None]:
df.duplicated()

In [None]:
df=df.drop_duplicates()

In [None]:
df.duplicated()

In [None]:
df.shape

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
correl = df.corr()
fig, ax = plt.subplots(figsize=(15, 7))
sns.heatmap(correl, annot=True, ax=ax)

In [None]:
# List the column labels of the frame.
df.columns

In [None]:
# Overlay every column as a line against the row index on one figure.
plt.figure(figsize=(12, 9))
# Columns are listed last-to-first, which fixes the drawing and legend order.
features = ['Result', 'MCV', 'MCHC', 'MCH', 'Hemoglobin', 'Gender']
for feature in features:
    plt.plot(df[feature], label=feature)
plt.legend()
plt.show()

In [None]:
pd.crosstab(df['Gender'], df['Result'])

In [None]:
pd.crosstab(df['Gender'], df['Result']).plot(kind='bar')

In [None]:
sns.countplot(x='Gender',data=df)
plt.show()

In [None]:
#pd.crosstab(df['Gender'], df['Hemoglobin']).plot(kind='bar')


In [None]:
sns.pairplot(df)

In [None]:
plt.figure(figsize = (6,5))
sns.boxplot(x='Gender',y='Hemoglobin',data=df)
plt.show()
plt.figure(figsize = (6,5))
sns.boxplot(x='Gender',y='MCHC',data=df)
plt.show()

In [None]:
# List the column labels again before plotting a single column.
df.columns

In [None]:
# Distribution of MCH. `sns.distplot` is deprecated, so the same
# density-scaled histogram with a KDE overlay is drawn through `histplot`.
sns.histplot(df['MCH'], kde=True, stat='density')
plt.show()

# The model

In [None]:
X=df.drop(['Result'],axis='columns')

In [None]:
y=df.Result

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed makes the split repeatable
# across kernel restarts, and stratifying keeps the class ratio of `Result`
# the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
# Candidate classifiers and the hyper-parameter grid searched for each one.
# Every entry maps a display name to {'model': estimator, 'params': grid}.
model_params = {
    'random_forest': {
        # Seeded so repeated searches give the same scores.
        'model': RandomForestClassifier(random_state=0),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'lasso': {
        # `Result` is a class label, so the L1 ("lasso") penalty is applied
        # through a classifier. scikit-learn's `Lasso` is a regressor whose
        # default score is R^2, which cannot be compared with the accuracy
        # reported for the other candidates.
        'model': LogisticRegression(penalty='l1', solver='liblinear'),
        'params': {
            # C is the inverse of the regularisation strength.
            'C': [0.5, 1]
        }
    },
    'logistic_regression': {
        # `multi_class` is left at its default ('auto'); passing it
        # explicitly is deprecated in recent scikit-learn releases.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

In [None]:
# Grid-search every candidate over the same five shuffled 80/20 splits and
# collect each one's best cross-validated score and parameters.
scores = []
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# Stored under its own name: rebinding `df` here would replace the anemia
# dataset and break any earlier cell that is run again.
scores_df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
scores_df

In [None]:
# Final model: a random forest with default hyper-parameters. The seed makes
# refitting on the same training rows produce the same forest.
reg = RandomForestClassifier(random_state=0)
reg.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted classifier on the held-out test rows.
reg.score( X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Predict the held-out rows, then tabulate true labels (rows) against
# predicted labels (columns).
y_predicted = reg.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm

In [None]:
# Confusion matrix as a heatmap, drawn with the notebook's existing `sns`
# alias. `fmt='d'` prints the cell counts as whole numbers; the default
# format switches to scientific notation from 100 upwards.
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()

In [None]:
# Per-class precision, recall and F1 for the test-set predictions.
print(classification_report(y_test, y_predicted))

In [None]:
# Predicted labels for the test rows.
reg.predict(X_test)

In [None]:
def predict_anim(Gender, Hemoglobin, MCH, MCHC, MCV, model=None, feature_names=None):
    """Predict the anemia label for a single patient.

    Values are matched to the model's columns by name rather than by
    position, so each one lands in the column the model was trained on
    regardless of the column order.

    Parameters
    ----------
    Gender, Hemoglobin, MCH, MCHC, MCV
        Values for the five feature columns of the same names.
    model : optional
        Fitted estimator exposing ``predict``. Defaults to the notebook's
        global ``reg``.
    feature_names : optional
        Column labels in training order. Defaults to ``X.columns``.

    Returns
    -------
    The single label returned by ``model.predict`` for this row.

    Raises
    ------
    ValueError
        If ``feature_names`` holds a column other than the five above.
    """
    if model is None:
        model = reg
    if feature_names is None:
        feature_names = X.columns

    values = {
        'Gender': Gender,
        'Hemoglobin': Hemoglobin,
        'MCH': MCH,
        'MCHC': MCHC,
        'MCV': MCV,
    }
    # Fail loudly instead of silently feeding NaN for a column that has no
    # corresponding argument.
    unknown = [name for name in feature_names if name not in values]
    if unknown:
        raise ValueError(f'no value supplied for feature column(s): {unknown}')

    # A one-row frame with named columns, ordered as the model expects.
    row = pd.DataFrame([values], columns=list(feature_names))
    return model.predict(row)[0]

In [None]:
# Example prediction for one hand-entered set of values.
predict_anim(Gender=1, Hemoglobin=5, MCH=10, MCHC=48, MCV=21)

In [None]:
import pickle

# Serialise the fitted model to `anemia.pickle` in the working directory.
with open('anemia.pickle', mode='wb') as model_file:
    pickle.dump(reg, model_file)