In [None]:
#Loading Packages
import pandas as pd 
import numpy as np                     # For mathematical calculations 
import seaborn as sns                  # For data visualization 
import matplotlib.pyplot as plt        # For plotting graphs 
%matplotlib inline 
import warnings   # To ignore any warnings 
warnings.filterwarnings("ignore")



In [None]:
# Load the microbes dataset from the input directory and preview its first 20 rows
df = pd.read_csv('../input/microbes-dataset/microbes.csv')
df.head(20)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV - confirm).
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


# Exploratory Data Analysis (EDA)

In [None]:
# Duplicated rows: absolute count and share of the whole dataset
n_duplicates = df.duplicated().sum()

print(f'Duplicates in the dataset: {n_duplicates}')
print(f'Percentage of duplicates: {n_duplicates/len(df)*100}%')


In [None]:
# Remove duplicated rows, then repeat the duplicate report to confirm none remain
df = df.drop_duplicates()
remaining_duplicates = df.duplicated().sum()
print(f'Duplicates in the dataset: {remaining_duplicates}')
# Scale the ratio by 100 so the number matches the '%' sign (as in the report before deduplication)
print(f'Percentage of duplicates: {remaining_duplicates/len(df)*100}%')
print(f'Dataset shape: {df.shape}')


In [None]:
# Cardinality: number of distinct values in each column
df.nunique()



In [None]:
# Data type of each column
df.dtypes



In [None]:
# Target distribution: share of rows per class in 'microorganisms', drawn as a pie chart
plt.figure(figsize=(10, 10))
class_counts = df['microorganisms'].value_counts()
pie_ax = class_counts.plot.pie(autopct='%1.1f%%', textprops={'fontsize': 12})
pie_ax.set_title("Target distribution")


In [None]:
# Absolute number of rows per class
df['microorganisms'].value_counts()


In [None]:
# Turn the target into a binary label: every class other than 'Ulothrix' becomes 'Not Ulothrix'
is_not_ulothrix = df['microorganisms'].ne('Ulothrix')
df.loc[is_not_ulothrix, 'microorganisms'] = 'Not Ulothrix'
# Next step: undersampling to balance the two classes

In [None]:
# Function for balancing
def sampling_k_elements(microCount, k=1215, random_state=1):
    """Return at most `k` rows of one class group.

    Parameters
    ----------
    microCount : pandas.DataFrame
        Rows of a single class (in this notebook: one group of the
        'microorganisms' groupby).
    k : int, default 1215
        Maximum number of rows to keep for the group.
    random_state : int or None, default 1
        Seed forwarded to ``DataFrame.sample`` so the undersampling gives the
        same rows on every run. Pass ``None`` to draw a different sample on
        each call.

    Returns
    -------
    pandas.DataFrame
        `microCount` itself when it has fewer than `k` rows, otherwise a
        random sample of exactly `k` rows (without replacement).
    """
    # Groups that are already smaller than the target size are kept untouched
    if len(microCount) < k:
        return microCount
    return microCount.sample(n=k, random_state=random_state)

# Undersample every class separately and stitch the per-class samples back together.
# Iterating over the groups (instead of groupby.apply) hands each full sub-frame,
# including the 'microorganisms' column, to the helper; newer pandas versions
# deprecate passing the grouping column through groupby.apply.
df = pd.concat(
    [sampling_k_elements(group) for _, group in df.groupby('microorganisms')],
    ignore_index=True,
)
df['microorganisms'].value_counts()


In [None]:
# Summary of the dataset after duplicate removal and class undersampling
print('DATASET INFORMATION AFTER REMOVING DUPLICATES AND UNDERSAMPLING')
print(f'Dataset shape: {df.shape}')
print()
print('Dataset head:')
df.head(10)


In [None]:
# Last 10 rows of the dataset after undersampling
df.tail(10)


# Data Preprocessing

In [None]:
# Label encoder used to turn the target column into integer codes
from sklearn.preprocessing import LabelEncoder

# NOTE(review): warnings are already imported and silenced in the first cell; this repeats that setup
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Replace the class names in 'microorganisms' with integer codes.
# The fitted encoder stays available so the codes can be mapped back to names.
encoder = LabelEncoder().fit(df['microorganisms'])
df['microorganisms'] = encoder.transform(df['microorganisms'])


# Data Visualization


In [None]:
# Per-feature distribution plot, split by class.

def corr_map(feature, size=((10, 7.0))):
    """Plot the distribution of one feature, coloured by class.

    Draws a histogram with a KDE overlay of column `feature` of the global
    `df`, using 'microorganisms' as the hue and a fixed bin width of 1.

    Parameters
    ----------
    feature : str
        Name of the column of `df` to plot.
    size : tuple, default (10, 7.0)
        Figure size passed to ``plt.figure(figsize=...)``.

    Notes
    -----
    Calls ``sns.set_context('poster', ...)``, which changes the seaborn
    plotting context for subsequent figures as well.
    """
    # New figure with the requested size, using the large 'poster' context
    plt.figure(figsize=size)
    sns.set_context('poster', font_scale=1)

    # Histogram + KDE, one colour per class
    sns.histplot(data=df, x=feature, hue='microorganisms', binwidth=1, kde=True)

    # Labels
    plt.xlabel(f'{feature} Value')
    plt.title(f'{feature} distribution')


In [None]:
# Solidity
corr_map('Solidity')


In [None]:
# raddi
corr_map('raddi')


In [None]:
# Eccentricity distribution on a wider 15x9 figure
corr_map('Eccentricity', size=(15, 9))


In [None]:
# EquivDiameter distribution on a square 7x7 figure
corr_map('EquivDiameter', size=(7, 7))


In [None]:
# Heatmap of the pairwise correlations between all columns (target included).
# The correlation matrix is computed once and reused for the plot.
corr_matrix = df.corr()
plt.figure(figsize=(29, 15))
sns.set_context('poster', font_scale=0.5)
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation matrix')
plt.show()


# Baseline Models


In [None]:
# Note: Other features will be correlated soon
# Section 2: Baseline Models
# A simple model gives a baseline accuracy without removing any features.

# Importing libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Work on an independent copy so baseline experiments cannot modify `df`
# (a plain assignment would only create a second name for the same frame)
baseline_data = df.copy()

# Separating into training and testing set (33% held out, seeded split)
target = 'microorganisms'
y = baseline_data[target]
X = baseline_data.drop(columns=[target])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)


In [None]:
# Baseline: entropy-based decision tree fitted on the training split (seeded)
baseline_model = DecisionTreeClassifier(criterion='entropy', random_state=1).fit(X_train, y_train)

# Mean accuracy on the held-out test split
baseline_model.score(X_test, y_test)


In [None]:
# redistribute the unbalanced data with over sampling
from scipy import stats
from imblearn.over_sampling import SMOTE

from sklearn.metrics import confusion_matrix

# display progress of loops
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score


In [None]:
# Summary statistics for the columns from position 13 onwards
df.iloc[:,13:].describe()


In [None]:
# Standardize the features (every column except the last one).
# NOTE: assumes the target 'microorganisms' is the last column of df.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.iloc[:, :-1])

# Re-attach the untouched target as the last column and restore the column names
temp_y = df['microorganisms'].to_numpy()
sc_df = pd.DataFrame(np.column_stack((scaled_features, temp_y)), columns=df.columns)
sc_df.head()



In [None]:
# Keep only rows whose every feature lies within 3 standard deviations of its column mean.
# NOTE: the mask is built from df, so it only lines up with sc_df while both have the
# same number of rows - run this cell once, right after the scaling cell.
feature_z = np.abs(stats.zscore(df.iloc[:, :-1]))
within_three_sigma = (feature_z < 3).all(axis=1)
sc_df = sc_df[within_three_sigma]
sc_df.info()


In [None]:
# Features / target as NumPy arrays, taken from the scaled, outlier-filtered frame
X = sc_df.drop(['microorganisms'], axis=1).to_numpy()
y = sc_df['microorganisms'].to_numpy()

# Hold out the test set BEFORE oversampling, so that no synthetic sample
# derived from a test row can end up in the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

# Oversample the minority class with SMOTE on the training portion only.
# The models in the following cells are fitted on X_train / y_train; pass
# X_train_transformed / y_train_transformed instead to train on the rebalanced set.
sm = SMOTE(random_state=42)
X_train_transformed, y_train_transformed = sm.fit_resample(X_train, y_train)


In [None]:
# Pretty-print an accuracy score
def score_format(model):
    """Print a score as a percentage rounded to two decimals.

    Despite its name, `model` is the numeric score itself (the callers in
    this notebook pass the result of ``estimator.score(...)``), expressed
    as a fraction; it is multiplied by 100 before printing.
    """
    percentage = round(model * 100, 2)
    print('Accuracy: {} %'.format(percentage))

In [None]:
# K-Nearest Neighbors with default settings: fit on the training split, report test accuracy
knn = KNeighborsClassifier().fit(X_train, y_train)
print('K-Nearest Neighbors')
knn_accuracy = knn.score(X_test, y_test)
score_format(knn_accuracy)


In [None]:
# Random Forest with default hyperparameters.
# A fixed random_state makes the fitted forest (and the reported accuracy)
# identical on every run of the notebook.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print('Random Forest')
score_format(rf.score(X_test, y_test))


In [None]:
# Gradient Boosting with default hyperparameters.
# A fixed random_state makes the fitted ensemble (and the reported accuracy)
# identical on every run of the notebook.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
print('*Gradient Boosting')
score_format(gb.score(X_test, y_test))


In [None]:
# Multi-layer Perceptron, allowing up to 1000 training iterations.
# A fixed random_state pins the weight initialisation so the score is repeatable.
mlp = MLPClassifier(max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)
# Print the model name first, like the other model cells do
print('Multi-layer Perceptron')
score_format(mlp.score(X_test, y_test))


In [None]:
# Fitted estimators, in the order KNN, Random Forest, Gradient Boosting, MLP.
# The hyperparameter-tuning cell pairs this list with `param_grid` by position, so keep the order in sync.
models = [knn, rf, gb, mlp]


# Cross Validation


In [None]:
# 10-fold cross-validated accuracy of every model on the full (X, y) data
for model in models:
    scores = cross_val_score(estimator=model, X=X, y=y, cv=10, n_jobs=-1)
    # Prefix each line with the estimator class; otherwise the results cannot be told apart
    print(f'{type(model).__name__} CV accuracy: {np.mean(scores):.3f} +/- {np.std(scores):.3f}')


# Hyperparameter Tuning


In [None]:
# Hyperparameter Tuning
# One search space per estimator, listed in the same order as `models` (knn, rf, gb, mlp)
ints_ = [5, 7, 9, None]
param_grid = [
    # K-Nearest Neighbors
    {'n_neighbors': ints_[:-1], 'weights': ['uniform', 'distance']},
    # Random Forest (the trailing None in ints_ is also tried as max_depth)
    {'criterion': ['gini', 'entropy'], 'max_depth': ints_},
    # Gradient Boosting
    {'max_depth': ints_[:-1]},
    # Multi-layer Perceptron
    {'hidden_layer_sizes': [(100,), (250,)],
     'activation': ['logistic', 'tanh', 'relu'],
     'max_iter': [10000]},
]

# Exhaustive 10-fold grid search for every estimator, with a progress bar over the estimators
tuned_models = []
for estimator, grid in tqdm(zip(models, param_grid), total=len(models)):
    gs = GridSearchCV(estimator, grid, n_jobs=-1, refit=True, cv=10, return_train_score=True)
    # The value returned by fit() is what gets stored and later inspected
    best_model = gs.fit(X, y)
    tuned_models.append(best_model)


In [None]:
# `best_model` holds the value from the last loop iteration, i.e. the search run for the last entry of `models`
best_model

In [None]:
# Report the winning estimator and its mean cross-validated score for every search
for search in tuned_models:
    print(f'Best Params: {search.best_estimator_}')
    print(f'Best Score: {search.best_score_:.3f}')
    # Two blank lines between models
    print('\n')
