Make necessary imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from kneed import KneeLocator
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import seaborn as sns

File reading

In [None]:
# Absolute path of the forest-cover CSV file.
FOREST_COVER_CSV = "/content/forest_cover.csv"

# Load the whole file into a DataFrame; the cells below all derive from raw_data.
raw_data = pd.read_csv(FOREST_COVER_CSV)

**Handling missing values**

In [None]:
# Count the missing entries in each column (isna is the alias of isnull).
raw_data.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
# Note: dropna() keeps the surviving rows' original index labels (no renumbering).
raw_data = raw_data.dropna(axis="index", how="any")

Plotting numerical data

In [None]:
# Names of the numerical columns, spelled exactly as they are selected from raw_data.
numeric_columns = [
    'elevation',
    'aspect',
    'slope',
    'horizontal_distance_to_hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
]

# Sub-frame holding only the numerical columns.
num_data = raw_data[numeric_columns]

In [None]:
# Draw one distribution panel (histogram + KDE curve) per numerical column.
# sns.distplot is deprecated in seaborn and slated for removal, so each panel
# uses sns.histplot with a density-scaled histogram and a KDE overlay, which is
# the documented replacement for distplot's default output.
fig = plt.figure(figsize=(15, 60), facecolor='white')

# Panels are laid out on a 16 x 4 grid; subplot positions are 1-based.
for plotnumber, column in enumerate(num_data, start=1):
    ax = fig.add_subplot(16, 4, plotnumber)
    sns.histplot(num_data[column], kde=True, stat="density", ax=ax)
    ax.set_xlabel(column, fontsize=10)

plt.show()


Data transformation

In [None]:
# Standardizer that centres each column on its mean and scales it to unit
# variance (both are StandardScaler's defaults, spelled out here).
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Learn the per-column statistics from num_data, then standardize that same data.
# By default the result is a plain array: column names and index are not carried over.
scaled_data = scaler.fit(num_data).transform(num_data)

In [None]:
# Labels of the numerical columns; reused below to name the scaled columns and
# to drop the numerical columns from raw_data when building cat_data.
col: pd.Index = num_data.columns

In [None]:
# Wrap the scaled values back into a labelled DataFrame.
# The scaler output carries no index, so without an explicit one the frame would
# get a fresh 0..n-1 RangeIndex. raw_data.dropna() keeps the original row labels,
# so a fresh index would not line up with cat_data, and the index-aligned
# pd.concat(..., axis=1) further down would pair rows wrongly and introduce NaNs.
# Reusing num_data's index keeps every scaled row attached to its source row.
scaled_data = pd.DataFrame(scaled_data, columns=col, index=num_data.index)

Plotting categorical data

In [None]:
# Everything that is not a numerical column: the remaining columns, including 'class'.
cat_data = raw_data.drop(columns=col)

In [None]:
# List the distinct values found in the 'class' column.
pd.unique(cat_data['class'])

In [None]:
# Integer code assigned to each class label.
class_codes = {
    'Lodgepole_Pine': 0,
    'Spruce_Fir': 1,
    'Douglas_fir': 2,
    'Krummholz': 3,
    'Ponderosa_Pine': 4,
    'Aspen': 5,
    'Cottonwood_Willow': 6,
}

# Encode from the untouched raw_data column rather than from cat_data['class'],
# so the cell is safe to re-run: mapping an already-encoded column would turn
# every value into NaN. cat_data shares raw_data's index, so rows stay aligned.
encoded_class = raw_data['class'].map(class_codes)

# Series.map yields NaN for any label missing from the dict; fail loudly here
# instead of handing NaN targets to the later cells.
unmapped_labels = raw_data.loc[encoded_class.isna(), 'class'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped class labels: {list(unmapped_labels)}")

cat_data['class'] = encoded_class.astype('int64')

In [None]:
# One violin panel per cat_data column, showing how the encoded class is
# distributed for each value of that column.
# The 'class' column itself is skipped: a violin of 'class' against 'class'
# carries no information.
feature_columns = [column for column in cat_data.columns if column != 'class']

fig = plt.figure(figsize=(15, 60), facecolor='white')

# Panels are laid out on a 16 x 4 grid; subplot positions are 1-based.
for plotnumber, column in enumerate(feature_columns, start=1):
    ax = fig.add_subplot(16, 4, plotnumber)
    sns.violinplot(data=cat_data, x=column, y='class', ax=ax)
    ax.set_xlabel(column, fontsize=10)

plt.show()


In [None]:
# Bar chart of how many rows fall into each encoded class.
# The series is passed as x= explicitly: recent seaborn versions treat the first
# positional argument of countplot as `data`, not as the variable to count.
plt.figure(figsize=(5, 5))
sns.countplot(x=cat_data['class'])

In [None]:
# Put the scaled numerical columns and the remaining columns side by side.
# pd.concat pairs rows by index label, not by position.
final_data = pd.concat(objs=[scaled_data, cat_data], axis="columns")

Separating label and features

In [None]:
# Feature matrix: every column of final_data except the 'class' label.
X = final_data.drop(columns='class')

In [None]:
# Label vector: the 'class' column of final_data.
y = final_data.loc[:, 'class']

Handling the imbalanced dataset

In [None]:
# SMOTE oversampler used to even out the class counts.
# SMOTE generates synthetic samples at random, so the seed is pinned to make the
# resampled dataset identical on every Restart & Run All.
sample = SMOTE(random_state=42)

In [None]:
# Run the oversampler and replace X and y with their resampled versions.
resampled = sample.fit_resample(X, y)
X, y = resampled

In [None]:
# Bar chart of the class counts in y after resampling.
# The labels are passed as x= explicitly: recent seaborn versions treat the first
# positional argument of countplot as `data`, not as the variable to count.
plt.figure(figsize=(5, 5))
sns.countplot(x=y)