In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from mlxtend.plotting import plot_decision_regions
from scipy.stats import uniform, randint
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer, f1_score
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

Reading the data

In [None]:
# Load the dataset from CSV; the first CSV column is used as the row index.
DATA_FILE = "Group_4_data_cleaned.csv"
df = pd.read_csv(DATA_FILE, index_col=0)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Structural overview: column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would print a stray "None").
df.info()

# Summary statistics; left as the last expression of the cell so the notebook
# renders the result as a rich table instead of plain text.
df.describe()

In [None]:
# Human-readable names for the numeric class codes stored in column "y".
output_dict = {
    1: 'text',
    2: 'horizontal line',
    3: 'picture',
    4: 'vertical line',
    5: 'graphic',
}

# Count the rows per class and show the counts indexed by class name.
class_counts = df["y"].value_counts()
class_counts.rename(index=output_dict)

Here we can observe that the 'text' class is far more frequent than the other classes that need to be classified. If we predict every sample as 'text', we already get an accuracy of 89.7% on the above dataset.

In [None]:
# Bar chart of the number of rows per class.
# The column is passed through the `x=` keyword: recent seaborn releases treat
# the first positional argument as `data`, so `sns.countplot(df['y'])` no
# longer plots the class counts as intended.
class_codes = list(output_dict)
ax = sns.countplot(x="y", data=df, order=class_codes)

# Pin the tick positions before labelling so that each name is guaranteed to
# sit under the bar of its own class code (bars follow `order`).
ax.set_xticks(range(len(class_codes)))
ax.set_xticklabels([output_dict[code] for code in class_codes])
ax.set(xlabel="Class", ylabel="Number of rows", title="Class distribution")
plt.show()

## Model Building

In [None]:
# Split the frame into features and target.
# The features are obtained by dropping the target column by name rather than
# by position (`iloc[:, :-1]`), so the target can never leak into X when "y"
# is not the last column of the CSV.
X = df.drop(columns="y")
y = df["y"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): the classes are heavily imbalanced; consider `stratify=y` so
# both splits keep the same class proportions (this would change the split
# and therefore the row counts quoted in the surrounding text).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Dividing the dataset into train and test sets with an 80/20 percent ratio.

In [None]:
# Min-max scaling: learn the per-feature range from the training split only,
# then apply that same transformation to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

We scale the input features using MinMaxScaler. The scaler is fitted on the training split only and then applied to the test split.

In [None]:
# Class frequencies in the training split.
train_class_counts = y_train.value_counts()
train_class_counts

Here we can observe that there is a class imbalance, which can severely hurt our model's performance. To deal with this problem we can either oversample or undersample. If we undersample, our training data will be reduced to 90 rows, which is far too few for model building. Hence we choose to oversample the data.

In [None]:
# Random oversampling (seeded) of the scaled training split only; the test
# split is left untouched.
oversampler = RandomOverSampler(random_state=42)
resampled = oversampler.fit_resample(X_train_scaled, y_train)
X_train_oversampled, y_train_oversampled = resampled

In [None]:
# Class frequencies after oversampling.
oversampled_class_counts = y_train_oversampled.value_counts()
oversampled_class_counts

Now we can observe that all classes have the same number of rows. This is done using RandomOverSampler from the imblearn library.

## We have chosen Support Vector Machines for our model

In [None]:
# Base estimator whose hyperparameters are tuned by the search that follows.
svm = SVC()

# Candidate values for the RBF-kernel SVM: C and gamma, each spanning several
# orders of magnitude.
params = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

In [None]:
# Randomised hyperparameter search over `params` with 5-fold cross-validation.
# `random_state` is fixed (same seed as the split and the oversampler) so the
# sampled candidates -- and therefore the selected model -- are reproducible
# across Restart & Run All; without it each run may pick different settings.
# With the default n_iter=10, only 10 of the 25 combinations are evaluated.
# NOTE(review): the folds are drawn from already-oversampled data, so copies
# of the same minority row can end up in both the training and validation
# part of a fold; the CV score is therefore likely optimistic.
random_search = RandomizedSearchCV(svm,
                                   params,
                                   cv=5,
                                   n_jobs=-1,
                                   refit=True,  # refit the best candidate on the full training data
                                   random_state=42,
                                   verbose=3)
random_search.fit(X_train_oversampled, y_train_oversampled)

In [None]:
# Report the winning hyperparameter combination and its cross-validated score.
print(
    f"The best hyperparameters: {random_search.best_params_}",
    f"The best score: {random_search.best_score_}",
    sep="\n",
)

Performing RandomizedSearchCV on the (oversampled) training data, which gives the best values of C, gamma and the kernel among the sampled combinations.

In [None]:
# Show the estimator that the search refitted with the best hyperparameters.
best_estimator = random_search.best_estimator_
print(best_estimator)

In [None]:
# Build a fresh SVC from the hyperparameters selected by the search and train
# it on the oversampled training data.
best = random_search.best_params_
SVM = SVC(C=best['C'], gamma=best['gamma'], kernel=best['kernel'])
SVM.fit(X_train_oversampled, y_train_oversampled)

In [None]:
# Predict class labels for the scaled hold-out set.
y_pred = SVM.predict(X=X_test_scaled)

In [None]:
# Fraction of hold-out rows whose predicted class equals the true class.
accuracy_score(y_true=y_test, y_pred=y_pred)

## Model Evaluation

In [None]:
# 10-fold cross-validated accuracy of the tuned SVM on the oversampled
# training data.
# The shuffled KFold is seeded (same seed as the rest of the notebook): with
# shuffle=True and no random_state the fold assignment -- and every score
# below -- would change on each run.
# NOTE(review): the folds are drawn from already-oversampled data, so the
# scores are likely optimistic compared with the hold-out accuracy.
cv_scores = cross_val_score(
    SVM,
    X_train_oversampled,
    y_train_oversampled,
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    scoring='accuracy',
)

cv_scores

In [None]:
# Per-fold accuracy plot plus summary statistics.
# The fold numbers are derived from cv_scores instead of a hard-coded
# range(1, 11), so the plot stays valid if the number of CV splits changes.
fold_numbers = range(1, len(cv_scores) + 1)

plt.plot(fold_numbers, cv_scores, marker='o')
plt.ylim(0, 1)  # full accuracy range, so fold-to-fold variation is shown in context
plt.ylabel('Accuracy score', fontsize=15)
plt.xlabel('Fold', fontsize=15)
plt.xticks(fold_numbers)

print("Average Accuracy",cv_scores.mean())
print("Standard deviation of Accuracy",cv_scores.std())

# Render the figure explicitly rather than relying on the implicit end-of-cell flush.
plt.show()

## Confusion Matrix

In [None]:
# Confusion matrix of the hold-out predictions
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
plt.figure(figsize=(8, 6))

# fmt="d" prints the cell counts as plain integers; the default annotation
# format ('.2g') renders counts of 100 or more in scientific notation
# (e.g. 9.8e+02), which is unreadable for a confusion matrix.
sns.heatmap(cm,
            annot=True,
            fmt="d",
            cmap="Spectral")
plt.xlabel('Predicted', fontsize=15)
plt.ylabel('Actual/Observed', fontsize=15);

## Classification report

In [None]:
# Text summary of the per-class metrics on the hold-out set.
report = classification_report(y_test, y_pred)
print(report)