# Categorization Models

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from sklearn import datasets, model_selection, metrics
from sklearn import linear_model, naive_bayes, tree
from sklearn.metrics import roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


# Importing the Data
df = pd.read_csv("Post_EDA_encoded_df.csv")
df.head()

Unnamed: 0,Sentiment_Score,Is_Response,Has_WebLink,Email_Length,accentChar_Count,accentChar_percentage,Hyphen_Count,Pound_Count,At_Count,Exclamation_Count,Question_Count,Period_Count,Exclamation_Percentage,Question_Percentage,Period_Percentage,Hyphen_Percentage,Pound_Percentage,At_Percentage,Capitalization_Percent,Safe_Email
0,0.9798,True,False,1030,0,0.0,1,0,0,2,0,7,0.194175,0.0,0.679612,0.097087,0.0,0.0,0.0,1
1,0.4329,False,False,479,0,0.0,0,0,1,0,2,4,0.0,0.417537,0.835073,0.0,0.0,0.208768,0.0,1
2,0.8591,True,False,1245,0,0.0,51,2,3,0,1,6,0.0,0.080321,0.481928,4.096386,0.160643,0.240964,0.0,1
3,0.964,False,True,688,0,0.0,14,0,0,1,1,36,0.145349,0.145349,5.232558,2.034884,0.0,0.0,9.375,0
4,0.0534,False,False,441,0,0.0,0,0,0,0,0,13,0.0,0.0,2.947846,0.0,0.0,0.0,0.0,0


Standardization (or scaling) of data is a crucial preprocessing step. It does not change the underlying relationships between the features, but it scales them for comparability. I was having issues with convergence until the data was scaled.

In [5]:
from sklearn.preprocessing import StandardScaler
# Scale the data
X = df.drop(['Safe_Email'], axis=1)
y = df['Safe_Email']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Splitting the Data, Training versus Testing
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_scaled, y, test_size=0.3)

X_train_pct = round((len(X_train) / (len(X_train) + len(X_test))), 2)
X_test_pct = round((len(X_test) / (len(X_train) + len(X_test))), 2)

print('Classification training dataset shape is:', X_train.shape, 'Testing dataset shape is:', X_test.shape)
print('Regression train/test split is:', X_train_pct, '/', X_test_pct)

Classification training dataset shape is: (12670, 19) Testing dataset shape is: (5431, 19)
Regression train/test split is: 0.7 / 0.3


## Logistic Regression

In [6]:
logistic_model = linear_model.LogisticRegression()

# Train the model
logistic_model.fit(X_train,y_train)
y_pred = logistic_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)

Accuracy: 0.7534524028723992
Precision: 0.7315886984715146
Recall: 0.9460916442048517
F1 Score: 0.8251273344651953


## Models From the Categorization Example on Blackboard

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import itertools

In [8]:
param_dict = {
    'knn' : ('n_neighbors', [2, 3, 4, 5, 6, 7, 8], 'weights', ['uniform', 'distance']),
    'svc' : ('C', [1, 2], 'kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
    'dt' : ('criterion', ['gini', 'entropy', 'log_loss'], 'max_depth', [2, 3, 4, 5, 6]),
    'rf' : ('n_estimators', [50, 100, 150, 200], 'max_depth', [2, 3, 4, 5, 6]),
    'mlp' : ('activation', ['identity', 'logistic', 'tanh', 'relu'], 'solver', ['lbfgs', 'sgd', 'adam']),
    }

model_dict = {
    'knn' : KNeighborsClassifier(),
    'svc' : SVC(),
    'dt' : DecisionTreeClassifier(),
    'rf' : RandomForestClassifier(),
    'mlp' : MLPClassifier(),
}

### Running the Models

In [9]:
summary = []
cm_df = pd.DataFrame()

for i in ['knn', 'svc', 'dt', 'rf', 'mlp']:
    param_grid = list(itertools.product(param_dict[i][1], param_dict[i][3]))
    for k in range(len(param_grid)):
        param_grid_dict = {param_dict[i][0] : param_grid[k][0], param_dict[i][2] : (param_grid[k][1])}
        model = model_dict[i]
        model.set_params(**param_grid_dict)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        summary.append([i, param_dict[i][0], param_grid[k][0], param_dict[i][2], param_grid[k][1],
            accuracy_score(y_test,y_pred), precision_score(y_test, y_pred, average='macro'), 
            recall_score(y_test, y_pred, average='macro'), f1_score(y_test, y_pred, average='macro'),
        ])
        dfpred = pd.DataFrame([y_pred]).transpose()
        dftest = pd.DataFrame([y_test]).transpose()
        cm_data_temp = pd.concat([dfpred, dftest], axis=1)
        cm_df = pd.concat([cm_df, cm_data_temp], axis=0)
summary_df = pd.DataFrame(summary, columns=('model', 'param1', 'parval1', 'param2', 'parval2', 'accuracy', 'precision', 'recall', 'f1'))
best_df = summary_df.nlargest(15,'f1')
cm_df.columns = ('ypred', 'ytest')
#cm_df
best_df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Unnamed: 0,model,param1,parval1,param2,parval2,accuracy,precision,recall,f1
66,mlp,activation,relu,solver,lbfgs,0.846621,0.840876,0.832506,0.836166
11,knn,n_neighbors,7,weights,distance,0.843123,0.837408,0.828322,0.832245
13,knn,n_neighbors,8,weights,distance,0.841834,0.836399,0.826381,0.830644
63,mlp,activation,tanh,solver,lbfgs,0.841466,0.835698,0.826439,0.830421
9,knn,n_neighbors,6,weights,distance,0.839993,0.833757,0.82542,0.829051
7,knn,n_neighbors,5,weights,distance,0.837599,0.830668,0.823741,0.826819
65,mlp,activation,tanh,solver,adam,0.838335,0.832804,0.822376,0.826775
60,mlp,activation,logistic,solver,lbfgs,0.836678,0.832542,0.818529,0.824143
5,knn,n_neighbors,4,weights,distance,0.832996,0.824765,0.8208,0.822646
3,knn,n_neighbors,3,weights,distance,0.832628,0.824219,0.820769,0.822388


### Confusion Matrix

In [None]:
cm_matrix = confusion_matrix(cm_df['ytest'], cm_df['ypred'])
print(cm_matrix)

In [None]:
fig, ax = plt.subplots(figsize=(8,4))

ax = sns.heatmap(cm_matrix, annot=True, cmap='Greens', fmt='.2f')

ax.set_title('Seaborn Confusion Matrix with labels\n\n');
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ');

## Ticket labels - List must be in alphabetical order
ax.xaxis.set_ticklabels(['0','1','2'])
ax.yaxis.set_ticklabels(['0','1','2'])

## Display the visualization of the Confusion Matrix.
plt.show()